Issue
Multicast communication dies after a few minutes. OnApp Integrated Storage SAN nodes disappear:
Hardware: Juniper switches
Switch 0 = EX4550
Switch 1 = EX4200
# onappstore nodes (Hypervisor can only see itself)
Node: 1315125913
status: ACTIVE
IP addr: 10.200.2.1
Role: BACKEND node
Node: 2960017830
status: ACTIVE
IP addr: 10.200.2.254
Role: FRONTEND node
Troubleshooting
Multicast debugging using netstat and omping. First, check the multicast group memberships on the hypervisor:
[root@192.168.10.10 ~]# netstat -g
IPv6/IPv4 Group Memberships
Interface RefCnt Group
--------------- ------ ---------------------
lo 1 224.0.0.1
MGT 1 224.0.0.251
MGT 1 224.0.0.1
onappstore 1 224.3.28.2
onappstore 1 224.0.0.251
onappstore 1 224.0.0.1
1. Use omping to verify that multicast traffic is being allowed between the nodes:
[root@192.168.10.10 ~]# ./omping 10.200.2.254 10.200.3.254 10.200.4.254 10.200.5.254 10.200.1.254
10.200.2.254 : joined (S,G) = (*, 232.43.211.234), pinging
10.200.2.254 : unicast, seq=1, size=69 bytes, dist=0, time=0.063ms
10.200.2.254 : multicast, seq=1, size=69 bytes, dist=0, time=0.083ms
10.200.2.254 : unicast, seq=2, size=69 bytes, dist=0, time=0.076ms
10.200.2.254 : multicast, seq=2, size=69 bytes, dist=0, time=0.086ms
10.200.2.254 : multicast, seq=3, size=69 bytes, dist=0, time=0.091ms
10.200.3.254 : joined (S,G) = (*, 232.43.211.234), pinging
[...]
10.200.2.254 : unicast, seq=5, size=69 bytes, dist=0, time=0.099ms
10.200.2.254 : multicast, seq=5, size=69 bytes, dist=0, time=0.107ms
10.200.3.254 : unicast, seq=3, size=69 bytes, dist=0, time=0.076ms
10.200.3.254 : multicast, seq=3, size=69 bytes, dist=0, time=0.106ms
10.200.4.254 : joined (S,G) = (*, 232.43.211.234), pinging
10.200.5.254 : joined (S,G) = (*, 232.43.211.234), pinging
[root@192.168.10.11 ~]# ./omping 10.200.1.254 10.200.3.254 10.200.4.254 10.200.5.254 10.200.2.254
10.200.1.254 : joined (S,G) = (*, 232.43.211.234), pinging
10.200.1.254 : unicast, seq=1, size=69 bytes, dist=0, time=0.085ms
10.200.1.254 : unicast, seq=2, size=69 bytes, dist=0, time=0.095ms
10.200.1.254 : multicast, seq=2, size=69 bytes, dist=0, time=0.103ms
10.200.1.254 : unicast, seq=3, size=69 bytes, dist=0, time=0.098ms
10.200.1.254 : multicast, seq=3, size=69 bytes, dist=0, time=0.103ms
10.200.3.254 : joined (S,G) = (*, 232.43.211.234), pinging
10.200.3.254 : unicast, seq=1, size=69 bytes, dist=0, time=0.058ms
10.200.3.254 : multicast, seq=1, size=69 bytes, dist=0, time=0.077ms
[...]
10.200.1.254 : multicast, seq=4, size=69 bytes, dist=0, time=0.102ms
10.200.3.254 : unicast, seq=3, size=69 bytes, dist=0, time=0.074ms
10.200.3.254 : multicast, seq=3, size=69 bytes, dist=0, time=0.079ms
10.200.1.254 : unicast, seq=5, size=69 bytes, dist=0, time=0.095ms
10.200.1.254 : multicast, seq=5, size=69 bytes, dist=0, time=0.097ms
10.200.4.254 : joined (S,G) = (*, 232.43.211.234), pinging
[...]
10.200.5.254 : waiting for response msg
10.200.5.254 : joined (S,G) = (*, 232.43.211.234), pinging
10.200.5.254 : unicast, seq=1, size=69 bytes, dist=0, time=0.094ms
10.200.5.254 : multicast, seq=1, size=69 bytes, dist=0, time=0.097ms
[...]
[root@192.168.10.12 ~]# ./omping 10.200.1.254 10.200.2.254 10.200.4.254 10.200.5.254 10.200.3.254
10.200.2.254 : joined (S,G) = (*, 232.43.211.234), pinging
10.200.1.254 : joined (S,G) = (*, 232.43.211.234), pinging
10.200.4.254 : waiting for response msg
10.200.5.254 : waiting for response msg
10.200.1.254 : unicast, seq=3, size=69 bytes, dist=0, time=0.090ms
10.200.1.254 : multicast, seq=3, size=69 bytes, dist=0, time=0.099ms
10.200.2.254 : unicast, seq=3, size=69 bytes, dist=0, time=0.082ms
10.200.2.254 : multicast, seq=3, size=69 bytes, dist=0, time=0.091ms
10.200.4.254 : joined (S,G) = (*, 232.43.211.234), pinging
10.200.5.254 : joined (S,G) = (*, 232.43.211.234), pinging
10.200.2.254 : multicast, seq=5, size=69 bytes, dist=0, time=0.083ms
[root@192.168.10.13 ~]# ./omping 10.200.1.254 10.200.2.254 10.200.3.254 10.200.5.254 10.200.4.254
10.200.1.254 : waiting for response msg
10.200.2.254 : waiting for response msg
10.200.3.254 : waiting for response msg
10.200.5.254 : waiting for response msg
10.200.3.254 : joined (S,G) = (*, 232.43.211.234), pinging
10.200.1.254 : joined (S,G) = (*, 232.43.211.234), pinging
10.200.2.254 : joined (S,G) = (*, 232.43.211.234), pinging
10.200.3.254 : unicast, seq=1, size=69 bytes, dist=0, time=0.066ms
10.200.2.254 : unicast, seq=1, size=69 bytes, dist=0, time=0.079ms
10.200.1.254 : unicast, seq=1, size=69 bytes, dist=0, time=0.101ms
10.200.2.254 : unicast, seq=2, size=69 bytes, dist=0, time=0.092ms
10.200.3.254 : multicast, seq=2, size=69 bytes, dist=0, time=0.090ms
10.200.1.254 : unicast, seq=2, size=69 bytes, dist=0, time=0.129ms
10.200.1.254 : multicast, seq=2, size=69 bytes, dist=0, time=0.131ms
10.200.3.254 : unicast, seq=2, size=69 bytes, dist=0, time=0.114ms
10.200.2.254 : multicast, seq=2, size=69 bytes, dist=0, time=0.128ms
10.200.5.254 : joined (S,G) = (*, 232.43.211.234), pinging
[root@192.168.10.14 ~]# ./omping 10.200.1.254 10.200.2.254 10.200.3.254 10.200.4.254 10.200.5.254
10.200.1.254 : waiting for response msg
10.200.2.254 : waiting for response msg
10.200.3.254 : waiting for response msg
10.200.4.254 : waiting for response msg
10.200.1.254 : joined (S,G) = (*, 232.43.211.234), pinging
10.200.2.254 : joined (S,G) = (*, 232.43.211.234), pinging
10.200.3.254 : joined (S,G) = (*, 232.43.211.234), pinging
10.200.4.254 : joined (S,G) = (*, 232.43.211.234), pinging
2. After about 3 minutes, multicast traffic stops; only unicast replies are still received:
10.200.5.254 : unicast, seq=340, size=69 bytes, dist=0, time=0.101ms
10.200.1.254 : unicast, seq=358, size=69 bytes, dist=0, time=0.098ms
10.200.2.254 : unicast, seq=358, size=69 bytes, dist=0, time=0.092ms
10.200.4.254 : unicast, seq=350, size=69 bytes, dist=0, time=0.093ms
10.200.5.254 : unicast, seq=341, size=69 bytes, dist=0, time=0.109ms
10.200.1.254 : unicast, seq=359, size=69 bytes, dist=0, time=0.121ms
10.200.2.254 : unicast, seq=359, size=69 bytes, dist=0, time=0.152ms
10.200.4.254 : unicast, seq=351, size=69 bytes, dist=0, time=0.127ms
10.200.5.254 : unicast, seq=342, size=69 bytes, dist=0, time=0.120ms
Resolution
After looking at the configuration on the switch:
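IGMP snooping was enabled for all VLANs on the switch, but IGMP was not enabled on the interfaces. With snooping active and IGMP disabled, there was presumably no IGMP querier to refresh the group memberships, so they timed out after a few minutes and the switch stopped forwarding the multicast traffic.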
Disable IGMP snooping and enable IGMP on the interfaces. This is what was added to the switch configuration:
igmp {
interface all;
}
Once multicast communication is allowed, the OnApp Integrated Storage SAN nodes are back online as "ACTIVE".
# onappstore nodes
Node: 2558877569
status: ACTIVE
IP addr: 10.200.3.1
Role: BACKEND node
Node: 1716643846
status: ACTIVE
IP addr: 10.200.1.254
Role: FRONTEND node
Node: 3644143738
status: ACTIVE
IP addr: 10.200.2.1
Role: BACKEND node
Node: 2936708140
status: ACTIVE
IP addr: 10.200.3.1
Role: BACKEND node
Node: 65303284
status: ACTIVE
IP addr: 10.200.3.254
Role: FRONTEND node
Node: 1315125913
status: ACTIVE
IP addr: 10.200.2.1
Role: BACKEND node
Node: 403195299
status: ACTIVE
IP addr: 10.200.4.254
Role: FRONTEND node
Node: 3690897618
status: ACTIVE
IP addr: 10.200.5.254
Role: FRONTEND node
Node: 2960017830
status: ACTIVE
IP addr: 10.200.2.254
Role: FRONTEND node