From 04ba6da1abb8d20a5ab6a8fbf04b91ad60116c91 Mon Sep 17 00:00:00 2001 From: Lawrence Lee Date: Fri, 5 Aug 2022 23:30:04 -0700 Subject: [PATCH] [202012][arp_update]: Resolve failed neighbors on dualtor (#11641) In arp_update, check for FAILED or INCOMPLETE kernel neighbor entries and manually ping them to try and resolve the neighbor Signed-off-by: Lawrence Lee --- files/scripts/arp_update | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/files/scripts/arp_update b/files/scripts/arp_update index e7f083e20d..f8704c582b 100755 --- a/files/scripts/arp_update +++ b/files/scripts/arp_update @@ -26,6 +26,7 @@ while /bin/true; do done VLAN=$(echo $ARP_UPDATE_VARS | jq -r '.vlan') + SUBTYPE=$(sonic-db-cli CONFIG_DB hget 'DEVICE_METADATA|localhost' 'subtype' | tr '[:upper:]' '[:lower:]') for vlan in $VLAN; do # generate a list of arping commands: # arping -q -w 0 -c 1 -i ; @@ -47,7 +48,26 @@ while /bin/true; do ndisc6cmd="sed -e 's/^/ndisc6 -q -w 0 -1 /' -e 's/$/;/'" ip6cmd="ip -6 neigh show | grep -v fe80 | grep $vlan | cut -d ' ' -f 1,3 | $ndisc6cmd" eval `eval $ip6cmd` + + if [[ $SUBTYPE == "dualtor" ]]; then + # on dual ToR devices, try to resolve failed neighbor entries since + # these entries will have tunnel routes installed, preventing normal + # neighbor resolution (SWSS PR #2137) + + # since ndisc6 is a userland process, the above ndisc6 commands are + # insufficient to update the kernel neighbor table for failed entries + + # we don't need to do this for ipv4 neighbors since arping is able to + # update the kernel neighbor table + + # generates the following command for each failed or incomplete IPv6 neighbor + # timeout 0.2 ping <neighbor IPv6> -n -q -i 0 -c 1 -W 1 -I <VLAN name> >/dev/null + ping6_template="sed -e 's/^/timeout 0.2 ping /' -e 's/,/ -n -q -i 0 -c 1 -W 1 -I /' -e 's/$/ >\/dev\/null;/'" + failed_ip6_neigh_cmd="ip -6 neigh show | grep -v fe80 | grep $vlan | grep -E 'FAILED|INCOMPLETE' | cut -d ' ' -f 1,3 --output-delimiter=',' | $ping6_template" + eval 
`eval $failed_ip6_neigh_cmd` + fi done + # sleep here before handling the mismatch as it is not required during startup sleep 300