a8f8ea3b50
arp_update should resolve the missing arp/ndp static route entries. Added code to check for missing entries and try ping to resolve the missing entry. Why I did it Fixes #14179 chassis-packet: missing arp entries for static routes causing high orchagent cpu usage It is observed that some sonic-mgmt test case calls sonic-clear arp, which clears the static arp entries as well. Orchagent or arp_update process does not try to resolve the missing arp entries after clear. How I did it arp_update should resolve the missing arp/ndp static route entries. Added code to check for missing entries and try ping if any found to resolve it. How to verify it After boot or config reload, check ipv4 and ipv4 neigh entries to make sure all static route entries are present manual validation: Use sonic-clear arp and sonic-clear ndp to clear all neighbor entries run arp_update Check for neigh entries. All entries should be present. Signed-off-by: anamehra <anamehra@cisco.com>
146 lines
7.0 KiB
Bash
Executable File
146 lines
7.0 KiB
Bash
Executable File
#!/bin/bash
|
|
#
|
|
# usage:
|
|
# arp_update:
|
|
# Send ipv6 multicast pings to all "UP" L3 interfaces including vlan interfaces to
|
|
# refresh link-local addresses from neighbors.
|
|
# Send gratuitous ARP/NDP requests to VLAN member neighbors to refresh
|
|
# the ipv4/ipv6 neighbors state.
|
|
|
|
ARP_UPDATE_VARS_FILE="/usr/share/sonic/templates/arp_update_vars.j2"
|
|
|
|
while /bin/true; do
|
|
# find L3 interfaces which are UP, send ipv6 multicast pings
|
|
ARP_UPDATE_VARS=$(sonic-cfggen -d -t ${ARP_UPDATE_VARS_FILE})
|
|
SWITCH_TYPE=$(echo $ARP_UPDATE_VARS | jq -r '.switch_type')
|
|
if [[ "$SWITCH_TYPE" == "chassis-packet" ]]; then
|
|
# Get array of Nexthops and ifnames. Nexthops and ifnames are mapped one to one
|
|
STATIC_ROUTE_NEXTHOPS=($(echo $ARP_UPDATE_VARS | jq -r '.static_route_nexthops'))
|
|
STATIC_ROUTE_IFNAMES=($(echo $ARP_UPDATE_VARS | jq -r '.static_route_ifnames'))
|
|
# on supervisor/rp exit the script gracefully
|
|
if [[ -z "$STATIC_ROUTE_NEXTHOPS" ]] || [[ -z "$STATIC_ROUTE_IFNAMES" ]]; then
|
|
logger "arp_update: exiting as no static route in packet based chassis"
|
|
exit 0
|
|
fi
|
|
for i in ${!STATIC_ROUTE_NEXTHOPS[@]}; do
|
|
nexthop="${STATIC_ROUTE_NEXTHOPS[i]}"
|
|
if [[ $nexthop == *"."* ]]; then
|
|
neigh_state=( $(ip -4 neigh show | grep -w $nexthop | tr -s ' ' | cut -d ' ' -f 3,4) )
|
|
ping_prefix=ping
|
|
elif [[ $nexthop == *":"* ]] ; then
|
|
neigh_state=( $(ip -6 neigh show | grep -w $nexthop | tr -s ' ' | cut -d ' ' -f 3,4) )
|
|
ping_prefix=ping6
|
|
fi
|
|
if [[ -z "${neigh_state}" ]] || [[ "${neigh_state[1]}" == "INCOMPLETE" ]] || [[ "${neigh_state[1]}" == "FAILED" ]]; then
|
|
interface="${STATIC_ROUTE_IFNAMES[i]}"
|
|
if [[ -z "$interface" ]]; then
|
|
# should never be here, handling just in case
|
|
logger "ERR: arp_update: missing interface entry for static route $nexthop"
|
|
interface=${neigh_state[0]}
|
|
fi
|
|
intf_up=$(ip link show $interface | grep "state UP")
|
|
if [[ -n "$intf_up" ]]; then
|
|
pingcmd="timeout 0.2 $ping_prefix -I ${interface} -n -q -i 0 -c 1 -W 1 $nexthop >/dev/null"
|
|
eval $pingcmd
|
|
logger "arp_update: static route nexthop not resolved, pinging $nexthop on ${neigh_state[0]}"
|
|
fi
|
|
fi
|
|
done
|
|
|
|
sleep 300
|
|
continue
|
|
fi
|
|
# find L3 interfaces which are UP, send ipv6 multicast pings
|
|
INTERFACE=$(echo $ARP_UPDATE_VARS | jq -r '.interface')
|
|
PC_INTERFACE=$(echo $ARP_UPDATE_VARS | jq -r '.pc_interface')
|
|
VLAN_SUB_INTERFACE=$(echo $ARP_UPDATE_VARS | jq -r '.vlan_sub_interface')
|
|
|
|
ALL_INTERFACE="$INTERFACE $PC_INTERFACE $VLAN_SUB_INTERFACE"
|
|
for intf in $ALL_INTERFACE; do
|
|
ping6cmd="ping6 -I $intf -n -q -i 0 -c 1 -W 0 ff02::1 >/dev/null"
|
|
intf_up=$(ip link show $intf | grep "state UP")
|
|
if [[ -n "$intf_up" ]]; then
|
|
eval $ping6cmd
|
|
fi
|
|
done
|
|
|
|
VLAN=$(echo $ARP_UPDATE_VARS | jq -r '.vlan')
|
|
SUBTYPE=$(sonic-db-cli CONFIG_DB hget 'DEVICE_METADATA|localhost' 'subtype' | tr '[:upper:]' '[:lower:]')
|
|
for vlan in $VLAN; do
|
|
# generate a list of arping commands:
|
|
# arping -q -w 0 -c 1 -i <VLAN interface> <IP 1>;
|
|
# arping -q -w 0 -c 1 -i <VLAN interface> <IP 2>;
|
|
# ...
|
|
arpingcmd="sed -e 's/ / -i /' -e 's/^/arping -q -w 0 -c 1 /' -e 's/$/;/'"
|
|
ipcmd="ip -4 neigh show | grep $vlan | cut -d ' ' -f 1,3 | $arpingcmd"
|
|
|
|
eval `eval $ipcmd`
|
|
|
|
# send ipv6 multicast pings to Vlan interfaces to get/refresh link-local addrs
|
|
ping6cmd="timeout 1 ping6 -I $vlan -n -q -i 0 -c 1 -W 0 ff02::1 >/dev/null"
|
|
eval $ping6cmd
|
|
|
|
# generate a list of ndisc6 commands (exclude link-local addrs since it is done above):
|
|
# ndisc6 -q -w 0 -1 <IP 1> <VLAN interface>;
|
|
# ndisc6 -q -w 0 -1 <IP 2> <VLAN interface>;
|
|
# ...
|
|
ndisc6cmd="sed -e 's/^/ndisc6 -q -w 0 -1 /' -e 's/$/;/'"
|
|
ip6cmd="ip -6 neigh show | grep -v fe80 | grep $vlan | cut -d ' ' -f 1,3 | $ndisc6cmd"
|
|
eval `eval $ip6cmd`
|
|
|
|
if [[ $SUBTYPE == "dualtor" ]]; then
|
|
# manually set any remaining FAILED/INCOMPLETE entries to permanently INCOMPLETE
|
|
# this prevents any remaining INCOMPLETE entries from automatically transitioning to FAILED
|
|
# once these entries are incomplete, any subsequent neighbor advertisement messages
|
|
# are able to resolve the entry
|
|
|
|
# generates the following command for each failed or incomplete IPv6 neighbor
|
|
# ip neigh replace <neighbor IPv6> dev <VLAN name> nud incomplete
|
|
neigh_replace_template="sed -e 's/^/ip neigh replace /' -e 's/,/ dev /' -e 's/$/ nud incomplete;/'"
|
|
ip_neigh_replace_cmd="ip -6 neigh show | grep -v fe80 | grep $vlan | grep -E 'FAILED|INCOMPLETE' | cut -d ' ' -f 1,3 --output-delimiter=',' | $neigh_replace_template"
|
|
eval `eval $ip_neigh_replace_cmd`
|
|
|
|
# on dual ToR devices, try to resolve failed neighbor entries since
|
|
# these entries will have tunnel routes installed, preventing normal
|
|
# neighbor resolution (SWSS PR #2137)
|
|
|
|
# since ndisc6 is a userland process, the above ndisc6 commands are
|
|
# insufficient to update the kernel neighbor table for failed entries
|
|
|
|
# we don't need to do this for ipv4 neighbors since arping is able to
|
|
# update the kernel neighbor table
|
|
|
|
# generates the following command for each failed or incomplete IPv6 neighbor
|
|
# timeout 0.2 ping <neighbor IPv6> -n -q -i 0 -c 1 -W 1 -I <VLAN name> >/dev/null
|
|
ping6_template="sed -e 's/^/timeout 0.2 ping /' -e 's/,/ -n -q -i 0 -c 1 -W 1 -I /' -e 's/$/ >\/dev\/null;/'"
|
|
failed_ip6_neigh_cmd="ip -6 neigh show | grep -v fe80 | grep $vlan | grep -E 'FAILED|INCOMPLETE' | cut -d ' ' -f 1,3 --output-delimiter=',' | $ping6_template"
|
|
eval `eval $failed_ip6_neigh_cmd`
|
|
fi
|
|
done
|
|
|
|
|
|
# sleep here before handling the mismatch as it is not required during startup
|
|
sleep 300
|
|
|
|
# refresh neighbor entries from APP_DB in case of mismatch with kernel
|
|
DBNEIGH=$(sonic-db-cli APPL_DB keys NEIGH_TABLE*)
|
|
KERNEIGH4=$(ip -4 neigh show | grep Vlan | cut -d ' ' -f 1,3 --output-delimiter=',')
|
|
KERNEIGH6=$(ip -6 neigh show | grep -v fe80 | grep Vlan | cut -d ' ' -f 1,3 --output-delimiter=',')
|
|
for neigh in $DBNEIGH; do
|
|
intf="$( cut -d ':' -f 2 <<< "$neigh" )"
|
|
ip="$( cut -d ':' -f 3- <<< "$neigh" )"
|
|
if [[ $intf == *"Vlan"* ]]; then
|
|
if [[ $ip == *"."* ]] && [[ ! $KERNEIGH4 =~ "${ip},${intf}" ]]; then
|
|
pingcmd="timeout 0.2 ping -I $intf -n -q -i 0 -c 1 -W 1 $ip >/dev/null"
|
|
eval $pingcmd
|
|
logger "arp_update: mismatch arp entry, pinging ${ip} on ${intf}"
|
|
elif [[ $ip == *":"* ]] && [[ ! $KERNEIGH6 =~ "${ip},${intf}" ]]; then
|
|
ping6cmd="timeout 0.2 ping6 -I $intf -n -q -i 0 -c 1 -W 1 $ip >/dev/null"
|
|
eval $ping6cmd
|
|
logger "arp_update: mismatch v6 nbr entry, pinging ${ip} on ${intf}"
|
|
fi
|
|
fi
|
|
done
|
|
|
|
done
|