05f1a5a31e
Add a watchdog mechanism to the swss service and generate an alert when swss has an issue. **Work item tracking** Microsoft ADO (number only): 16578912 **What I did** Added an orchagent watchdog to monitor and alert on orchagent stuck issues. **Why I did it** Currently the SONiC monit system only monitors whether the orchagent process exists. If the orchagent process gets stuck and stops processing, the current monit cannot detect and report it. **How I verified it** All UTs pass. Manually verified that process_monitoring/test_critical_process_monitoring.py passes. Added a new UT, https://github.com/sonic-net/sonic-mgmt/pull/8306, to check that the watchdog works correctly. Manual test: after pausing orchagent with 'kill -STOP <pid>', checked that a warning message appears in the log: Apr 28 23:36:41.504923 vlab-01 ERR swss#supervisor-proc-watchdog-listener: Process 'orchagent' is stuck in namespace 'host' (1.0 minutes). **Details if related** Heartbeat message PR: https://github.com/sonic-net/sonic-swss/pull/2737 UT PR: https://github.com/sonic-net/sonic-mgmt/pull/8306
292 lines
7.2 KiB
Django/Jinja
292 lines
7.2 KiB
Django/Jinja
; Global supervisord settings: the main log is capped at 1MB with 2 rotated
; backups, and supervisord stays in the foreground (nodaemon=true).
; NOTE(review): foreground mode is presumably because supervisord is the
; container's main process - confirm.
[supervisord]
|
|
logfile_maxbytes=1MB
|
|
logfile_backups=2
|
|
nodaemon=true
|
|
|
|
; Event listener that runs the supervisord_dependent_startup Python module and
; subscribes to every PROCESS_STATE event.
; NOTE(review): presumably this is what launches the programs in this file that
; set dependent_startup=true (all of them use autostart=false) once their
; dependent_startup_wait_for condition holds - confirm against the module docs.
; Exit codes 0 and 3 are declared as expected, so with autorestart=unexpected
; they do not trigger a restart; startretries=0 disables start retries.
[eventlistener:dependent-startup]
|
|
command=python3 -m supervisord_dependent_startup
|
|
autostart=true
|
|
autorestart=unexpected
|
|
startretries=0
|
|
exitcodes=0,3
|
|
events=PROCESS_STATE
|
|
buffer_size=1024
|
|
|
|
; Event listener for the swss container. Besides process EXITED/RUNNING state
; changes it also subscribes to PROCESS_COMMUNICATION_STDOUT events.
; NOTE(review): the stdout communication events are presumably the orchagent
; heartbeat used for stuck-process (watchdog) detection - confirm against the
; listener implementation.
[eventlistener:supervisor-proc-exit-listener]
|
|
command=/usr/bin/supervisor-proc-exit-listener --container-name swss
|
|
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING,PROCESS_COMMUNICATION_STDOUT
|
|
autostart=true
|
|
autorestart=unexpected
|
|
buffer_size=1024
|
|
|
|
; rsyslogd has the lowest priority value in this file (1) and declares no
; dependent_startup_wait_for. Other programs in this file wait on
; rsyslogd:running before starting.
; NOTE(review): -n presumably keeps rsyslogd in the foreground so supervisord
; can supervise it, and -iNONE presumably disables the pid file - confirm
; against the rsyslogd man page.
[program:rsyslogd]
|
|
command=/usr/sbin/rsyslogd -n -iNONE
|
|
priority=1
|
|
autostart=false
|
|
autorestart=unexpected
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
|
|
; Template variables used by the sections below:
;   is_fabric_asic - 0 by default, 1 when DEVICE_METADATA.localhost.switch_type
;                    is "fabric"; most program sections are emitted only when 0.
;   orchagent_dependent_startup_wait_for - orchagent waits for portsyncd:running
;                    by default; on fabric it waits for rsyslogd:running instead,
;                    because the portsyncd section is not emitted there.
;   asan_extra_options - suffix appended to every ASAN_OPTIONS value.
{% set is_fabric_asic = 0 %}
|
|
{% set orchagent_dependent_startup_wait_for = "portsyncd:running" %}
|
|
{% if DEVICE_METADATA.localhost.switch_type %}
|
|
{% if DEVICE_METADATA.localhost.switch_type == "fabric" %}
|
|
{% set is_fabric_asic = 1 %}
|
|
{% set orchagent_dependent_startup_wait_for = "rsyslogd:running" %}
|
|
{%- endif %}
|
|
{%- endif %}
|
|
{% set asan_extra_options = ':print_suppressions=0' %}
|
|
|
|
{% if is_fabric_asic == 0 %}
|
|
; gearsyncd (non-fabric only): started once rsyslogd is running, with the
; gearbox config file passed via -p. startsecs=0, startretries=0 and
; autorestart=false mean supervisord neither waits for it to stay up nor
; restarts it.
; NOTE(review): presumably a short-lived loader - confirm.
; When ENABLE_ASAN is "y", ASAN_OPTIONS points log_path at a per-daemon file.
[program:gearsyncd]
|
|
command=/usr/bin/gearsyncd -p /usr/share/sonic/hwsku/gearbox_config.json
|
|
priority=3
|
|
autostart=false
|
|
autorestart=false
|
|
startsecs=0
|
|
startretries=0
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for=rsyslogd:running
|
|
{% if ENABLE_ASAN == "y" %}
|
|
environment=ASAN_OPTIONS="log_path=/var/log/asan/gearsyncd-asan.log{{ asan_extra_options }}"
|
|
{% endif %}
|
|
{%- endif %}
|
|
|
|
{% if is_fabric_asic == 0 %}
|
|
; portsyncd (non-fabric only): started once rsyslogd is running. On non-fabric
; systems orchagent waits for portsyncd:running before it starts.
; When ENABLE_ASAN is "y", ASAN_OPTIONS points log_path at a per-daemon file.
[program:portsyncd]
|
|
command=/usr/bin/portsyncd
|
|
priority=3
|
|
autostart=false
|
|
autorestart=false
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for=rsyslogd:running
|
|
{% if ENABLE_ASAN == "y" %}
|
|
environment=ASAN_OPTIONS="log_path=/var/log/asan/portsyncd-asan.log{{ asan_extra_options }}"
|
|
{% endif %}
|
|
{%- endif %}
|
|
|
|
; orchagent: emitted for all switch types. Its start condition comes from the
; template variable orchagent_dependent_startup_wait_for (portsyncd:running, or
; rsyslogd:running on fabric).
; stdout_capture_maxbytes=1MB sets the size of the stdout capture buffer.
; NOTE(review): this presumably exists so supervisord can emit the
; PROCESS_COMMUNICATION_STDOUT events that the event listener in this file
; subscribes to (orchagent heartbeat) - confirm.
; autorestart=false: supervisord does not restart orchagent itself.
[program:orchagent]
|
|
command=/usr/bin/orchagent.sh
|
|
priority=4
|
|
autostart=false
|
|
autorestart=false
|
|
stdout_capture_maxbytes=1MB
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for={{ orchagent_dependent_startup_wait_for }}
|
|
{% if ENABLE_ASAN == "y" %}
|
|
environment=ASAN_OPTIONS="log_path=/var/log/asan/orchagent-asan.log{{ asan_extra_options }}"
|
|
{% endif %}
|
|
|
|
; swssconfig: runs swssconfig.sh after orchagent is running. Most sections
; below wait on swssconfig:exited, i.e. they start only after this script has
; finished. startsecs=0 lets an immediate exit count as a successful start;
; startretries=0 disables start retries.
[program:swssconfig]
|
|
command=/usr/bin/swssconfig.sh
|
|
priority=5
|
|
autostart=false
|
|
autorestart=unexpected
|
|
startretries=0
|
|
startsecs=0
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for=orchagent:running
|
|
{% if ENABLE_ASAN == "y" %}
|
|
environment=ASAN_OPTIONS="log_path=/var/log/asan/swssconfig-asan.log{{ asan_extra_options }}"
|
|
{% endif %}
|
|
|
|
{% if is_fabric_asic == 0 %}
|
|
; restore_neighbors (non-fabric only): Python script started after swssconfig
; has exited. startsecs=0, startretries=0 and autorestart=false mean it is
; neither waited on nor restarted. This section sets no ASAN_OPTIONS
; environment, unlike most other sections in this file.
[program:restore_neighbors]
|
|
command=/usr/bin/restore_neighbors.py
|
|
priority=6
|
|
autostart=false
|
|
autorestart=false
|
|
startsecs=0
|
|
startretries=0
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for=swssconfig:exited
|
|
{%- endif %}
|
|
|
|
{% if is_fabric_asic == 0 %}
|
|
; coppmgrd (non-fabric only): unlike most manager daemons in this file, it
; waits on orchagent:running rather than swssconfig:exited.
; When ENABLE_ASAN is "y", ASAN_OPTIONS points log_path at a per-daemon file.
[program:coppmgrd]
|
|
command=/usr/bin/coppmgrd
|
|
priority=6
|
|
autostart=false
|
|
autorestart=false
|
|
startretries=0
|
|
startsecs=0
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for=orchagent:running
|
|
{% if ENABLE_ASAN == "y" %}
|
|
environment=ASAN_OPTIONS="log_path=/var/log/asan/coppmgrd-asan.log{{ asan_extra_options }}"
|
|
{% endif %}
|
|
{%- endif %}
|
|
|
|
{% if is_fabric_asic == 0 %}
|
|
; neighsyncd (non-fabric only): started after swssconfig has exited; not
; restarted by supervisord (autorestart=false).
; When ENABLE_ASAN is "y", ASAN_OPTIONS points log_path at a per-daemon file.
[program:neighsyncd]
|
|
command=/usr/bin/neighsyncd
|
|
priority=7
|
|
autostart=false
|
|
autorestart=false
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for=swssconfig:exited
|
|
{% if ENABLE_ASAN == "y" %}
|
|
environment=ASAN_OPTIONS="log_path=/var/log/asan/neighsyncd-asan.log{{ asan_extra_options }}"
|
|
{% endif %}
|
|
{%- endif %}
|
|
|
|
{% if is_fabric_asic == 0 %}
|
|
; vlanmgrd (non-fabric only): started after swssconfig has exited; not
; restarted by supervisord (autorestart=false).
; When ENABLE_ASAN is "y", ASAN_OPTIONS points log_path at a per-daemon file.
[program:vlanmgrd]
|
|
command=/usr/bin/vlanmgrd
|
|
priority=8
|
|
autostart=false
|
|
autorestart=false
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for=swssconfig:exited
|
|
{% if ENABLE_ASAN == "y" %}
|
|
environment=ASAN_OPTIONS="log_path=/var/log/asan/vlanmgrd-asan.log{{ asan_extra_options }}"
|
|
{% endif %}
|
|
{%- endif %}
|
|
|
|
{% if is_fabric_asic == 0 %}
|
|
; intfmgrd (non-fabric only): started after swssconfig has exited; not
; restarted by supervisord (autorestart=false).
; When ENABLE_ASAN is "y", ASAN_OPTIONS points log_path at a per-daemon file.
[program:intfmgrd]
|
|
command=/usr/bin/intfmgrd
|
|
priority=9
|
|
autostart=false
|
|
autorestart=false
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for=swssconfig:exited
|
|
{% if ENABLE_ASAN == "y" %}
|
|
environment=ASAN_OPTIONS="log_path=/var/log/asan/intfmgrd-asan.log{{ asan_extra_options }}"
|
|
{% endif %}
|
|
{%- endif %}
|
|
|
|
{% if is_fabric_asic == 0 %}
|
|
; portmgrd (non-fabric only): started after swssconfig has exited; not
; restarted by supervisord (autorestart=false).
; When ENABLE_ASAN is "y", ASAN_OPTIONS points log_path at a per-daemon file.
[program:portmgrd]
|
|
command=/usr/bin/portmgrd
|
|
priority=10
|
|
autostart=false
|
|
autorestart=false
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for=swssconfig:exited
|
|
{% if ENABLE_ASAN == "y" %}
|
|
environment=ASAN_OPTIONS="log_path=/var/log/asan/portmgrd-asan.log{{ asan_extra_options }}"
|
|
{% endif %}
|
|
{%- endif %}
|
|
|
|
{% if is_fabric_asic == 0 %}
|
|
; buffermgrd (non-fabric only): launched through the buffermgrd.sh wrapper
; script after swssconfig has exited; not restarted by supervisord.
; When ENABLE_ASAN is "y", ASAN_OPTIONS points log_path at a per-daemon file.
[program:buffermgrd]
|
|
command=/usr/bin/buffermgrd.sh
|
|
priority=11
|
|
autostart=false
|
|
autorestart=false
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for=swssconfig:exited
|
|
{% if ENABLE_ASAN == "y" %}
|
|
environment=ASAN_OPTIONS="log_path=/var/log/asan/buffermgrd-asan.log{{ asan_extra_options }}"
|
|
{% endif %}
|
|
{%- endif %}
|
|
|
|
{% if is_fabric_asic == 0 %}
|
|
; vrfmgrd (non-fabric only): started after swssconfig has exited; not
; restarted by supervisord (autorestart=false).
; When ENABLE_ASAN is "y", ASAN_OPTIONS points log_path at a per-daemon file.
[program:vrfmgrd]
|
|
command=/usr/bin/vrfmgrd
|
|
priority=13
|
|
autostart=false
|
|
autorestart=false
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for=swssconfig:exited
|
|
{% if ENABLE_ASAN == "y" %}
|
|
environment=ASAN_OPTIONS="log_path=/var/log/asan/vrfmgrd-asan.log{{ asan_extra_options }}"
|
|
{% endif %}
|
|
{%- endif %}
|
|
|
|
{% if is_fabric_asic == 0 %}
|
|
; nbrmgrd (non-fabric only): started after swssconfig has exited; not
; restarted by supervisord (autorestart=false).
; When ENABLE_ASAN is "y", ASAN_OPTIONS points log_path at a per-daemon file.
[program:nbrmgrd]
|
|
command=/usr/bin/nbrmgrd
|
|
priority=15
|
|
autostart=false
|
|
autorestart=false
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for=swssconfig:exited
|
|
{% if ENABLE_ASAN == "y" %}
|
|
environment=ASAN_OPTIONS="log_path=/var/log/asan/nbrmgrd-asan.log{{ asan_extra_options }}"
|
|
{% endif %}
|
|
{%- endif %}
|
|
|
|
{% if is_fabric_asic == 0 %}
|
|
; vxlanmgrd (non-fabric only): started after swssconfig has exited; not
; restarted by supervisord (autorestart=false).
; When ENABLE_ASAN is "y", ASAN_OPTIONS points log_path at a per-daemon file.
[program:vxlanmgrd]
|
|
command=/usr/bin/vxlanmgrd
|
|
priority=16
|
|
autostart=false
|
|
autorestart=false
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for=swssconfig:exited
|
|
{% if ENABLE_ASAN == "y" %}
|
|
environment=ASAN_OPTIONS="log_path=/var/log/asan/vxlanmgrd-asan.log{{ asan_extra_options }}"
|
|
{% endif %}
|
|
{%- endif %}
|
|
|
|
{% if is_fabric_asic == 0 %}
|
|
; tunnelmgrd (non-fabric only): started after swssconfig has exited; not
; restarted by supervisord (autorestart=false).
; When ENABLE_ASAN is "y", ASAN_OPTIONS points log_path at a per-daemon file.
[program:tunnelmgrd]
|
|
command=/usr/bin/tunnelmgrd
|
|
priority=17
|
|
autostart=false
|
|
autorestart=false
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for=swssconfig:exited
|
|
{% if ENABLE_ASAN == "y" %}
|
|
environment=ASAN_OPTIONS="log_path=/var/log/asan/tunnelmgrd-asan.log{{ asan_extra_options }}"
|
|
{% endif %}
|
|
{%- endif %}
|
|
|
|
; enable_counters: Python script emitted for all switch types (it is not
; wrapped in the is_fabric_asic check). Started after swssconfig has exited.
; This section sets no ASAN_OPTIONS environment.
[program:enable_counters]
|
|
command=/usr/bin/enable_counters.py
|
|
priority=12
|
|
autostart=false
|
|
autorestart=false
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for=swssconfig:exited
|
|
|
|
{% if is_fabric_asic == 0 %}
|
|
; fdbsyncd (non-fabric only): started after swssconfig has exited; not
; restarted by supervisord (autorestart=false).
; When ENABLE_ASAN is "y", ASAN_OPTIONS points log_path at a per-daemon file.
[program:fdbsyncd]
|
|
command=/usr/bin/fdbsyncd
|
|
priority=17
|
|
autostart=false
|
|
autorestart=false
|
|
stdout_logfile=syslog
|
|
stderr_logfile=syslog
|
|
dependent_startup=true
|
|
dependent_startup_wait_for=swssconfig:exited
|
|
{% if ENABLE_ASAN == "y" %}
|
|
environment=ASAN_OPTIONS="log_path=/var/log/asan/fdbsyncd-asan.log{{ asan_extra_options }}"
|
|
{% endif %}
|
|
{%- endif %}
|