sonic-buildimage/dockers/docker-orchagent/supervisord.conf.j2
Hua Liu 05f1a5a31e
Add watchdog mechanism to swss service and generate alert when swss have issue. (#15429)
Add watchdog mechanism to swss service and generate alert when swss have issue. 

**Work item tracking**
Microsoft ADO (number only): 16578912

**What I did**
Add orchagent watchdog to monitor and alert orchagent stuck issue.

**Why I did it**
Currently SONiC monit system only monit orchagent process exist or not. If orchagent process stuck and stop processing, current monit can't find and report it.

**How I verified it**
Pass all UT.

Manually test process_monitoring/test_critical_process_monitoring.py can pass.

Add new UT https://github.com/sonic-net/sonic-mgmt/pull/8306 to check watchdog works correctly.

Manually test, after pause orchagent with 'kill -STOP <pid>', check there are warning message exist in log:

Apr 28 23:36:41.504923 vlab-01 ERR swss#supervisor-proc-watchdog-listener: Process 'orchagent' is stuck in namespace 'host' (1.0 minutes).

**Details if related**
Heartbeat message PR: https://github.com/sonic-net/sonic-swss/pull/2737
UT PR: https://github.com/sonic-net/sonic-mgmt/pull/8306
2023-06-12 17:53:54 -07:00

292 lines
7.2 KiB
Django/Jinja

[supervisord]
logfile_maxbytes=1MB
logfile_backups=2
nodaemon=true
[eventlistener:dependent-startup]
command=python3 -m supervisord_dependent_startup
autostart=true
autorestart=unexpected
startretries=0
exitcodes=0,3
events=PROCESS_STATE
buffer_size=1024
[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener --container-name swss
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING,PROCESS_COMMUNICATION_STDOUT
autostart=true
autorestart=unexpected
buffer_size=1024
[program:rsyslogd]
command=/usr/sbin/rsyslogd -n -iNONE
priority=1
autostart=false
autorestart=unexpected
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
{% set is_fabric_asic = 0 %}
{% set orchagent_dependent_startup_wait_for = "portsyncd:running" %}
{% if DEVICE_METADATA.localhost.switch_type %}
{% if DEVICE_METADATA.localhost.switch_type == "fabric" %}
{% set is_fabric_asic = 1 %}
{% set orchagent_dependent_startup_wait_for = "rsyslogd:running" %}
{%- endif %}
{%- endif %}
{% set asan_extra_options = ':print_suppressions=0' %}
{% if is_fabric_asic == 0 %}
[program:gearsyncd]
command=/usr/bin/gearsyncd -p /usr/share/sonic/hwsku/gearbox_config.json
priority=3
autostart=false
autorestart=false
startsecs=0
startretries=0
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=rsyslogd:running
{% if ENABLE_ASAN == "y" %}
environment=ASAN_OPTIONS="log_path=/var/log/asan/gearsyncd-asan.log{{ asan_extra_options }}"
{% endif %}
{%- endif %}
{% if is_fabric_asic == 0 %}
[program:portsyncd]
command=/usr/bin/portsyncd
priority=3
autostart=false
autorestart=false
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=rsyslogd:running
{% if ENABLE_ASAN == "y" %}
environment=ASAN_OPTIONS="log_path=/var/log/asan/portsyncd-asan.log{{ asan_extra_options }}"
{% endif %}
{%- endif %}
[program:orchagent]
command=/usr/bin/orchagent.sh
priority=4
autostart=false
autorestart=false
stdout_capture_maxbytes=1MB
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for={{ orchagent_dependent_startup_wait_for }}
{% if ENABLE_ASAN == "y" %}
environment=ASAN_OPTIONS="log_path=/var/log/asan/orchagent-asan.log{{ asan_extra_options }}"
{% endif %}
[program:swssconfig]
command=/usr/bin/swssconfig.sh
priority=5
autostart=false
autorestart=unexpected
startretries=0
startsecs=0
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=orchagent:running
{% if ENABLE_ASAN == "y" %}
environment=ASAN_OPTIONS="log_path=/var/log/asan/swssconfig-asan.log{{ asan_extra_options }}"
{% endif %}
{% if is_fabric_asic == 0 %}
[program:restore_neighbors]
command=/usr/bin/restore_neighbors.py
priority=6
autostart=false
autorestart=false
startsecs=0
startretries=0
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=swssconfig:exited
{%- endif %}
{% if is_fabric_asic == 0 %}
[program:coppmgrd]
command=/usr/bin/coppmgrd
priority=6
autostart=false
autorestart=false
startretries=0
startsecs=0
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=orchagent:running
{% if ENABLE_ASAN == "y" %}
environment=ASAN_OPTIONS="log_path=/var/log/asan/coppmgrd-asan.log{{ asan_extra_options }}"
{% endif %}
{%- endif %}
{% if is_fabric_asic == 0 %}
[program:neighsyncd]
command=/usr/bin/neighsyncd
priority=7
autostart=false
autorestart=false
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=swssconfig:exited
{% if ENABLE_ASAN == "y" %}
environment=ASAN_OPTIONS="log_path=/var/log/asan/neighsyncd-asan.log{{ asan_extra_options }}"
{% endif %}
{%- endif %}
{% if is_fabric_asic == 0 %}
[program:vlanmgrd]
command=/usr/bin/vlanmgrd
priority=8
autostart=false
autorestart=false
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=swssconfig:exited
{% if ENABLE_ASAN == "y" %}
environment=ASAN_OPTIONS="log_path=/var/log/asan/vlanmgrd-asan.log{{ asan_extra_options }}"
{% endif %}
{%- endif %}
{% if is_fabric_asic == 0 %}
[program:intfmgrd]
command=/usr/bin/intfmgrd
priority=9
autostart=false
autorestart=false
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=swssconfig:exited
{% if ENABLE_ASAN == "y" %}
environment=ASAN_OPTIONS="log_path=/var/log/asan/intfmgrd-asan.log{{ asan_extra_options }}"
{% endif %}
{%- endif %}
{% if is_fabric_asic == 0 %}
[program:portmgrd]
command=/usr/bin/portmgrd
priority=10
autostart=false
autorestart=false
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=swssconfig:exited
{% if ENABLE_ASAN == "y" %}
environment=ASAN_OPTIONS="log_path=/var/log/asan/portmgrd-asan.log{{ asan_extra_options }}"
{% endif %}
{%- endif %}
{% if is_fabric_asic == 0 %}
[program:buffermgrd]
command=/usr/bin/buffermgrd.sh
priority=11
autostart=false
autorestart=false
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=swssconfig:exited
{% if ENABLE_ASAN == "y" %}
environment=ASAN_OPTIONS="log_path=/var/log/asan/buffermgrd-asan.log{{ asan_extra_options }}"
{% endif %}
{%- endif %}
{% if is_fabric_asic == 0 %}
[program:vrfmgrd]
command=/usr/bin/vrfmgrd
priority=13
autostart=false
autorestart=false
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=swssconfig:exited
{% if ENABLE_ASAN == "y" %}
environment=ASAN_OPTIONS="log_path=/var/log/asan/vrfmgrd-asan.log{{ asan_extra_options }}"
{% endif %}
{%- endif %}
{% if is_fabric_asic == 0 %}
[program:nbrmgrd]
command=/usr/bin/nbrmgrd
priority=15
autostart=false
autorestart=false
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=swssconfig:exited
{% if ENABLE_ASAN == "y" %}
environment=ASAN_OPTIONS="log_path=/var/log/asan/nbrmgrd-asan.log{{ asan_extra_options }}"
{% endif %}
{%- endif %}
{% if is_fabric_asic == 0 %}
[program:vxlanmgrd]
command=/usr/bin/vxlanmgrd
priority=16
autostart=false
autorestart=false
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=swssconfig:exited
{% if ENABLE_ASAN == "y" %}
environment=ASAN_OPTIONS="log_path=/var/log/asan/vxlanmgrd-asan.log{{ asan_extra_options }}"
{% endif %}
{%- endif %}
{% if is_fabric_asic == 0 %}
[program:tunnelmgrd]
command=/usr/bin/tunnelmgrd
priority=17
autostart=false
autorestart=false
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=swssconfig:exited
{% if ENABLE_ASAN == "y" %}
environment=ASAN_OPTIONS="log_path=/var/log/asan/tunnelmgrd-asan.log{{ asan_extra_options }}"
{% endif %}
{%- endif %}
[program:enable_counters]
command=/usr/bin/enable_counters.py
priority=12
autostart=false
autorestart=false
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=swssconfig:exited
{% if is_fabric_asic == 0 %}
[program:fdbsyncd]
command=/usr/bin/fdbsyncd
priority=17
autostart=false
autorestart=false
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=swssconfig:exited
{% if ENABLE_ASAN == "y" %}
environment=ASAN_OPTIONS="log_path=/var/log/asan/fdbsyncd-asan.log{{ asan_extra_options }}"
{% endif %}
{%- endif %}