[supervisor] Add patch to prevent 'supervisorctl start' command from hanging if system time has rolled backward (#1311)
* Add patch to prevent 'supervisorctl start' command from hanging if system time has rolled backward * Also add unit tests for clock rollback scenarios
This commit is contained in:
parent
134707f822
commit
0fa64cc618
1
.gitignore
vendored
1
.gitignore
vendored
@ -35,6 +35,7 @@ src/sonic-device-data/src/device/
|
||||
src/sonic-device-data/src/debian/
|
||||
src/supervisor/*
|
||||
!src/supervisor/Makefile
|
||||
!src/supervisor/patch/
|
||||
src/thrift/*
|
||||
!src/thrift/Makefile
|
||||
|
||||
|
@ -1,5 +1,9 @@
|
||||
# supervisor package
|
||||
|
||||
SUPERVISOR = python-supervisor_3.3.2-1_all.deb
|
||||
SUPERVISOR_VERSION = 3.3.2
|
||||
|
||||
export SUPERVISOR_VERSION
|
||||
|
||||
SUPERVISOR = python-supervisor_$(SUPERVISOR_VERSION)-1_all.deb
|
||||
$(SUPERVISOR)_SRC_PATH = $(SRC_PATH)/supervisor
|
||||
SONIC_MAKE_DEBS += $(SUPERVISOR)
|
||||
|
@ -2,7 +2,7 @@
|
||||
SHELL = /bin/bash
|
||||
.SHELLFLAGS += -e
|
||||
|
||||
MAIN_TARGET = python-supervisor_3.3.2-1_all.deb
|
||||
MAIN_TARGET = python-supervisor_$(SUPERVISOR_VERSION)-1_all.deb
|
||||
|
||||
$(addprefix $(DEST)/, $(MAIN_TARGET)): $(DEST)/% :
|
||||
# Remove any stale files
|
||||
@ -11,7 +11,15 @@ $(addprefix $(DEST)/, $(MAIN_TARGET)): $(DEST)/% :
|
||||
# Clone supervisor repo
|
||||
git clone https://github.com/Supervisor/supervisor.git
|
||||
pushd ./supervisor
|
||||
git checkout -f 3.3.2
|
||||
|
||||
# Reset HEAD to the commit of the proper tag
|
||||
# NOTE: Using "git checkout <tag_name>" here detaches our HEAD,
|
||||
# which stg doesn't like, so we use this method instead
|
||||
git reset --hard $(SUPERVISOR_VERSION)
|
||||
|
||||
# Apply patches
|
||||
stg init
|
||||
stg import -s ../patch/series
|
||||
|
||||
# Build Python and Debian package
|
||||
python setup.py --command-packages=stdeb.command bdist_deb
|
||||
|
@ -0,0 +1,83 @@
|
||||
From bac496af4c2051b3c15038e06533f9094b1c3604 Mon Sep 17 00:00:00 2001
|
||||
From: Joe LeVeque <jolevequ@microsoft.com>
|
||||
Date: Tue, 16 Jan 2018 20:42:13 +0000
|
||||
Subject: [PATCH 1/2] Prevent excessive hanging/delays in process state
|
||||
transitions if the system time rolls backward
|
||||
|
||||
---
|
||||
supervisor/process.py | 32 ++++++++++++++++++++++++++++++++
|
||||
1 file changed, 32 insertions(+)
|
||||
|
||||
diff --git a/supervisor/process.py b/supervisor/process.py
|
||||
index f9ddcd9..216372f 100644
|
||||
--- a/supervisor/process.py
|
||||
+++ b/supervisor/process.py
|
||||
@@ -357,6 +357,12 @@ class Subprocess:
|
||||
""" Log a 'waiting for x to stop' message with throttling. """
|
||||
if self.state == ProcessStates.STOPPING:
|
||||
now = time.time()
|
||||
+
|
||||
+ # If the system clock has moved backward, reset
|
||||
+ # self.laststopreport to current system time
|
||||
+ if now < self.laststopreport:
|
||||
+ self.laststopreport = now;
|
||||
+
|
||||
if now > (self.laststopreport + 2): # every 2 seconds
|
||||
self.config.options.logger.info(
|
||||
'waiting for %s to stop' % self.config.name)
|
||||
@@ -604,12 +610,27 @@ class Subprocess:
|
||||
# STOPPED -> STARTING
|
||||
self.spawn()
|
||||
elif state == ProcessStates.BACKOFF:
|
||||
+ if self.delay > 0 and now < self.delay - self.backoff:
|
||||
+ # The system clock appears to have moved backward
|
||||
+ # Reset self.delay accordingly
|
||||
+ self.delay = now + self.backoff
|
||||
+
|
||||
if self.backoff <= self.config.startretries:
|
||||
if now > self.delay:
|
||||
# BACKOFF -> STARTING
|
||||
self.spawn()
|
||||
|
||||
if state == ProcessStates.STARTING:
|
||||
+ if now < self.laststart:
|
||||
+ # The system clock appears to have moved backward
|
||||
+ # Reset self.laststart to current system time
|
||||
+ self.laststart = now;
|
||||
+
|
||||
+ if self.delay > 0 and now < self.delay - self.config.startsecs:
|
||||
+ # The system clock appears to have moved backward
|
||||
+ # Reset self.delay accordingly
|
||||
+ self.delay = now + self.config.startsecs
|
||||
+
|
||||
if now - self.laststart > self.config.startsecs:
|
||||
# STARTING -> RUNNING if the proc has started
|
||||
# successfully and it has stayed up for at least
|
||||
@@ -633,6 +654,11 @@ class Subprocess:
|
||||
logger.info('gave up: %s %s' % (self.config.name, msg))
|
||||
|
||||
elif state == ProcessStates.STOPPING:
|
||||
+ if self.delay > 0 and now < self.delay - self.config.stopwaitsecs:
|
||||
+ # The system clock appears to have moved backward
|
||||
+ # Reset self.delay accordingly
|
||||
+ self.delay = now + self.config.stopwaitsecs
|
||||
+
|
||||
time_left = self.delay - now
|
||||
if time_left <= 0:
|
||||
# kill processes which are taking too long to stop with a final
|
||||
@@ -805,6 +831,12 @@ class EventListenerPool(ProcessGroupBase):
|
||||
if dispatch_capable:
|
||||
if self.dispatch_throttle:
|
||||
now = time.time()
|
||||
+
|
||||
+ # If the system clock has moved backward, reset
|
||||
+ # self.last_dispatch to current system time
|
||||
+ if now < self.last_dispatch:
|
||||
+ self.last_dispatch = now;
|
||||
+
|
||||
if now - self.last_dispatch < self.dispatch_throttle:
|
||||
return
|
||||
self.dispatch()
|
||||
--
|
||||
2.1.4
|
||||
|
@ -0,0 +1,184 @@
|
||||
From 43b6f3ee93593f666af2c6a7af897bf3b5ee762d Mon Sep 17 00:00:00 2001
|
||||
From: Joe LeVeque <jolevequ@microsoft.com>
|
||||
Date: Wed, 17 Jan 2018 22:27:56 +0000
|
||||
Subject: [PATCH 2/2] Add unit test cases to ensure state transitions work if
|
||||
system time has rolled backward
|
||||
|
||||
---
|
||||
supervisor/tests/test_process.py | 146 +++++++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 146 insertions(+)
|
||||
|
||||
diff --git a/supervisor/tests/test_process.py b/supervisor/tests/test_process.py
|
||||
index 7f9bab2..ea16315 100644
|
||||
--- a/supervisor/tests/test_process.py
|
||||
+++ b/supervisor/tests/test_process.py
|
||||
@@ -736,6 +736,40 @@ class SubprocessTests(unittest.TestCase):
|
||||
instance.stop_report()
|
||||
self.assertEqual(len(options.logger.data), 1) # throttled
|
||||
|
||||
+ def test_stop_report_laststopreport_in_future(self):
|
||||
+ future_time = time.time() + 3600 # 1 hour into the future
|
||||
+ options = DummyOptions()
|
||||
+ config = DummyPConfig(options, 'test', '/test')
|
||||
+ instance = self._makeOne(config)
|
||||
+ instance.pid = 11
|
||||
+ dispatcher = DummyDispatcher(writable=True)
|
||||
+ instance.dispatchers = {'foo':dispatcher}
|
||||
+ from supervisor.states import ProcessStates
|
||||
+ instance.state = ProcessStates.STOPPING
|
||||
+ instance.laststopreport = future_time
|
||||
+
|
||||
+ # This iteration of stop_report() should reset instance.laststopreport
|
||||
+ # to the current time
|
||||
+ instance.stop_report()
|
||||
+
|
||||
+ # No logging should have taken place
|
||||
+ self.assertEqual(len(options.logger.data), 0)
|
||||
+
|
||||
+ # Ensure instance.laststopreport has rolled backward
|
||||
+ self.assertTrue(instance.laststopreport < future_time)
|
||||
+
|
||||
+ # Sleep for 2 seconds
|
||||
+ time.sleep(2)
|
||||
+
|
||||
+ # This iteration of stop_report() should actaully trigger the report
|
||||
+ instance.stop_report()
|
||||
+
|
||||
+ self.assertEqual(len(options.logger.data), 1)
|
||||
+ self.assertEqual(options.logger.data[0], 'waiting for test to stop')
|
||||
+ self.assertNotEqual(instance.laststopreport, 0)
|
||||
+ instance.stop_report()
|
||||
+ self.assertEqual(len(options.logger.data), 1) # throttled
|
||||
+
|
||||
def test_give_up(self):
|
||||
options = DummyOptions()
|
||||
config = DummyPConfig(options, 'test', '/test')
|
||||
@@ -1402,6 +1436,92 @@ class SubprocessTests(unittest.TestCase):
|
||||
event = L[0]
|
||||
self.assertEqual(event.__class__, events.ProcessStateRunningEvent)
|
||||
|
||||
+ def test_transition_starting_to_running_laststart_in_future(self):
|
||||
+ from supervisor import events
|
||||
+ L = []
|
||||
+ events.subscribe(events.ProcessStateEvent, lambda x: L.append(x))
|
||||
+ from supervisor.states import ProcessStates
|
||||
+
|
||||
+ future_time = time.time() + 3600 # 1 hour into the future
|
||||
+ options = DummyOptions()
|
||||
+ test_startsecs = 2
|
||||
+
|
||||
+ # this should go from STARTING to RUNNING via transition()
|
||||
+ pconfig = DummyPConfig(options, 'process', 'process','/bin/process',
|
||||
+ startsecs=test_startsecs)
|
||||
+ process = self._makeOne(pconfig)
|
||||
+ process.backoff = 1
|
||||
+ process.delay = 1
|
||||
+ process.system_stop = False
|
||||
+ process.laststart = future_time
|
||||
+ process.pid = 1
|
||||
+ process.stdout_buffer = 'abc'
|
||||
+ process.stderr_buffer = 'def'
|
||||
+ process.state = ProcessStates.STARTING
|
||||
+
|
||||
+ # This iteration of transition() should reset process.laststart
|
||||
+ # to the current time
|
||||
+ process.transition()
|
||||
+
|
||||
+ # Process state should still be STARTING
|
||||
+ self.assertEqual(process.state, ProcessStates.STARTING)
|
||||
+
|
||||
+ # Ensure process.laststart has rolled backward
|
||||
+ self.assertTrue(process.laststart < future_time)
|
||||
+
|
||||
+ # Sleep for (startsecs + 1)
|
||||
+ time.sleep(test_startsecs + 1)
|
||||
+
|
||||
+ # This iteration of transition() should actaully trigger the state
|
||||
+ # transition to RUNNING
|
||||
+ process.transition()
|
||||
+
|
||||
+ # this implies RUNNING
|
||||
+ self.assertEqual(process.backoff, 0)
|
||||
+ self.assertEqual(process.delay, 0)
|
||||
+ self.assertFalse(process.system_stop)
|
||||
+ self.assertEqual(process.state, ProcessStates.RUNNING)
|
||||
+ self.assertEqual(options.logger.data[0],
|
||||
+ 'success: process entered RUNNING state, process has '
|
||||
+ 'stayed up for > than {} seconds (startsecs)'.format(test_startsecs))
|
||||
+ self.assertEqual(len(L), 1)
|
||||
+ event = L[0]
|
||||
+ self.assertEqual(event.__class__, events.ProcessStateRunningEvent)
|
||||
+
|
||||
+ def test_transition_backoff_to_starting_delay_in_future(self):
|
||||
+ from supervisor import events
|
||||
+ L = []
|
||||
+ events.subscribe(events.ProcessStateEvent, lambda x: L.append(x))
|
||||
+ from supervisor.states import ProcessStates, SupervisorStates
|
||||
+
|
||||
+ future_time = time.time() + 3600 # 1 hour into the future
|
||||
+ options = DummyOptions()
|
||||
+
|
||||
+ pconfig = DummyPConfig(options, 'process', 'process','/bin/process')
|
||||
+ process = self._makeOne(pconfig)
|
||||
+ process.laststart = 1
|
||||
+ process.delay = future_time
|
||||
+ process.backoff = 0
|
||||
+ process.state = ProcessStates.BACKOFF
|
||||
+
|
||||
+ # This iteration of transition() should reset process.delay
|
||||
+ # to the current time
|
||||
+ process.transition()
|
||||
+
|
||||
+ # Process state should still be BACKOFF
|
||||
+ self.assertEqual(process.state, ProcessStates.BACKOFF)
|
||||
+
|
||||
+ # Ensure process.delay has rolled backward
|
||||
+ self.assertTrue(process.delay < future_time)
|
||||
+
|
||||
+ # This iteration of transition() should actaully trigger the state
|
||||
+ # transition to STARTING
|
||||
+ process.transition()
|
||||
+
|
||||
+ self.assertEqual(process.state, ProcessStates.STARTING)
|
||||
+ self.assertEqual(len(L), 1)
|
||||
+ self.assertEqual(L[0].__class__, events.ProcessStateStartingEvent)
|
||||
+
|
||||
def test_transition_backoff_to_fatal(self):
|
||||
from supervisor import events
|
||||
L = []
|
||||
@@ -2033,6 +2153,32 @@ class EventListenerPoolTests(ProcessGroupBaseTests):
|
||||
self.assertEqual(process1.listener_state, EventListenerStates.BUSY)
|
||||
self.assertEqual(process1.event, event)
|
||||
|
||||
+ def test_transition_event_proc_running_with_dispatch_throttle_last_dispatch_in_future(self):
|
||||
+ future_time = time.time() + 3600 # 1 hour into the future
|
||||
+ options = DummyOptions()
|
||||
+ from supervisor.states import ProcessStates
|
||||
+ pconfig1 = DummyPConfig(options, 'process1', 'process1','/bin/process1')
|
||||
+ process1 = DummyProcess(pconfig1, state=ProcessStates.RUNNING)
|
||||
+ gconfig = DummyPGroupConfig(options, pconfigs=[pconfig1])
|
||||
+ pool = self._makeOne(gconfig)
|
||||
+ pool.dispatch_throttle = 5
|
||||
+ pool.last_dispatch = future_time
|
||||
+ pool.processes = {'process1': process1}
|
||||
+ event = DummyEvent()
|
||||
+ from supervisor.states import EventListenerStates
|
||||
+ process1.listener_state = EventListenerStates.READY
|
||||
+ class DummyGroup:
|
||||
+ config = gconfig
|
||||
+ process1.group = DummyGroup
|
||||
+ pool._acceptEvent(event)
|
||||
+ pool.transition()
|
||||
+
|
||||
+ self.assertEqual(process1.transitioned, True)
|
||||
+ self.assertEqual(pool.event_buffer, [event]) # not popped
|
||||
+
|
||||
+ # Ensure pool.last_dispatch has been rolled backward
|
||||
+ self.assertTrue(pool.last_dispatch < future_time)
|
||||
+
|
||||
def test__dispatchEvent_notready(self):
|
||||
options = DummyOptions()
|
||||
from supervisor.states import ProcessStates
|
||||
--
|
||||
2.1.4
|
||||
|
2
src/supervisor/patch/series
Normal file
2
src/supervisor/patch/series
Normal file
@ -0,0 +1,2 @@
|
||||
0001-Prevent-excessive-hanging-delays-in-process-state-tr.patch
|
||||
0002-Add-unit-test-cases-to-ensure-state-transitions-work.patch
|
Loading…
Reference in New Issue
Block a user