Loading tools/edit_monitor/daemon_manager.py +53 −6 Original line number Diff line number Diff line Loading @@ -30,6 +30,7 @@ DEFAULT_MONITOR_INTERVAL_SECONDS = 5 DEFAULT_MEMORY_USAGE_THRESHOLD = 2000 DEFAULT_CPU_USAGE_THRESHOLD = 200 DEFAULT_REBOOT_TIMEOUT_SECONDS = 60 * 60 * 24 BLOCK_SIGN_FILE = "edit_monitor_block_sign" def default_daemon_target(): Loading Loading @@ -59,15 +60,19 @@ class DaemonManager: pid_file_dir = pathlib.Path(tempfile.gettempdir()).joinpath("edit_monitor") pid_file_dir.mkdir(parents=True, exist_ok=True) self.pid_file_path = self._get_pid_file_path(pid_file_dir) self.block_sign = pathlib.Path(tempfile.gettempdir()).joinpath( BLOCK_SIGN_FILE ) def start(self): """Writes the pidfile and starts the daemon proces.""" try: if self.block_sign.exists(): logging.warning("Block sign found, exiting...") return self._stop_any_existing_instance() self._write_pid_to_pidfile() self._start_daemon_process() except Exception as e: logging.exception("Failed to start daemon manager with error %s", e) def monitor_daemon( self, Loading @@ -82,6 +87,9 @@ class DaemonManager: process is still running and kill the process if the resource usage is above given thresholds. """ if not self.daemon_process: return logging.info("start monitoring daemon process %d.", self.daemon_process.pid) reboot_time = time.time() + reboot_timeout while self.daemon_process.is_alive(): Loading Loading @@ -150,6 +158,33 @@ class DaemonManager: logging.exception("Failed to reboot process with error: %s.", e) sys.exit(1) # Indicate an error occurred def cleanup(self): """Wipes out all edit monitor instances in the system. Stops all the existing edit monitor instances and place a block sign to prevent any edit monitor process to start. This method is only used in emergency case when there's something goes wrong with the edit monitor that requires immediate cleanup to prevent damanger to the system. """ logging.debug("Start cleaning up all existing instances.") try: # First places a block sign to prevent any edit monitor process to start. self.block_sign.touch() except (FileNotFoundError, PermissionError, OSError): logging.exception("Failed to place the block sign") # Finds and kills all the existing instances of edit monitor. existing_instances_pids = self._find_all_instances_pids() for pid in existing_instances_pids: logging.info( "Found existing edit monitor instance with pid %d, killing...", pid ) try: self._terminate_process(pid) except Exception: logging.exception("Failed to terminate process %d", pid) def _stop_any_existing_instance(self): if not self.pid_file_path.exists(): logging.debug("No existing instances.") Loading Loading @@ -300,3 +335,15 @@ class DaemonManager: stime = int(stats[14]) return (utime + stime) / os.sysconf(os.sysconf_names["SC_CLK_TCK"]) def _find_all_instances_pids(self) -> list[int]: pids = [] for file in os.listdir(self.pid_file_path.parent): if file.endswith(".lock"): try: with open(self.pid_file_path.parent.joinpath(file), "r") as f: pids.append(int(f.read().strip())) except (FileNotFoundError, IOError, ValueError, TypeError): logging.exception("Failed to get pid from file path: %s", file) return pids No newline at end of file tools/edit_monitor/daemon_manager_test.py +43 −31 Original line number Diff line number Diff line Loading @@ -27,6 +27,7 @@ import unittest from unittest import mock from edit_monitor import daemon_manager TEST_BINARY_FILE = '/path/to/test_binary' TEST_PID_FILE_PATH = ( '587239c2d1050afdf54512e2d799f3b929f86b43575eb3c7b4bab105dd9bd25e.lock' Loading Loading @@ -92,20 +93,10 @@ class DaemonManagerTest(unittest.TestCase): self.assert_run_simple_daemon_success() def test_start_success_with_existing_instance_running(self): # Create a long running subprocess p = multiprocessing.Process(target=long_running_daemon) p.start() # Create a pidfile with the subprocess pid pid_file_path_dir = pathlib.Path(self.working_dir.name).joinpath( 'edit_monitor' ) pid_file_path_dir.mkdir(parents=True, exist_ok=True) with open(pid_file_path_dir.joinpath(TEST_PID_FILE_PATH), 'w') as f: f.write(str(p.pid)) # Create a running daemon subprocess p = self._create_fake_deamon_process() self.assert_run_simple_daemon_success() p.terminate() def test_start_success_with_existing_instance_already_dead(self): # Create a pidfile with pid that does not exist. Loading @@ -129,6 +120,17 @@ class DaemonManagerTest(unittest.TestCase): self.assert_run_simple_daemon_success() existing_dm.stop() def test_start_return_directly_if_block_sign_exists(self): # Creates the block sign. pathlib.Path(self.working_dir.name).joinpath( daemon_manager.BLOCK_SIGN_FILE ).touch() dm = daemon_manager.DaemonManager(TEST_BINARY_FILE) dm.start() # Verify no daemon process is started. self.assertIsNone(dm.daemon_process) @mock.patch('os.kill') def test_start_failed_to_kill_existing_instance(self, mock_kill): mock_kill.side_effect = OSError('Unknown OSError') Loading @@ -139,12 +141,10 @@ class DaemonManagerTest(unittest.TestCase): with open(pid_file_path_dir.joinpath(TEST_PID_FILE_PATH), 'w') as f: f.write('123456') with self.assertRaises(OSError) as error: dm = daemon_manager.DaemonManager(TEST_BINARY_FILE) dm.start() # Verify no daemon process is started. self.assertIsNone(dm.daemon_process) def test_start_failed_to_write_pidfile(self): pid_file_path_dir = pathlib.Path(self.working_dir.name).joinpath( 'edit_monitor' Loading @@ -153,21 +153,17 @@ class DaemonManagerTest(unittest.TestCase): # Makes the directory read-only so write pidfile will fail. os.chmod(pid_file_path_dir, 0o555) with self.assertRaises(PermissionError) as error: dm = daemon_manager.DaemonManager(TEST_BINARY_FILE) dm.start() # Verifies no daemon process is started. self.assertIsNone(dm.daemon_process) def test_start_failed_to_start_daemon_process(self): with self.assertRaises(TypeError) as error: dm = daemon_manager.DaemonManager( TEST_BINARY_FILE, daemon_target='wrong_target', daemon_args=(1) ) dm.start() # Verifies no daemon process is started. self.assertIsNone(dm.daemon_process) def test_monitor_daemon_subprocess_killed_high_memory_usage(self): dm = daemon_manager.DaemonManager( TEST_BINARY_FILE, Loading Loading @@ -321,7 +317,7 @@ class DaemonManagerTest(unittest.TestCase): self._is_process_alive(child_pid), f'process {child_pid} still alive' ) def _get_child_processes(self, parent_pid): def _get_child_processes(self, parent_pid: int) -> list[int]: try: output = subprocess.check_output( ['ps', '-o', 'pid,ppid', '--no-headers'], text=True Loading @@ -336,7 +332,7 @@ class DaemonManagerTest(unittest.TestCase): except subprocess.CalledProcessError as e: self.fail(f'failed to get child process, error: {e}') def _is_process_alive(self, pid): def _is_process_alive(self, pid: int) -> bool: try: output = subprocess.check_output( ['ps', '-p', str(pid), '-o', 'state='], text=True Loading @@ -355,6 +351,22 @@ class DaemonManagerTest(unittest.TestCase): # process already terminated pass def _create_fake_deamon_process( self, name: str = '' ) -> multiprocessing.Process: # Create a long running subprocess p = multiprocessing.Process(target=long_running_daemon) p.start() # Create the pidfile with the subprocess pid pid_file_path_dir = pathlib.Path(self.working_dir.name).joinpath( 'edit_monitor' ) pid_file_path_dir.mkdir(parents=True, exist_ok=True) with open(pid_file_path_dir.joinpath(name + 'pid.lock'), 'w') as f: f.write(str(p.pid)) return p if __name__ == '__main__': unittest.main() Loading
tools/edit_monitor/daemon_manager.py +53 −6 Original line number Diff line number Diff line Loading @@ -30,6 +30,7 @@ DEFAULT_MONITOR_INTERVAL_SECONDS = 5 DEFAULT_MEMORY_USAGE_THRESHOLD = 2000 DEFAULT_CPU_USAGE_THRESHOLD = 200 DEFAULT_REBOOT_TIMEOUT_SECONDS = 60 * 60 * 24 BLOCK_SIGN_FILE = "edit_monitor_block_sign" def default_daemon_target(): Loading Loading @@ -59,15 +60,19 @@ class DaemonManager: pid_file_dir = pathlib.Path(tempfile.gettempdir()).joinpath("edit_monitor") pid_file_dir.mkdir(parents=True, exist_ok=True) self.pid_file_path = self._get_pid_file_path(pid_file_dir) self.block_sign = pathlib.Path(tempfile.gettempdir()).joinpath( BLOCK_SIGN_FILE ) def start(self): """Writes the pidfile and starts the daemon proces.""" try: if self.block_sign.exists(): logging.warning("Block sign found, exiting...") return self._stop_any_existing_instance() self._write_pid_to_pidfile() self._start_daemon_process() except Exception as e: logging.exception("Failed to start daemon manager with error %s", e) def monitor_daemon( self, Loading @@ -82,6 +87,9 @@ class DaemonManager: process is still running and kill the process if the resource usage is above given thresholds. """ if not self.daemon_process: return logging.info("start monitoring daemon process %d.", self.daemon_process.pid) reboot_time = time.time() + reboot_timeout while self.daemon_process.is_alive(): Loading Loading @@ -150,6 +158,33 @@ class DaemonManager: logging.exception("Failed to reboot process with error: %s.", e) sys.exit(1) # Indicate an error occurred def cleanup(self): """Wipes out all edit monitor instances in the system. Stops all the existing edit monitor instances and place a block sign to prevent any edit monitor process to start. This method is only used in emergency case when there's something goes wrong with the edit monitor that requires immediate cleanup to prevent damanger to the system. """ logging.debug("Start cleaning up all existing instances.") try: # First places a block sign to prevent any edit monitor process to start. self.block_sign.touch() except (FileNotFoundError, PermissionError, OSError): logging.exception("Failed to place the block sign") # Finds and kills all the existing instances of edit monitor. existing_instances_pids = self._find_all_instances_pids() for pid in existing_instances_pids: logging.info( "Found existing edit monitor instance with pid %d, killing...", pid ) try: self._terminate_process(pid) except Exception: logging.exception("Failed to terminate process %d", pid) def _stop_any_existing_instance(self): if not self.pid_file_path.exists(): logging.debug("No existing instances.") Loading Loading @@ -300,3 +335,15 @@ class DaemonManager: stime = int(stats[14]) return (utime + stime) / os.sysconf(os.sysconf_names["SC_CLK_TCK"]) def _find_all_instances_pids(self) -> list[int]: pids = [] for file in os.listdir(self.pid_file_path.parent): if file.endswith(".lock"): try: with open(self.pid_file_path.parent.joinpath(file), "r") as f: pids.append(int(f.read().strip())) except (FileNotFoundError, IOError, ValueError, TypeError): logging.exception("Failed to get pid from file path: %s", file) return pids No newline at end of file
tools/edit_monitor/daemon_manager_test.py +43 −31 Original line number Diff line number Diff line Loading @@ -27,6 +27,7 @@ import unittest from unittest import mock from edit_monitor import daemon_manager TEST_BINARY_FILE = '/path/to/test_binary' TEST_PID_FILE_PATH = ( '587239c2d1050afdf54512e2d799f3b929f86b43575eb3c7b4bab105dd9bd25e.lock' Loading Loading @@ -92,20 +93,10 @@ class DaemonManagerTest(unittest.TestCase): self.assert_run_simple_daemon_success() def test_start_success_with_existing_instance_running(self): # Create a long running subprocess p = multiprocessing.Process(target=long_running_daemon) p.start() # Create a pidfile with the subprocess pid pid_file_path_dir = pathlib.Path(self.working_dir.name).joinpath( 'edit_monitor' ) pid_file_path_dir.mkdir(parents=True, exist_ok=True) with open(pid_file_path_dir.joinpath(TEST_PID_FILE_PATH), 'w') as f: f.write(str(p.pid)) # Create a running daemon subprocess p = self._create_fake_deamon_process() self.assert_run_simple_daemon_success() p.terminate() def test_start_success_with_existing_instance_already_dead(self): # Create a pidfile with pid that does not exist. Loading @@ -129,6 +120,17 @@ class DaemonManagerTest(unittest.TestCase): self.assert_run_simple_daemon_success() existing_dm.stop() def test_start_return_directly_if_block_sign_exists(self): # Creates the block sign. pathlib.Path(self.working_dir.name).joinpath( daemon_manager.BLOCK_SIGN_FILE ).touch() dm = daemon_manager.DaemonManager(TEST_BINARY_FILE) dm.start() # Verify no daemon process is started. self.assertIsNone(dm.daemon_process) @mock.patch('os.kill') def test_start_failed_to_kill_existing_instance(self, mock_kill): mock_kill.side_effect = OSError('Unknown OSError') Loading @@ -139,12 +141,10 @@ class DaemonManagerTest(unittest.TestCase): with open(pid_file_path_dir.joinpath(TEST_PID_FILE_PATH), 'w') as f: f.write('123456') with self.assertRaises(OSError) as error: dm = daemon_manager.DaemonManager(TEST_BINARY_FILE) dm.start() # Verify no daemon process is started. self.assertIsNone(dm.daemon_process) def test_start_failed_to_write_pidfile(self): pid_file_path_dir = pathlib.Path(self.working_dir.name).joinpath( 'edit_monitor' Loading @@ -153,21 +153,17 @@ class DaemonManagerTest(unittest.TestCase): # Makes the directory read-only so write pidfile will fail. os.chmod(pid_file_path_dir, 0o555) with self.assertRaises(PermissionError) as error: dm = daemon_manager.DaemonManager(TEST_BINARY_FILE) dm.start() # Verifies no daemon process is started. self.assertIsNone(dm.daemon_process) def test_start_failed_to_start_daemon_process(self): with self.assertRaises(TypeError) as error: dm = daemon_manager.DaemonManager( TEST_BINARY_FILE, daemon_target='wrong_target', daemon_args=(1) ) dm.start() # Verifies no daemon process is started. self.assertIsNone(dm.daemon_process) def test_monitor_daemon_subprocess_killed_high_memory_usage(self): dm = daemon_manager.DaemonManager( TEST_BINARY_FILE, Loading Loading @@ -321,7 +317,7 @@ class DaemonManagerTest(unittest.TestCase): self._is_process_alive(child_pid), f'process {child_pid} still alive' ) def _get_child_processes(self, parent_pid): def _get_child_processes(self, parent_pid: int) -> list[int]: try: output = subprocess.check_output( ['ps', '-o', 'pid,ppid', '--no-headers'], text=True Loading @@ -336,7 +332,7 @@ class DaemonManagerTest(unittest.TestCase): except subprocess.CalledProcessError as e: self.fail(f'failed to get child process, error: {e}') def _is_process_alive(self, pid): def _is_process_alive(self, pid: int) -> bool: try: output = subprocess.check_output( ['ps', '-p', str(pid), '-o', 'state='], text=True Loading @@ -355,6 +351,22 @@ class DaemonManagerTest(unittest.TestCase): # process already terminated pass def _create_fake_deamon_process( self, name: str = '' ) -> multiprocessing.Process: # Create a long running subprocess p = multiprocessing.Process(target=long_running_daemon) p.start() # Create the pidfile with the subprocess pid pid_file_path_dir = pathlib.Path(self.working_dir.name).joinpath( 'edit_monitor' ) pid_file_path_dir.mkdir(parents=True, exist_ok=True) with open(pid_file_path_dir.joinpath(name + 'pid.lock'), 'w') as f: f.write(str(p.pid)) return p if __name__ == '__main__': unittest.main()