Loading tools/edit_monitor/daemon_manager.py +53 −5 Original line number Diff line number Diff line Loading @@ -13,17 +13,22 @@ # limitations under the License. import getpass import hashlib import logging import multiprocessing import os import pathlib import platform import signal import subprocess import sys import tempfile import time from atest.metrics import clearcut_client from atest.proto import clientanalytics_pb2 from proto import edit_event_pb2 DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 1 DEFAULT_MONITOR_INTERVAL_SECONDS = 5 Loading @@ -31,6 +36,9 @@ DEFAULT_MEMORY_USAGE_THRESHOLD = 2000 DEFAULT_CPU_USAGE_THRESHOLD = 200 DEFAULT_REBOOT_TIMEOUT_SECONDS = 60 * 60 * 24 BLOCK_SIGN_FILE = "edit_monitor_block_sign" # Enum of the Clearcut log source defined under # /google3/wireless/android/play/playlog/proto/log_source_enum.proto LOG_SOURCE = 2524 def default_daemon_target(): Loading @@ -46,11 +54,16 @@ class DaemonManager: binary_path: str, daemon_target: callable = default_daemon_target, daemon_args: tuple = (), cclient: clearcut_client.Clearcut | None = None, ): self.binary_path = binary_path self.daemon_target = daemon_target self.daemon_args = daemon_args self.cclient = cclient or clearcut_client.Clearcut(LOG_SOURCE) self.user_name = getpass.getuser() self.host_name = platform.node() self.source_root = os.environ.get("ANDROID_BUILD_TOP", "") self.pid = os.getpid() self.daemon_process = None Loading @@ -70,13 +83,20 @@ class DaemonManager: logging.warning("Block sign found, exiting...") return if self.binary_path.startswith('/google/cog/'): if self.binary_path.startswith("/google/cog/"): logging.warning("Edit monitor for cog is not supported, exiting...") return try: self._stop_any_existing_instance() self._write_pid_to_pidfile() self._start_daemon_process() except Exception as e: logging.exception("Failed to start daemon manager with error %s", e) self._send_error_event_to_clearcut( edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR ) raise e def monitor_daemon( self, Loading Loading @@ -118,6 +138,9 @@ class DaemonManager: logging.error( "Daemon process is consuming too much resource, killing..." ), self._send_error_event_to_clearcut( edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_RESOURCE_USAGE ) self._terminate_process(self.daemon_process.pid) logging.info( Loading @@ -143,6 +166,12 @@ class DaemonManager: logging.debug("Successfully stopped daemon manager.") except Exception as e: logging.exception("Failed to stop daemon manager with error %s", e) self._send_error_event_to_clearcut( edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR ) sys.exit(1) finally: self.cclient.flush_events() def reboot(self): """Reboots the current process. Loading @@ -164,6 +193,9 @@ class DaemonManager: os.execv(self.binary_path, sys.argv) except OSError as e: logging.exception("Failed to reboot process with error: %s.", e) self._send_error_event_to_clearcut( edit_event_pb2.EditEvent.FAILED_TO_REBOOT_EDIT_MONITOR ) sys.exit(1) # Indicate an error occurred def cleanup(self): Loading @@ -175,6 +207,7 @@ class DaemonManager: that requires immediate cleanup to prevent damanger to the system. """ logging.debug("Start cleaning up all existing instances.") self._send_error_event_to_clearcut(edit_event_pb2.EditEvent.FORCE_CLEANUP) try: # First places a block sign to prevent any edit monitor process to start. Loading Loading @@ -356,3 +389,18 @@ class DaemonManager: logging.exception("Failed to get pid from file path: %s", file) return pids def _send_error_event_to_clearcut(self, error_type): edit_monitor_error_event_proto = edit_event_pb2.EditEvent( user_name=self.user_name, host_name=self.host_name, source_root=self.source_root, ) edit_monitor_error_event_proto.edit_monitor_error_event.CopyFrom( edit_event_pb2.EditEvent.EditMonitorErrorEvent(error_type=error_type) ) log_event = clientanalytics_pb2.LogEvent( event_time_ms=int(time.time() * 1000), source_extension=edit_monitor_error_event_proto.SerializeToString(), ) self.cclient.log(log_event) tools/edit_monitor/daemon_manager_test.py +99 −21 Original line number Diff line number Diff line Loading @@ -26,6 +26,7 @@ import time import unittest from unittest import mock from edit_monitor import daemon_manager from proto import edit_event_pb2 TEST_BINARY_FILE = '/path/to/test_binary' Loading Loading @@ -133,7 +134,8 @@ class DaemonManagerTest(unittest.TestCase): def test_start_return_directly_if_in_cog_env(self): dm = daemon_manager.DaemonManager( '/google/cog/cloud/user/workspace/edit_monitor') '/google/cog/cloud/user/workspace/edit_monitor' ) dm.start() # Verify no daemon process is started. self.assertIsNone(dm.daemon_process) Loading @@ -148,9 +150,13 @@ class DaemonManagerTest(unittest.TestCase): with open(pid_file_path_dir.joinpath(TEST_PID_FILE_PATH), 'w') as f: f.write('123456') with self.assertRaises(OSError) as error: dm = daemon_manager.DaemonManager(TEST_BINARY_FILE) fake_cclient = FakeClearcutClient() with self.assertRaises(OSError): dm = daemon_manager.DaemonManager(TEST_BINARY_FILE, cclient=fake_cclient) dm.start() self._assert_error_event_logged( fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR ) def test_start_failed_to_write_pidfile(self): pid_file_path_dir = pathlib.Path(self.working_dir.name).joinpath( Loading @@ -160,40 +166,63 @@ class DaemonManagerTest(unittest.TestCase): # Makes the directory read-only so write pidfile will fail. os.chmod(pid_file_path_dir, 0o555) with self.assertRaises(PermissionError) as error: dm = daemon_manager.DaemonManager(TEST_BINARY_FILE) fake_cclient = FakeClearcutClient() with self.assertRaises(PermissionError): dm = daemon_manager.DaemonManager(TEST_BINARY_FILE, cclient=fake_cclient) dm.start() self._assert_error_event_logged( fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR ) def test_start_failed_to_start_daemon_process(self): with self.assertRaises(TypeError) as error: fake_cclient = FakeClearcutClient() with self.assertRaises(TypeError): dm = daemon_manager.DaemonManager( TEST_BINARY_FILE, daemon_target='wrong_target', daemon_args=(1) TEST_BINARY_FILE, daemon_target='wrong_target', daemon_args=(1), cclient=fake_cclient, ) dm.start() self._assert_error_event_logged( fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR ) def test_monitor_daemon_subprocess_killed_high_memory_usage(self): fake_cclient = FakeClearcutClient() dm = daemon_manager.DaemonManager( TEST_BINARY_FILE, daemon_target=memory_consume_daemon_target, daemon_args=(2,), cclient=fake_cclient, ) dm.start() dm.monitor_daemon(interval=1, memory_threshold=2) self.assertTrue(dm.max_memory_usage >= 2) self.assert_no_subprocess_running() self._assert_error_event_logged( fake_cclient, edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_RESOURCE_USAGE, ) def test_monitor_daemon_subprocess_killed_high_cpu_usage(self): fake_cclient = FakeClearcutClient() dm = daemon_manager.DaemonManager( TEST_BINARY_FILE, daemon_target=cpu_consume_daemon_target, daemon_args=(20,), cclient=fake_cclient, ) dm.start() dm.monitor_daemon(interval=1, cpu_threshold=20) self.assertTrue(dm.max_cpu_usage >= 20) self.assert_no_subprocess_running() self._assert_error_event_logged( fake_cclient, edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_RESOURCE_USAGE, ) @mock.patch('subprocess.check_output') def test_monitor_daemon_failed_does_not_matter(self, mock_output): Loading @@ -207,7 +236,8 @@ class DaemonManagerTest(unittest.TestCase): ) dm = daemon_manager.DaemonManager( binary_file.name, daemon_target=long_running_daemon binary_file.name, daemon_target=long_running_daemon, ) dm.start() dm.monitor_daemon(reboot_timeout=0.5) Loading @@ -226,28 +256,43 @@ class DaemonManagerTest(unittest.TestCase): @mock.patch('os.kill') def test_stop_failed_to_kill_daemon_process(self, mock_kill): mock_kill.side_effect = OSError('Unknown OSError') fake_cclient = FakeClearcutClient() dm = daemon_manager.DaemonManager( TEST_BINARY_FILE, daemon_target=long_running_daemon TEST_BINARY_FILE, daemon_target=long_running_daemon, cclient=fake_cclient, ) with self.assertRaises(SystemExit): dm.start() dm.stop() self.assertTrue(dm.daemon_process.is_alive()) self.assertTrue(dm.pid_file_path.exists()) self._assert_error_event_logged( fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR ) @mock.patch('os.remove') def test_stop_failed_to_remove_pidfile(self, mock_remove): mock_remove.side_effect = OSError('Unknown OSError') fake_cclient = FakeClearcutClient() dm = daemon_manager.DaemonManager( TEST_BINARY_FILE, daemon_target=long_running_daemon TEST_BINARY_FILE, daemon_target=long_running_daemon, cclient=fake_cclient, ) with self.assertRaises(SystemExit): dm.start() dm.stop() self.assert_no_subprocess_running() self.assertTrue(dm.pid_file_path.exists()) self._assert_error_event_logged( fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR ) @mock.patch('os.execv') def test_reboot_success(self, mock_execv): binary_file = tempfile.NamedTemporaryFile( Loading @@ -273,7 +318,7 @@ class DaemonManagerTest(unittest.TestCase): ) dm.start() with self.assertRaises(SystemExit) as cm: with self.assertRaises(SystemExit): dm.reboot() mock_execv.assert_not_called() self.assertEqual(cm.exception.code, 0) Loading @@ -281,18 +326,24 @@ class DaemonManagerTest(unittest.TestCase): @mock.patch('os.execv') def test_reboot_failed(self, mock_execv): mock_execv.side_effect = OSError('Unknown OSError') fake_cclient = FakeClearcutClient() binary_file = tempfile.NamedTemporaryFile( dir=self.working_dir.name, delete=False ) dm = daemon_manager.DaemonManager( binary_file.name, daemon_target=long_running_daemon binary_file.name, daemon_target=long_running_daemon, cclient=fake_cclient, ) dm.start() with self.assertRaises(SystemExit) as cm: with self.assertRaises(SystemExit): dm.reboot() self.assertEqual(cm.exception.code, 1) self._assert_error_event_logged( fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_REBOOT_EDIT_MONITOR ) def assert_run_simple_daemon_success(self): damone_output_file = tempfile.NamedTemporaryFile( Loading Loading @@ -374,6 +425,33 @@ class DaemonManagerTest(unittest.TestCase): f.write(str(p.pid)) return p def _assert_error_event_logged(self, fake_cclient, error_type): error_events = fake_cclient.get_sent_events() self.assertEquals(len(error_events), 1) self.assertEquals( edit_event_pb2.EditEvent.FromString( error_events[0].source_extension ).edit_monitor_error_event.error_type, error_type, ) class FakeClearcutClient: def __init__(self): self.pending_log_events = [] self.sent_log_event = [] def log(self, log_event): self.pending_log_events.append(log_event) def flush_events(self): self.sent_log_event.extend(self.pending_log_events) self.pending_log_events.clear() def get_sent_events(self): return self.sent_log_event + self.pending_log_events if __name__ == '__main__': unittest.main() Loading
tools/edit_monitor/daemon_manager.py +53 −5 Original line number Diff line number Diff line Loading @@ -13,17 +13,22 @@ # limitations under the License. import getpass import hashlib import logging import multiprocessing import os import pathlib import platform import signal import subprocess import sys import tempfile import time from atest.metrics import clearcut_client from atest.proto import clientanalytics_pb2 from proto import edit_event_pb2 DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 1 DEFAULT_MONITOR_INTERVAL_SECONDS = 5 Loading @@ -31,6 +36,9 @@ DEFAULT_MEMORY_USAGE_THRESHOLD = 2000 DEFAULT_CPU_USAGE_THRESHOLD = 200 DEFAULT_REBOOT_TIMEOUT_SECONDS = 60 * 60 * 24 BLOCK_SIGN_FILE = "edit_monitor_block_sign" # Enum of the Clearcut log source defined under # /google3/wireless/android/play/playlog/proto/log_source_enum.proto LOG_SOURCE = 2524 def default_daemon_target(): Loading @@ -46,11 +54,16 @@ class DaemonManager: binary_path: str, daemon_target: callable = default_daemon_target, daemon_args: tuple = (), cclient: clearcut_client.Clearcut | None = None, ): self.binary_path = binary_path self.daemon_target = daemon_target self.daemon_args = daemon_args self.cclient = cclient or clearcut_client.Clearcut(LOG_SOURCE) self.user_name = getpass.getuser() self.host_name = platform.node() self.source_root = os.environ.get("ANDROID_BUILD_TOP", "") self.pid = os.getpid() self.daemon_process = None Loading @@ -70,13 +83,20 @@ class DaemonManager: logging.warning("Block sign found, exiting...") return if self.binary_path.startswith('/google/cog/'): if self.binary_path.startswith("/google/cog/"): logging.warning("Edit monitor for cog is not supported, exiting...") return try: self._stop_any_existing_instance() self._write_pid_to_pidfile() self._start_daemon_process() except Exception as e: logging.exception("Failed to start daemon manager with error %s", e) self._send_error_event_to_clearcut( edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR ) raise e def monitor_daemon( self, Loading Loading @@ -118,6 +138,9 @@ class DaemonManager: logging.error( "Daemon process is consuming too much resource, killing..." ), self._send_error_event_to_clearcut( edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_RESOURCE_USAGE ) self._terminate_process(self.daemon_process.pid) logging.info( Loading @@ -143,6 +166,12 @@ class DaemonManager: logging.debug("Successfully stopped daemon manager.") except Exception as e: logging.exception("Failed to stop daemon manager with error %s", e) self._send_error_event_to_clearcut( edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR ) sys.exit(1) finally: self.cclient.flush_events() def reboot(self): """Reboots the current process. Loading @@ -164,6 +193,9 @@ class DaemonManager: os.execv(self.binary_path, sys.argv) except OSError as e: logging.exception("Failed to reboot process with error: %s.", e) self._send_error_event_to_clearcut( edit_event_pb2.EditEvent.FAILED_TO_REBOOT_EDIT_MONITOR ) sys.exit(1) # Indicate an error occurred def cleanup(self): Loading @@ -175,6 +207,7 @@ class DaemonManager: that requires immediate cleanup to prevent damanger to the system. """ logging.debug("Start cleaning up all existing instances.") self._send_error_event_to_clearcut(edit_event_pb2.EditEvent.FORCE_CLEANUP) try: # First places a block sign to prevent any edit monitor process to start. Loading Loading @@ -356,3 +389,18 @@ class DaemonManager: logging.exception("Failed to get pid from file path: %s", file) return pids def _send_error_event_to_clearcut(self, error_type): edit_monitor_error_event_proto = edit_event_pb2.EditEvent( user_name=self.user_name, host_name=self.host_name, source_root=self.source_root, ) edit_monitor_error_event_proto.edit_monitor_error_event.CopyFrom( edit_event_pb2.EditEvent.EditMonitorErrorEvent(error_type=error_type) ) log_event = clientanalytics_pb2.LogEvent( event_time_ms=int(time.time() * 1000), source_extension=edit_monitor_error_event_proto.SerializeToString(), ) self.cclient.log(log_event)
tools/edit_monitor/daemon_manager_test.py +99 −21 Original line number Diff line number Diff line Loading @@ -26,6 +26,7 @@ import time import unittest from unittest import mock from edit_monitor import daemon_manager from proto import edit_event_pb2 TEST_BINARY_FILE = '/path/to/test_binary' Loading Loading @@ -133,7 +134,8 @@ class DaemonManagerTest(unittest.TestCase): def test_start_return_directly_if_in_cog_env(self): dm = daemon_manager.DaemonManager( '/google/cog/cloud/user/workspace/edit_monitor') '/google/cog/cloud/user/workspace/edit_monitor' ) dm.start() # Verify no daemon process is started. self.assertIsNone(dm.daemon_process) Loading @@ -148,9 +150,13 @@ class DaemonManagerTest(unittest.TestCase): with open(pid_file_path_dir.joinpath(TEST_PID_FILE_PATH), 'w') as f: f.write('123456') with self.assertRaises(OSError) as error: dm = daemon_manager.DaemonManager(TEST_BINARY_FILE) fake_cclient = FakeClearcutClient() with self.assertRaises(OSError): dm = daemon_manager.DaemonManager(TEST_BINARY_FILE, cclient=fake_cclient) dm.start() self._assert_error_event_logged( fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR ) def test_start_failed_to_write_pidfile(self): pid_file_path_dir = pathlib.Path(self.working_dir.name).joinpath( Loading @@ -160,40 +166,63 @@ class DaemonManagerTest(unittest.TestCase): # Makes the directory read-only so write pidfile will fail. os.chmod(pid_file_path_dir, 0o555) with self.assertRaises(PermissionError) as error: dm = daemon_manager.DaemonManager(TEST_BINARY_FILE) fake_cclient = FakeClearcutClient() with self.assertRaises(PermissionError): dm = daemon_manager.DaemonManager(TEST_BINARY_FILE, cclient=fake_cclient) dm.start() self._assert_error_event_logged( fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR ) def test_start_failed_to_start_daemon_process(self): with self.assertRaises(TypeError) as error: fake_cclient = FakeClearcutClient() with self.assertRaises(TypeError): dm = daemon_manager.DaemonManager( TEST_BINARY_FILE, daemon_target='wrong_target', daemon_args=(1) TEST_BINARY_FILE, daemon_target='wrong_target', daemon_args=(1), cclient=fake_cclient, ) dm.start() self._assert_error_event_logged( fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR ) def test_monitor_daemon_subprocess_killed_high_memory_usage(self): fake_cclient = FakeClearcutClient() dm = daemon_manager.DaemonManager( TEST_BINARY_FILE, daemon_target=memory_consume_daemon_target, daemon_args=(2,), cclient=fake_cclient, ) dm.start() dm.monitor_daemon(interval=1, memory_threshold=2) self.assertTrue(dm.max_memory_usage >= 2) self.assert_no_subprocess_running() self._assert_error_event_logged( fake_cclient, edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_RESOURCE_USAGE, ) def test_monitor_daemon_subprocess_killed_high_cpu_usage(self): fake_cclient = FakeClearcutClient() dm = daemon_manager.DaemonManager( TEST_BINARY_FILE, daemon_target=cpu_consume_daemon_target, daemon_args=(20,), cclient=fake_cclient, ) dm.start() dm.monitor_daemon(interval=1, cpu_threshold=20) self.assertTrue(dm.max_cpu_usage >= 20) self.assert_no_subprocess_running() self._assert_error_event_logged( fake_cclient, edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_RESOURCE_USAGE, ) @mock.patch('subprocess.check_output') def test_monitor_daemon_failed_does_not_matter(self, mock_output): Loading @@ -207,7 +236,8 @@ class DaemonManagerTest(unittest.TestCase): ) dm = daemon_manager.DaemonManager( binary_file.name, daemon_target=long_running_daemon binary_file.name, daemon_target=long_running_daemon, ) dm.start() dm.monitor_daemon(reboot_timeout=0.5) Loading @@ -226,28 +256,43 @@ class DaemonManagerTest(unittest.TestCase): @mock.patch('os.kill') def test_stop_failed_to_kill_daemon_process(self, mock_kill): mock_kill.side_effect = OSError('Unknown OSError') fake_cclient = FakeClearcutClient() dm = daemon_manager.DaemonManager( TEST_BINARY_FILE, daemon_target=long_running_daemon TEST_BINARY_FILE, daemon_target=long_running_daemon, cclient=fake_cclient, ) with self.assertRaises(SystemExit): dm.start() dm.stop() self.assertTrue(dm.daemon_process.is_alive()) self.assertTrue(dm.pid_file_path.exists()) self._assert_error_event_logged( fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR ) @mock.patch('os.remove') def test_stop_failed_to_remove_pidfile(self, mock_remove): mock_remove.side_effect = OSError('Unknown OSError') fake_cclient = FakeClearcutClient() dm = daemon_manager.DaemonManager( TEST_BINARY_FILE, daemon_target=long_running_daemon TEST_BINARY_FILE, daemon_target=long_running_daemon, cclient=fake_cclient, ) with self.assertRaises(SystemExit): dm.start() dm.stop() self.assert_no_subprocess_running() self.assertTrue(dm.pid_file_path.exists()) self._assert_error_event_logged( fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR ) @mock.patch('os.execv') def test_reboot_success(self, mock_execv): binary_file = tempfile.NamedTemporaryFile( Loading @@ -273,7 +318,7 @@ class DaemonManagerTest(unittest.TestCase): ) dm.start() with self.assertRaises(SystemExit) as cm: with self.assertRaises(SystemExit): dm.reboot() mock_execv.assert_not_called() self.assertEqual(cm.exception.code, 0) Loading @@ -281,18 +326,24 @@ class DaemonManagerTest(unittest.TestCase): @mock.patch('os.execv') def test_reboot_failed(self, mock_execv): mock_execv.side_effect = OSError('Unknown OSError') fake_cclient = FakeClearcutClient() binary_file = tempfile.NamedTemporaryFile( dir=self.working_dir.name, delete=False ) dm = daemon_manager.DaemonManager( binary_file.name, daemon_target=long_running_daemon binary_file.name, daemon_target=long_running_daemon, cclient=fake_cclient, ) dm.start() with self.assertRaises(SystemExit) as cm: with self.assertRaises(SystemExit): dm.reboot() self.assertEqual(cm.exception.code, 1) self._assert_error_event_logged( fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_REBOOT_EDIT_MONITOR ) def assert_run_simple_daemon_success(self): damone_output_file = tempfile.NamedTemporaryFile( Loading Loading @@ -374,6 +425,33 @@ class DaemonManagerTest(unittest.TestCase): f.write(str(p.pid)) return p def _assert_error_event_logged(self, fake_cclient, error_type): error_events = fake_cclient.get_sent_events() self.assertEquals(len(error_events), 1) self.assertEquals( edit_event_pb2.EditEvent.FromString( error_events[0].source_extension ).edit_monitor_error_event.error_type, error_type, ) class FakeClearcutClient: def __init__(self): self.pending_log_events = [] self.sent_log_event = [] def log(self, log_event): self.pending_log_events.append(log_event) def flush_events(self): self.sent_log_event.extend(self.pending_log_events) self.pending_log_events.clear() def get_sent_events(self): return self.sent_log_event + self.pending_log_events if __name__ == '__main__': unittest.main()