Loading bootstat/bootstat.cpp +3 −0 Original line number Diff line number Diff line Loading @@ -304,6 +304,9 @@ const std::map<std::string, int32_t> kBootReasonMap = { {"kernel_panic,init", 158}, {"kernel_panic,oom", 159}, {"kernel_panic,stack", 160}, {"kernel_panic,sysrq,livelock,alarm", 161}, // llkd {"kernel_panic,sysrq,livelock,driver", 162}, // llkd {"kernel_panic,sysrq,livelock,zombie", 163}, // llkd }; // Converts a string value representing the reason the system booted to an Loading llkd/README.md +3 −8 Original line number Diff line number Diff line Loading @@ -53,7 +53,9 @@ on purpose, and llkd effectively sweeps up processes that create these conditions. If the test can, it will reconfigure llkd to expedite the test duration by adjusting the ro.llk.* Android properties. Tests run the D state with some scheduling progress to ensure that ABA checking prevents false triggers. triggers. If 100% reliable ABA on platform, then ro.llk.killtest can be set to false; however this will result in some of the unit tests to panic kernel instead of deal with more graceful kill operation. Android Properties ------------------ Loading Loading @@ -108,13 +110,6 @@ default <empty>, comma separated list of uid numbers or names. Architectural Concerns ---------------------- - Figure out how to communicate the kernel panic better to bootstat canonical boot reason determination. This may require an alteration to bootstat, or some logging from llkd. Would like to see boot reason to be watchdog,livelock as a minimum requirement. Or more specifically would want watchdog,livelock,device or watchdog,livelock,zombie be reported. Currently reports panic,sysrq (user requested panic) or panic depending on system support of pstore. - Create kernel module and associated gTest to actually test panic. - Create gTest to test out blacklist (ro.llk.blacklist.<properties> generally not be inputs). Could require more test-only interfaces to libllkd. 
Loading llkd/include/llkd.h +2 −0 Original line number Diff line number Diff line Loading @@ -37,6 +37,8 @@ unsigned llkCheckMilliseconds(void); #define KHT_ENABLE_PROPERTY "ro." KHT_ENABLE_WRITEABLE_PROPERTY #define LLK_MLOCKALL_PROPERTY "ro.llk.mlockall" #define LLK_MLOCKALL_DEFAULT true #define LLK_KILLTEST_PROPERTY "ro.llk.killtest" #define LLK_KILLTEST_DEFAULT true #define LLK_TIMEOUT_MS_PROPERTY "ro.llk.timeout_ms" #define KHT_TIMEOUT_PROPERTY "ro.khungtask.timeout" #define LLK_D_TIMEOUT_MS_PROPERTY "ro.llk.D.timeout_ms" Loading llkd/libllkd.cpp +14 −9 Original line number Diff line number Diff line Loading @@ -70,6 +70,7 @@ milliseconds llkCycle; // ms to next thread check bool llkEnable = LLK_ENABLE_DEFAULT; // llk daemon enabled bool llkRunning = false; // thread is running bool llkMlockall = LLK_MLOCKALL_DEFAULT; // run mlocked bool llkTestWithKill = LLK_KILLTEST_DEFAULT; // issue test kills milliseconds llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT; // default timeout enum { llkStateD, llkStateZ, llkNumStates }; // state indexes milliseconds llkStateTimeoutMs[llkNumStates]; // timeout override for each detection state Loading Loading @@ -292,7 +293,7 @@ struct proc { exeMissingValid(false), cmdlineValid(false), updated(true), killed(false) { killed(!llkTestWithKill) { memset(comm, '\0', sizeof(comm)); setComm(_comm); } Loading Loading @@ -475,8 +476,8 @@ bool llkWriteStringToFileConfirm(const std::string& string, const std::string& f return android::base::Trim(content) == string; } void llkPanicKernel(bool dump, pid_t tid) __noreturn; void llkPanicKernel(bool dump, pid_t tid) { void llkPanicKernel(bool dump, pid_t tid, const char* state) __noreturn; void llkPanicKernel(bool dump, pid_t tid, const char* state) { auto sysrqTriggerFd = llkFileToWriteFd("/proc/sysrq-trigger"); if (sysrqTriggerFd < 0) { // DYB Loading @@ -496,6 +497,8 @@ void llkPanicKernel(bool dump, pid_t tid) { } ::usleep(200000); // let everything settle } llkWriteStringToFile(std::string("SysRq 
: Trigger a crash : 'livelock,") + state + "'\n", "/dev/kmsg"); android::base::WriteStringToFd("c", sysrqTriggerFd); // NOTREACHED // DYB Loading @@ -507,7 +510,7 @@ void llkPanicKernel(bool dump, pid_t tid) { } void llkAlarmHandler(int) { llkPanicKernel(false, ::getpid()); llkPanicKernel(false, ::getpid(), "alarm"); } milliseconds GetUintProperty(const std::string& key, milliseconds def) { Loading Loading @@ -686,7 +689,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) { (val != procp->nrSwitches)) { procp->nrSwitches = val; procp->count = 0ms; procp->killed = false; procp->killed = !llkTestWithKill; } return; } Loading @@ -700,7 +703,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) { if (schedUpdate != procp->schedUpdate) { procp->schedUpdate = schedUpdate; procp->count = 0ms; procp->killed = false; procp->killed = !llkTestWithKill; } } Loading @@ -709,7 +712,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) { if (static_cast<uint64_t>(val) != procp->nrSwitches) { procp->nrSwitches = val; procp->count = 0ms; procp->killed = false; procp->killed = !llkTestWithKill; } } } Loading @@ -719,6 +722,7 @@ void llkLogConfig(void) { << LLK_ENABLE_PROPERTY "=" << llkFormat(llkEnable) << "\n" << KHT_ENABLE_PROPERTY "=" << llkFormat(khtEnable) << "\n" << LLK_MLOCKALL_PROPERTY "=" << llkFormat(llkMlockall) << "\n" << LLK_KILLTEST_PROPERTY "=" << llkFormat(llkTestWithKill) << "\n" << KHT_TIMEOUT_PROPERTY "=" << llkFormat(khtTimeout) << "\n" << LLK_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkTimeoutMs) << "\n" << LLK_D_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateD]) << "\n" Loading Loading @@ -869,7 +873,7 @@ milliseconds llkCheck(bool checkRunning) { procp->time = utime + stime; if (procp->state != state) { procp->count = 0ms; procp->killed = false; procp->killed = !llkTestWithKill; procp->state = state; } else { procp->count += llkCycle; Loading Loading @@ -973,7 +977,7 @@ milliseconds llkCheck(bool 
checkRunning) { // We are here because we have confirmed kernel live-lock LOG(ERROR) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->" << pid << "->" << tid << ' ' << procp->getComm() << " [panic]"; llkPanicKernel(true, tid); llkPanicKernel(true, tid, (state == 'Z') ? "zombie" : "driver"); } LOG(VERBOSE) << "+closedir()"; } Loading Loading @@ -1045,6 +1049,7 @@ bool llkInit(const char* threadname) { } khtEnable = android::base::GetBoolProperty(KHT_ENABLE_PROPERTY, khtEnable); llkMlockall = android::base::GetBoolProperty(LLK_MLOCKALL_PROPERTY, llkMlockall); llkTestWithKill = android::base::GetBoolProperty(LLK_KILLTEST_PROPERTY, llkTestWithKill); // if LLK_TIMOUT_MS_PROPERTY was not set, we will use a set // KHT_TIMEOUT_PROPERTY as co-operative guidance for the default value. khtTimeout = GetUintProperty(KHT_TIMEOUT_PROPERTY, khtTimeout); Loading llkd/llkd.rc +1 −0 Original line number Diff line number Diff line Loading @@ -44,5 +44,6 @@ service llkd /system/bin/llkd user llkd group llkd readproc capabilities KILL IPC_LOCK file /dev/kmsg w file /proc/sysrq-trigger w writepid /dev/cpuset/system-background/tasks Loading
bootstat/bootstat.cpp +3 −0 Original line number Diff line number Diff line Loading @@ -304,6 +304,9 @@ const std::map<std::string, int32_t> kBootReasonMap = { {"kernel_panic,init", 158}, {"kernel_panic,oom", 159}, {"kernel_panic,stack", 160}, {"kernel_panic,sysrq,livelock,alarm", 161}, // llkd {"kernel_panic,sysrq,livelock,driver", 162}, // llkd {"kernel_panic,sysrq,livelock,zombie", 163}, // llkd }; // Converts a string value representing the reason the system booted to an Loading
llkd/README.md +3 −8 Original line number Diff line number Diff line Loading @@ -53,7 +53,9 @@ on purpose, and llkd effectively sweeps up processes that create these conditions. If the test can, it will reconfigure llkd to expedite the test duration by adjusting the ro.llk.* Android properties. Tests run the D state with some scheduling progress to ensure that ABA checking prevents false triggers. triggers. If ABA checking is 100% reliable on the platform, then ro.llk.killtest can be set to false; however, this will cause some of the unit tests to panic the kernel instead of using the more graceful kill operation. Android Properties ------------------ Loading Loading @@ -108,13 +110,6 @@ default <empty>, comma separated list of uid numbers or names. Architectural Concerns ---------------------- - Figure out how to communicate the kernel panic better to bootstat canonical boot reason determination. This may require an alteration to bootstat, or some logging from llkd. Would like to see boot reason to be watchdog,livelock as a minimum requirement. Or more specifically would want watchdog,livelock,device or watchdog,livelock,zombie be reported. Currently reports panic,sysrq (user requested panic) or panic depending on system support of pstore. - Create kernel module and associated gTest to actually test panic. - Create gTest to test out blacklist (ro.llk.blacklist.<properties> generally not be inputs). Could require more test-only interfaces to libllkd. Loading
llkd/include/llkd.h +2 −0 Original line number Diff line number Diff line Loading @@ -37,6 +37,8 @@ unsigned llkCheckMilliseconds(void); #define KHT_ENABLE_PROPERTY "ro." KHT_ENABLE_WRITEABLE_PROPERTY #define LLK_MLOCKALL_PROPERTY "ro.llk.mlockall" #define LLK_MLOCKALL_DEFAULT true #define LLK_KILLTEST_PROPERTY "ro.llk.killtest" #define LLK_KILLTEST_DEFAULT true #define LLK_TIMEOUT_MS_PROPERTY "ro.llk.timeout_ms" #define KHT_TIMEOUT_PROPERTY "ro.khungtask.timeout" #define LLK_D_TIMEOUT_MS_PROPERTY "ro.llk.D.timeout_ms" Loading
llkd/libllkd.cpp +14 −9 Original line number Diff line number Diff line Loading @@ -70,6 +70,7 @@ milliseconds llkCycle; // ms to next thread check bool llkEnable = LLK_ENABLE_DEFAULT; // llk daemon enabled bool llkRunning = false; // thread is running bool llkMlockall = LLK_MLOCKALL_DEFAULT; // run mlocked bool llkTestWithKill = LLK_KILLTEST_DEFAULT; // issue test kills milliseconds llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT; // default timeout enum { llkStateD, llkStateZ, llkNumStates }; // state indexes milliseconds llkStateTimeoutMs[llkNumStates]; // timeout override for each detection state Loading Loading @@ -292,7 +293,7 @@ struct proc { exeMissingValid(false), cmdlineValid(false), updated(true), killed(false) { killed(!llkTestWithKill) { memset(comm, '\0', sizeof(comm)); setComm(_comm); } Loading Loading @@ -475,8 +476,8 @@ bool llkWriteStringToFileConfirm(const std::string& string, const std::string& f return android::base::Trim(content) == string; } void llkPanicKernel(bool dump, pid_t tid) __noreturn; void llkPanicKernel(bool dump, pid_t tid) { void llkPanicKernel(bool dump, pid_t tid, const char* state) __noreturn; void llkPanicKernel(bool dump, pid_t tid, const char* state) { auto sysrqTriggerFd = llkFileToWriteFd("/proc/sysrq-trigger"); if (sysrqTriggerFd < 0) { // DYB Loading @@ -496,6 +497,8 @@ void llkPanicKernel(bool dump, pid_t tid) { } ::usleep(200000); // let everything settle } llkWriteStringToFile(std::string("SysRq : Trigger a crash : 'livelock,") + state + "'\n", "/dev/kmsg"); android::base::WriteStringToFd("c", sysrqTriggerFd); // NOTREACHED // DYB Loading @@ -507,7 +510,7 @@ void llkPanicKernel(bool dump, pid_t tid) { } void llkAlarmHandler(int) { llkPanicKernel(false, ::getpid()); llkPanicKernel(false, ::getpid(), "alarm"); } milliseconds GetUintProperty(const std::string& key, milliseconds def) { Loading Loading @@ -686,7 +689,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) { (val != procp->nrSwitches)) { 
procp->nrSwitches = val; procp->count = 0ms; procp->killed = false; procp->killed = !llkTestWithKill; } return; } Loading @@ -700,7 +703,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) { if (schedUpdate != procp->schedUpdate) { procp->schedUpdate = schedUpdate; procp->count = 0ms; procp->killed = false; procp->killed = !llkTestWithKill; } } Loading @@ -709,7 +712,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) { if (static_cast<uint64_t>(val) != procp->nrSwitches) { procp->nrSwitches = val; procp->count = 0ms; procp->killed = false; procp->killed = !llkTestWithKill; } } } Loading @@ -719,6 +722,7 @@ void llkLogConfig(void) { << LLK_ENABLE_PROPERTY "=" << llkFormat(llkEnable) << "\n" << KHT_ENABLE_PROPERTY "=" << llkFormat(khtEnable) << "\n" << LLK_MLOCKALL_PROPERTY "=" << llkFormat(llkMlockall) << "\n" << LLK_KILLTEST_PROPERTY "=" << llkFormat(llkTestWithKill) << "\n" << KHT_TIMEOUT_PROPERTY "=" << llkFormat(khtTimeout) << "\n" << LLK_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkTimeoutMs) << "\n" << LLK_D_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateD]) << "\n" Loading Loading @@ -869,7 +873,7 @@ milliseconds llkCheck(bool checkRunning) { procp->time = utime + stime; if (procp->state != state) { procp->count = 0ms; procp->killed = false; procp->killed = !llkTestWithKill; procp->state = state; } else { procp->count += llkCycle; Loading Loading @@ -973,7 +977,7 @@ milliseconds llkCheck(bool checkRunning) { // We are here because we have confirmed kernel live-lock LOG(ERROR) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->" << pid << "->" << tid << ' ' << procp->getComm() << " [panic]"; llkPanicKernel(true, tid); llkPanicKernel(true, tid, (state == 'Z') ? 
"zombie" : "driver"); } LOG(VERBOSE) << "+closedir()"; } Loading Loading @@ -1045,6 +1049,7 @@ bool llkInit(const char* threadname) { } khtEnable = android::base::GetBoolProperty(KHT_ENABLE_PROPERTY, khtEnable); llkMlockall = android::base::GetBoolProperty(LLK_MLOCKALL_PROPERTY, llkMlockall); llkTestWithKill = android::base::GetBoolProperty(LLK_KILLTEST_PROPERTY, llkTestWithKill); // if LLK_TIMOUT_MS_PROPERTY was not set, we will use a set // KHT_TIMEOUT_PROPERTY as co-operative guidance for the default value. khtTimeout = GetUintProperty(KHT_TIMEOUT_PROPERTY, khtTimeout); Loading
llkd/llkd.rc +1 −0 Original line number Diff line number Diff line Loading @@ -44,5 +44,6 @@ service llkd /system/bin/llkd user llkd group llkd readproc capabilities KILL IPC_LOCK file /dev/kmsg w file /proc/sysrq-trigger w writepid /dev/cpuset/system-background/tasks