Loading include/linux/oom.h +6 −2 Original line number Diff line number Diff line Loading @@ -123,14 +123,18 @@ extern void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask); #ifdef CONFIG_HAVE_USERSPACE_LOW_MEMORY_KILLER extern bool should_ulmk_retry(void); extern bool should_ulmk_retry(gfp_t gfp); extern void ulmk_update_last_kill(void); extern void ulmk_watchdog_fn(struct timer_list *t); extern void ulmk_watchdog_pet(struct timer_list *t); #else static inline bool should_ulmk_retry(void) static inline bool should_ulmk_retry(gfp_t gfp) { return false; } static inline void ulmk_update_last_kill(void) {} static inline void ulmk_watchdog_fn(struct timer_list *t) {} static inline void ulmk_watchdog_pet(struct timer_list *t) {} #endif /* sysctls */ Loading include/linux/psi.h +5 −0 Original line number Diff line number Diff line Loading @@ -24,6 +24,7 @@ void psi_memstall_leave(unsigned long *flags); int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); void psi_emergency_trigger(void); bool psi_is_trigger_active(void); #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgrp); Loading @@ -46,6 +47,10 @@ static inline void psi_memstall_enter(unsigned long *flags) {} static inline void psi_memstall_leave(unsigned long *flags) {} static inline void psi_emergency_trigger(void){} static inline bool psi_is_trigger_active(void) { return false; } #ifdef CONFIG_CGROUPS static inline int psi_cgroup_alloc(struct cgroup *cgrp) Loading include/linux/psi_types.h +2 −0 Original line number Diff line number Diff line Loading @@ -6,6 +6,7 @@ #include <linux/types.h> #include <linux/kref.h> #include <linux/wait.h> #include <linux/timer.h> #ifdef CONFIG_PSI Loading Loading @@ -126,6 +127,7 @@ struct psi_trigger { /* Task that created the trigger */ char comm[TASK_COMM_LEN]; struct timer_list wdog_timer; }; struct psi_group { Loading kernel/sched/psi.c +52 −3 Original line number Diff line number Diff line Loading @@ -575,8 +575,12 @@ static 
u64 update_triggers(struct psi_group *group, u64 now) trace_psi_event(t->state, t->threshold); /* Generate an event */ if (cmpxchg(&t->event, 0, 1) == 0) if (cmpxchg(&t->event, 0, 1) == 0) { if (!strcmp(t->comm, ULMK_MAGIC)) mod_timer(&t->wdog_timer, jiffies + nsecs_to_jiffies(2 * t->win.size)); wake_up_interruptible(&t->event_wait); } t->last_event_time = now; } Loading @@ -588,10 +592,14 @@ static u64 update_triggers(struct psi_group *group, u64 now) return now + group->poll_min_period; } /* * Allows sending more than one event per window. */ void psi_emergency_trigger(void) { struct psi_group *group = &psi_system; struct psi_trigger *t; u64 now; if (static_branch_likely(&psi_disabled)) return; Loading @@ -603,18 +611,54 @@ void psi_emergency_trigger(void) if (!mutex_trylock(&group->trigger_lock)) return; now = sched_clock(); list_for_each_entry(t, &group->triggers, node) { if (strcmp(t->comm, ULMK_MAGIC)) continue; trace_psi_event(t->state, t->threshold); /* Generate an event */ if (cmpxchg(&t->event, 0, 1) == 0) if (cmpxchg(&t->event, 0, 1) == 0) { mod_timer(&t->wdog_timer, (unsigned long)t->win.size); wake_up_interruptible(&t->event_wait); } t->last_event_time = now; } mutex_unlock(&group->trigger_lock); } /* * Return true if any trigger is active. */ bool psi_is_trigger_active(void) { struct psi_group *group = &psi_system; struct psi_trigger *t; bool trigger_active = false; u64 now; if (static_branch_likely(&psi_disabled)) return false; /* * In unlikely case that OOM was triggered while adding/ * removing triggers. */ if (!mutex_trylock(&group->trigger_lock)) return true; now = sched_clock(); list_for_each_entry(t, &group->triggers, node) { if (strcmp(t->comm, ULMK_MAGIC)) continue; if (now <= t->last_event_time + t->win.size) trigger_active = true; } mutex_unlock(&group->trigger_lock); return trigger_active; } /* * Schedule polling if it's not already scheduled. 
It's safe to call even from * hotpath because even though kthread_queue_delayed_work takes worker->lock Loading Loading @@ -1116,6 +1160,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, init_waitqueue_head(&t->event_wait); kref_init(&t->refcount); get_task_comm(t->comm, current); timer_setup(&t->wdog_timer, ulmk_watchdog_fn, TIMER_DEFERRABLE); mutex_lock(&group->trigger_lock); Loading Loading @@ -1188,6 +1233,7 @@ static void psi_trigger_destroy(struct kref *ref) } } del_timer_sync(&t->wdog_timer); mutex_unlock(&group->trigger_lock); /* Loading Loading @@ -1241,8 +1287,11 @@ __poll_t psi_trigger_poll(void **trigger_ptr, poll_wait(file, &t->event_wait, wait); if (cmpxchg(&t->event, 1, 0) == 1) if (cmpxchg(&t->event, 1, 0) == 1) { ret |= EPOLLPRI; if (!strcmp(t->comm, ULMK_MAGIC)) ulmk_watchdog_pet(&t->wdog_timer); } kref_put(&t->refcount, psi_trigger_destroy); Loading mm/oom_kill.c +75 −30 Original line number Diff line number Diff line Loading @@ -77,8 +77,21 @@ DEFINE_MUTEX(oom_lock); */ #ifdef CONFIG_HAVE_USERSPACE_LOW_MEMORY_KILLER /* The maximum amount of time to loop in should_ulmk_retry() */ #define ULMK_TIMEOUT (20 * HZ) #define ULMK_DBG_POLICY_TRIGGER (BIT(0)) #define ULMK_DBG_POLICY_WDOG (BIT(1)) #define ULMK_DBG_POLICY_POSITIVE_ADJ (BIT(2)) #define ULMK_DBG_POLICY_ALL (BIT(3) - 1) static unsigned int ulmk_dbg_policy; module_param(ulmk_dbg_policy, uint, 0644); static atomic64_t ulmk_wdog_expired = ATOMIC64_INIT(0); static atomic64_t ulmk_kill_jiffies = ATOMIC64_INIT(INITIAL_JIFFIES); static unsigned long psi_emergency_jiffies = INITIAL_JIFFIES; /* Prevents contention on the mutex_trylock in psi_emergency_jiffies */ static DEFINE_MUTEX(ulmk_retry_lock); static bool ulmk_kill_possible(void) Loading @@ -105,50 +118,81 @@ static bool ulmk_kill_possible(void) } /* * psi_emergency_jiffies represents the last ULMK emergency event. * Give ULMK a 2 second window to handle this event. * If ULMK has made some progress since then, send another. 
* Repeat as necessary. * If CONFIG_DEBUG_PANIC_ON_OOM is enabled, attempt to determine *why* * we are in this state. * 1) No events were sent by PSI to userspace * 2) PSI sent an event to userspace, but userspace was not able to * receive the event. Possible causes of this include waiting for a * mutex which is held by a process in direct reclaim. Or the userspace * component has crashed. * 3) Userspace received the event, but decided not to kill anything.
*/ if (time_after(psi_emergency_jiffies + 20 * HZ, now) && ulmk_kill_possible()) { } else if (time_after(now, psi_emergency_jiffies + ULMK_TIMEOUT)) { ret = false; } else if (!trigger_active) { BUG_ON(ulmk_dbg_policy & ULMK_DBG_POLICY_TRIGGER); psi_emergency_trigger(); ret = true; goto out; } else if (wdog_expired) { mutex_lock(&oom_lock); ret = out_of_memory(&oc); mutex_unlock(&oom_lock); BUG_ON(!ret && ulmk_dbg_policy & ULMK_DBG_POLICY_POSITIVE_ADJ); } else if (!ulmk_kill_possible()) { BUG_ON(ulmk_dbg_policy & ULMK_DBG_POLICY_POSITIVE_ADJ); ret = false; } out: mutex_unlock(&ulmk_retry_lock); return ret; } void ulmk_watchdog_fn(struct timer_list *t) { atomic64_set(&ulmk_wdog_expired, 1); BUG_ON(ulmk_dbg_policy & ULMK_DBG_POLICY_WDOG); } void ulmk_watchdog_pet(struct timer_list *t) { del_timer_sync(t); atomic64_set(&ulmk_wdog_expired, 0); } void ulmk_update_last_kill(void) { atomic64_set(&ulmk_kill_jiffies, jiffies); Loading Loading @@ -1143,7 +1187,7 @@ static void check_panic_on_oom(struct oom_control *oc, return; } /* Do not panic for oom kills triggered by sysrq */ if (is_sysrq_oom(oc)) if (is_sysrq_oom(oc) || oc->only_positive_adj) return; dump_header(oc, NULL); panic("Out of memory: %s panic_on_oom is enabled\n", Loading Loading @@ -1244,7 +1288,8 @@ bool out_of_memory(struct oom_control *oc) * system level, we cannot survive this and will enter * an endless loop in the allocator. Bail out now. */ if (!is_sysrq_oom(oc) && !is_memcg_oom(oc)) if (!is_sysrq_oom(oc) && !is_memcg_oom(oc) && !oc->only_positive_adj) panic("System is deadlocked on memory\n"); } if (oc->chosen && oc->chosen != (void *)-1UL) Loading Loading
include/linux/oom.h +6 −2 Original line number Diff line number Diff line Loading @@ -123,14 +123,18 @@ extern void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask); #ifdef CONFIG_HAVE_USERSPACE_LOW_MEMORY_KILLER extern bool should_ulmk_retry(void); extern bool should_ulmk_retry(gfp_t gfp); extern void ulmk_update_last_kill(void); extern void ulmk_watchdog_fn(struct timer_list *t); extern void ulmk_watchdog_pet(struct timer_list *t); #else static inline bool should_ulmk_retry(void) static inline bool should_ulmk_retry(gfp_t gfp) { return false; } static inline void ulmk_update_last_kill(void) {} static inline void ulmk_watchdog_fn(struct timer_list *t) {} static inline void ulmk_watchdog_pet(struct timer_list *t) {} #endif /* sysctls */ Loading
include/linux/psi.h +5 −0 Original line number Diff line number Diff line Loading @@ -24,6 +24,7 @@ void psi_memstall_leave(unsigned long *flags); int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); void psi_emergency_trigger(void); bool psi_is_trigger_active(void); #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgrp); Loading @@ -46,6 +47,10 @@ static inline void psi_memstall_enter(unsigned long *flags) {} static inline void psi_memstall_leave(unsigned long *flags) {} static inline void psi_emergency_trigger(void){} static inline bool psi_is_trigger_active(void) { return false; } #ifdef CONFIG_CGROUPS static inline int psi_cgroup_alloc(struct cgroup *cgrp) Loading
include/linux/psi_types.h +2 −0 Original line number Diff line number Diff line Loading @@ -6,6 +6,7 @@ #include <linux/types.h> #include <linux/kref.h> #include <linux/wait.h> #include <linux/timer.h> #ifdef CONFIG_PSI Loading Loading @@ -126,6 +127,7 @@ struct psi_trigger { /* Task that created the trigger */ char comm[TASK_COMM_LEN]; struct timer_list wdog_timer; }; struct psi_group { Loading
kernel/sched/psi.c +52 −3 Original line number Diff line number Diff line Loading @@ -575,8 +575,12 @@ static u64 update_triggers(struct psi_group *group, u64 now) trace_psi_event(t->state, t->threshold); /* Generate an event */ if (cmpxchg(&t->event, 0, 1) == 0) if (cmpxchg(&t->event, 0, 1) == 0) { if (!strcmp(t->comm, ULMK_MAGIC)) mod_timer(&t->wdog_timer, jiffies + nsecs_to_jiffies(2 * t->win.size)); wake_up_interruptible(&t->event_wait); } t->last_event_time = now; } Loading @@ -588,10 +592,14 @@ static u64 update_triggers(struct psi_group *group, u64 now) return now + group->poll_min_period; } /* * Allows sending more than one event per window. */ void psi_emergency_trigger(void) { struct psi_group *group = &psi_system; struct psi_trigger *t; u64 now; if (static_branch_likely(&psi_disabled)) return; Loading @@ -603,18 +611,54 @@ void psi_emergency_trigger(void) if (!mutex_trylock(&group->trigger_lock)) return; now = sched_clock(); list_for_each_entry(t, &group->triggers, node) { if (strcmp(t->comm, ULMK_MAGIC)) continue; trace_psi_event(t->state, t->threshold); /* Generate an event */ if (cmpxchg(&t->event, 0, 1) == 0) if (cmpxchg(&t->event, 0, 1) == 0) { mod_timer(&t->wdog_timer, (unsigned long)t->win.size); wake_up_interruptible(&t->event_wait); } t->last_event_time = now; } mutex_unlock(&group->trigger_lock); } /* * Return true if any trigger is active. */ bool psi_is_trigger_active(void) { struct psi_group *group = &psi_system; struct psi_trigger *t; bool trigger_active = false; u64 now; if (static_branch_likely(&psi_disabled)) return false; /* * In unlikely case that OOM was triggered while adding/ * removing triggers. 
*/ if (!mutex_trylock(&group->trigger_lock)) return true; now = sched_clock(); list_for_each_entry(t, &group->triggers, node) { if (strcmp(t->comm, ULMK_MAGIC)) continue; if (now <= t->last_event_time + t->win.size) trigger_active = true; } mutex_unlock(&group->trigger_lock); return trigger_active; } /* * Schedule polling if it's not already scheduled. It's safe to call even from * hotpath because even though kthread_queue_delayed_work takes worker->lock Loading Loading @@ -1116,6 +1160,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, init_waitqueue_head(&t->event_wait); kref_init(&t->refcount); get_task_comm(t->comm, current); timer_setup(&t->wdog_timer, ulmk_watchdog_fn, TIMER_DEFERRABLE); mutex_lock(&group->trigger_lock); Loading Loading @@ -1188,6 +1233,7 @@ static void psi_trigger_destroy(struct kref *ref) } } del_timer_sync(&t->wdog_timer); mutex_unlock(&group->trigger_lock); /* Loading Loading @@ -1241,8 +1287,11 @@ __poll_t psi_trigger_poll(void **trigger_ptr, poll_wait(file, &t->event_wait, wait); if (cmpxchg(&t->event, 1, 0) == 1) if (cmpxchg(&t->event, 1, 0) == 1) { ret |= EPOLLPRI; if (!strcmp(t->comm, ULMK_MAGIC)) ulmk_watchdog_pet(&t->wdog_timer); } kref_put(&t->refcount, psi_trigger_destroy); Loading
mm/oom_kill.c +75 −30 Original line number Diff line number Diff line Loading @@ -77,8 +77,21 @@ DEFINE_MUTEX(oom_lock); */ #ifdef CONFIG_HAVE_USERSPACE_LOW_MEMORY_KILLER /* The maximum amount of time to loop in should_ulmk_retry() */ #define ULMK_TIMEOUT (20 * HZ) #define ULMK_DBG_POLICY_TRIGGER (BIT(0)) #define ULMK_DBG_POLICY_WDOG (BIT(1)) #define ULMK_DBG_POLICY_POSITIVE_ADJ (BIT(2)) #define ULMK_DBG_POLICY_ALL (BIT(3) - 1) static unsigned int ulmk_dbg_policy; module_param(ulmk_dbg_policy, uint, 0644); static atomic64_t ulmk_wdog_expired = ATOMIC64_INIT(0); static atomic64_t ulmk_kill_jiffies = ATOMIC64_INIT(INITIAL_JIFFIES); static unsigned long psi_emergency_jiffies = INITIAL_JIFFIES; /* Prevents contention on the mutex_trylock in psi_emergency_jiffies */ static DEFINE_MUTEX(ulmk_retry_lock); static bool ulmk_kill_possible(void) Loading @@ -105,50 +118,81 @@ static bool ulmk_kill_possible(void) } /* * psi_emergency_jiffies represents the last ULMK emergency event. * Give ULMK a 2 second window to handle this event. * If ULMK has made some progress since then, send another. * Repeat as necessary. * If CONFIG_DEBUG_PANIC_ON_OOM is enabled, attempt to determine *why* * we are in this state. * 1) No events were sent by PSI to userspace * 2) PSI sent an event to userspace, but userspace was not able to * receive the event. Possible causes of this include waiting for a * mutex which is held by a process in direct relcaim. Or the userspace * component has crashed. * 3) Userspace received the event, but decided not to kill anything. 
*/ bool should_ulmk_retry(void) bool should_ulmk_retry(gfp_t gfp_mask) { unsigned long now, last_kill; bool ret = false; bool ret = true; bool wdog_expired, trigger_active; struct oom_control oc = { .zonelist = node_zonelist(first_memory_node, gfp_mask), .nodemask = NULL, .memcg = NULL, .gfp_mask = gfp_mask, .order = 0, /* Also causes check_panic_on_oom not to panic */ .only_positive_adj = true, }; if (!sysctl_panic_on_oom) return false; if (gfp_mask & __GFP_RETRY_MAYFAIL) return false; /* Someone else is already checking. */ if (!mutex_trylock(&ulmk_retry_lock)) return true; mutex_lock(&ulmk_retry_lock); now = jiffies; last_kill = atomic64_read(&ulmk_kill_jiffies); if (time_before(now, psi_emergency_jiffies + 2 * HZ)) { ret = true; goto out; } wdog_expired = atomic64_read(&ulmk_wdog_expired); trigger_active = psi_is_trigger_active(); if (time_after_eq(last_kill, psi_emergency_jiffies)) { if (time_after(last_kill, psi_emergency_jiffies)) { psi_emergency_jiffies = now; psi_emergency_trigger(); ret = true; goto out; } /* * We reached here means no kill have had happened since the last * emergency trigger for 2*HZ window. We can't derive the status * of the low memory killer here. So, before falling back to OOM, * check for any +ve adj tasks left in the system in repeat for * next 20*HZ. Indirectly the below logic also giving 20HZ window * for the first emergency trigger. 
*/ if (time_after(psi_emergency_jiffies + 20 * HZ, now) && ulmk_kill_possible()) { } else if (time_after(now, psi_emergency_jiffies + ULMK_TIMEOUT)) { ret = false; } else if (!trigger_active) { BUG_ON(ulmk_dbg_policy & ULMK_DBG_POLICY_TRIGGER); psi_emergency_trigger(); ret = true; goto out; } else if (wdog_expired) { mutex_lock(&oom_lock); ret = out_of_memory(&oc); mutex_unlock(&oom_lock); BUG_ON(!ret && ulmk_dbg_policy & ULMK_DBG_POLICY_POSITIVE_ADJ); } else if (!ulmk_kill_possible()) { BUG_ON(ulmk_dbg_policy & ULMK_DBG_POLICY_POSITIVE_ADJ); ret = false; } out: mutex_unlock(&ulmk_retry_lock); return ret; } void ulmk_watchdog_fn(struct timer_list *t) { atomic64_set(&ulmk_wdog_expired, 1); BUG_ON(ulmk_dbg_policy & ULMK_DBG_POLICY_WDOG); } void ulmk_watchdog_pet(struct timer_list *t) { del_timer_sync(t); atomic64_set(&ulmk_wdog_expired, 0); } void ulmk_update_last_kill(void) { atomic64_set(&ulmk_kill_jiffies, jiffies); Loading Loading @@ -1143,7 +1187,7 @@ static void check_panic_on_oom(struct oom_control *oc, return; } /* Do not panic for oom kills triggered by sysrq */ if (is_sysrq_oom(oc)) if (is_sysrq_oom(oc) || oc->only_positive_adj) return; dump_header(oc, NULL); panic("Out of memory: %s panic_on_oom is enabled\n", Loading Loading @@ -1244,7 +1288,8 @@ bool out_of_memory(struct oom_control *oc) * system level, we cannot survive this and will enter * an endless loop in the allocator. Bail out now. */ if (!is_sysrq_oom(oc) && !is_memcg_oom(oc)) if (!is_sysrq_oom(oc) && !is_memcg_oom(oc) && !oc->only_positive_adj) panic("System is deadlocked on memory\n"); } if (oc->chosen && oc->chosen != (void *)-1UL) Loading