Loading mm/memory-failure.c +32 −41 Original line number Diff line number Diff line Loading @@ -7,21 +7,26 @@ * Free Software Foundation. * * High level machine check handler. Handles pages reported by the * hardware as being corrupted usually due to a 2bit ECC memory or cache * hardware as being corrupted usually due to a multi-bit ECC memory or cache * failure. * * In addition there is a "soft offline" entry point that allows stop using * not-yet-corrupted-by-suspicious pages without killing anything. * * Handles page cache pages in various states. The tricky part * here is that we can access any page asynchronous to other VM * users, because memory failures could happen anytime and anywhere, * possibly violating some of their assumptions. This is why this code * has to be extremely careful. Generally it tries to use normal locking * rules, as in get the standard locks, even if that means the * error handling takes potentially a long time. * here is that we can access any page asynchronously in respect to * other VM users, because memory failures could happen anytime and * anywhere. This could violate some of their assumptions. This is why * this code has to be extremely careful. Generally it tries to use * normal locking rules, as in get the standard locks, even if that means * the error handling takes potentially a long time. * * The operation to map back from RMAP chains to processes has to walk * the complete process list and has non linear complexity with the number * mappings. In short it can be quite slow. But since memory corruptions * are rare we hope to get away with this. * There are several operations here with exponential complexity because * of unsuitable VM data structures. For example the operation to map back * from RMAP chains to processes has to walk the complete process list and * has non linear complexity with the number. But since memory corruptions * are rare we hope to get away with this. This avoids impacting the core * VM. */ /* Loading @@ -30,7 +35,6 @@ * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages * - pass bad pages to kdump next kernel */ #define DEBUG 1 /* remove me in 2.6.34 */ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/page-flags.h> Loading Loading @@ -78,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p) return 0; /* * page_mapping() does not accept slab page * page_mapping() does not accept slab pages. */ if (PageSlab(p)) return -EINVAL; Loading Loading @@ -268,7 +272,7 @@ struct to_kill { struct list_head nd; struct task_struct *tsk; unsigned long addr; unsigned addr_valid:1; char addr_valid; }; /* Loading Loading @@ -309,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * a SIGKILL because the error is not contained anymore. */ if (tk->addr == -EFAULT) { pr_debug("MCE: Unable to find user space address %lx in %s\n", pr_info("MCE: Unable to find user space address %lx in %s\n", page_to_pfn(p), tsk->comm); tk->addr_valid = 0; } Loading Loading @@ -577,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) pfn, err); } else if (page_has_private(p) && !try_to_release_page(p, GFP_NOIO)) { pr_debug("MCE %#lx: failed to release buffers\n", pfn); pr_info("MCE %#lx: failed to release buffers\n", pfn); } else { ret = RECOVERED; } Loading Loading @@ -836,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p, return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; } #define N_UNMAP_TRIES 5 /* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. Loading @@ -849,7 +851,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, struct address_space *mapping; LIST_HEAD(tokill); int ret; int i; int kill = 1; struct page *hpage = compound_head(p); Loading Loading @@ -903,17 +904,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill); /* * try_to_unmap can fail temporarily due to races. * Try a few times (RED-PEN better strategy?) */ for (i = 0; i < N_UNMAP_TRIES; i++) { ret = try_to_unmap(hpage, ttu); if (ret == SWAP_SUCCESS) break; pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); } if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); Loading Loading @@ -1147,7 +1138,7 @@ int unpoison_memory(unsigned long pfn) page = compound_head(p); if (!PageHWPoison(p)) { pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); return 0; } Loading @@ -1156,7 +1147,7 @@ int unpoison_memory(unsigned long pfn) if (!get_page_unless_zero(page)) { if (TestClearPageHWPoison(p)) atomic_long_sub(nr_pages, &mce_bad_pages); pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); return 0; } Loading @@ -1168,7 +1159,7 @@ int unpoison_memory(unsigned long pfn) * the free buddy page pool. */ if (TestClearPageHWPoison(page)) { pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); atomic_long_sub(nr_pages, &mce_bad_pages); freeit = 1; } Loading Loading @@ -1217,12 +1208,12 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) set_migratetype_isolate(p); if (!get_page_unless_zero(compound_head(p))) { if (is_free_buddy_page(p)) { pr_debug("get_any_page: %#lx free buddy page\n", pfn); pr_info("get_any_page: %#lx free buddy page\n", pfn); /* Set hwpoison bit while page is still isolated */ SetPageHWPoison(p); ret = 0; } else { pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", pfn, p->flags); ret = -EIO; } Loading Loading @@ -1288,7 +1279,7 @@ int soft_offline_page(struct page *page, int flags) goto done; } if (!PageLRU(page)) { pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", pfn, page->flags); return -EIO; } Loading @@ -1302,7 +1293,7 @@ int soft_offline_page(struct page *page, int flags) if (PageHWPoison(page)) { unlock_page(page); put_page(page); pr_debug("soft offline: %#lx page already poisoned\n", pfn); pr_info("soft offline: %#lx page already poisoned\n", pfn); return -EBUSY; } Loading @@ -1323,7 +1314,7 @@ int soft_offline_page(struct page *page, int flags) put_page(page); if (ret == 1) { ret = 0; pr_debug("soft_offline: %#lx: invalidated\n", pfn); pr_info("soft_offline: %#lx: invalidated\n", pfn); goto done; } Loading @@ -1339,13 +1330,13 @@ int soft_offline_page(struct page *page, int flags) list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); if (ret) { pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) ret = -EIO; } } else { pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", pfn, ret, page_count(page), page->flags); } if (ret) Loading Loading
mm/memory-failure.c +32 −41 Original line number Diff line number Diff line Loading @@ -7,21 +7,26 @@ * Free Software Foundation. * * High level machine check handler. Handles pages reported by the * hardware as being corrupted usually due to a 2bit ECC memory or cache * hardware as being corrupted usually due to a multi-bit ECC memory or cache * failure. * * In addition there is a "soft offline" entry point that allows stop using * not-yet-corrupted-by-suspicious pages without killing anything. * * Handles page cache pages in various states. The tricky part * here is that we can access any page asynchronous to other VM * users, because memory failures could happen anytime and anywhere, * possibly violating some of their assumptions. This is why this code * has to be extremely careful. Generally it tries to use normal locking * rules, as in get the standard locks, even if that means the * error handling takes potentially a long time. * here is that we can access any page asynchronously in respect to * other VM users, because memory failures could happen anytime and * anywhere. This could violate some of their assumptions. This is why * this code has to be extremely careful. Generally it tries to use * normal locking rules, as in get the standard locks, even if that means * the error handling takes potentially a long time. * * The operation to map back from RMAP chains to processes has to walk * the complete process list and has non linear complexity with the number * mappings. In short it can be quite slow. But since memory corruptions * are rare we hope to get away with this. * There are several operations here with exponential complexity because * of unsuitable VM data structures. For example the operation to map back * from RMAP chains to processes has to walk the complete process list and * has non linear complexity with the number. But since memory corruptions * are rare we hope to get away with this. This avoids impacting the core * VM. */ /* Loading @@ -30,7 +35,6 @@ * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages * - pass bad pages to kdump next kernel */ #define DEBUG 1 /* remove me in 2.6.34 */ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/page-flags.h> Loading Loading @@ -78,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p) return 0; /* * page_mapping() does not accept slab page * page_mapping() does not accept slab pages. */ if (PageSlab(p)) return -EINVAL; Loading Loading @@ -268,7 +272,7 @@ struct to_kill { struct list_head nd; struct task_struct *tsk; unsigned long addr; unsigned addr_valid:1; char addr_valid; }; /* Loading Loading @@ -309,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * a SIGKILL because the error is not contained anymore. */ if (tk->addr == -EFAULT) { pr_debug("MCE: Unable to find user space address %lx in %s\n", pr_info("MCE: Unable to find user space address %lx in %s\n", page_to_pfn(p), tsk->comm); tk->addr_valid = 0; } Loading Loading @@ -577,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) pfn, err); } else if (page_has_private(p) && !try_to_release_page(p, GFP_NOIO)) { pr_debug("MCE %#lx: failed to release buffers\n", pfn); pr_info("MCE %#lx: failed to release buffers\n", pfn); } else { ret = RECOVERED; } Loading Loading @@ -836,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p, return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; } #define N_UNMAP_TRIES 5 /* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. Loading @@ -849,7 +851,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, struct address_space *mapping; LIST_HEAD(tokill); int ret; int i; int kill = 1; struct page *hpage = compound_head(p); Loading Loading @@ -903,17 +904,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill); /* * try_to_unmap can fail temporarily due to races. * Try a few times (RED-PEN better strategy?) */ for (i = 0; i < N_UNMAP_TRIES; i++) { ret = try_to_unmap(hpage, ttu); if (ret == SWAP_SUCCESS) break; pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); } if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); Loading Loading @@ -1147,7 +1138,7 @@ int unpoison_memory(unsigned long pfn) page = compound_head(p); if (!PageHWPoison(p)) { pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); return 0; } Loading @@ -1156,7 +1147,7 @@ int unpoison_memory(unsigned long pfn) if (!get_page_unless_zero(page)) { if (TestClearPageHWPoison(p)) atomic_long_sub(nr_pages, &mce_bad_pages); pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); return 0; } Loading @@ -1168,7 +1159,7 @@ int unpoison_memory(unsigned long pfn) * the free buddy page pool. */ if (TestClearPageHWPoison(page)) { pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); atomic_long_sub(nr_pages, &mce_bad_pages); freeit = 1; } Loading Loading @@ -1217,12 +1208,12 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) set_migratetype_isolate(p); if (!get_page_unless_zero(compound_head(p))) { if (is_free_buddy_page(p)) { pr_debug("get_any_page: %#lx free buddy page\n", pfn); pr_info("get_any_page: %#lx free buddy page\n", pfn); /* Set hwpoison bit while page is still isolated */ SetPageHWPoison(p); ret = 0; } else { pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", pfn, p->flags); ret = -EIO; } Loading Loading @@ -1288,7 +1279,7 @@ int soft_offline_page(struct page *page, int flags) goto done; } if (!PageLRU(page)) { pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", pfn, page->flags); return -EIO; } Loading @@ -1302,7 +1293,7 @@ int soft_offline_page(struct page *page, int flags) if (PageHWPoison(page)) { unlock_page(page); put_page(page); pr_debug("soft offline: %#lx page already poisoned\n", pfn); pr_info("soft offline: %#lx page already poisoned\n", pfn); return -EBUSY; } Loading @@ -1323,7 +1314,7 @@ int soft_offline_page(struct page *page, int flags) put_page(page); if (ret == 1) { ret = 0; pr_debug("soft_offline: %#lx: invalidated\n", pfn); pr_info("soft_offline: %#lx: invalidated\n", pfn); goto done; } Loading @@ -1339,13 +1330,13 @@ int soft_offline_page(struct page *page, int flags) list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); if (ret) { pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) ret = -EIO; } } else { pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", pfn, ret, page_count(page), page->flags); } if (ret) Loading