Merge branch 'hwpoison-2.6.32' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6

* 'hwpoison-2.6.32' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6:
  HWPOISON: fix invalid page count in printk output
  HWPOISON: Allow schedule_on_each_cpu() from keventd
  HWPOISON: fix /proc/meminfo alignment
  HWPOISON: fix oops on ksm pages
  HWPOISON: Fix page count leak in hwpoison late kill in do_swap_page
  HWPOISON: return early on non-LRU pages
  HWPOISON: Add brief hwpoison description to Documentation
  HWPOISON: Clean up PR_MCE_KILL interface
This commit is contained in:
commit 3242f9804b

6 changed files with 216 additions and 35 deletions

Documentation/vm/hwpoison.txt (new file, 136 lines)
What is hwpoison?

Upcoming Intel CPUs have support for recovering from some memory errors
(``MCA recovery''). This requires the OS to declare a page "poisoned",
kill the processes associated with it and avoid using it in the future.

This patchkit implements the necessary infrastructure in the VM.

To quote the overview comment:

 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted, usually due to a 2-bit ECC memory or
 * cache failure.
 *
 * This focusses on pages detected as corrupted in the background.
 * When the current CPU tries to consume corruption the currently
 * running process can just be killed directly instead. This implies
 * that if the error cannot be handled for some reason it's safe to
 * just ignore it, because no corruption has been consumed yet. Instead,
 * when that happens another machine check will happen.
 *
 * Handles page cache pages in various states. The tricky part
 * here is that we can access any page asynchronously with respect to
 * other VM users, because memory failures could happen anytime and
 * anywhere, possibly violating some of their assumptions. This is why
 * this code has to be extremely careful. Generally it tries to use
 * normal locking rules, as in get the standard locks, even if that
 * means the error handling takes potentially a long time.
 *
 * Some of the operations here are somewhat inefficient and have
 * non-linear algorithmic complexity, because the data structures have
 * not been optimized for this case. This is in particular the case
 * for the mapping from a vma to a process. Since this case is expected
 * to be rare we hope we can get away with this.

The code consists of the high level handler in mm/memory-failure.c,
a new page poison bit and various checks in the VM to handle poisoned
pages.

The main target right now is KVM guests, but it works for all kinds
of applications. KVM support requires a recent qemu-kvm release.

For the KVM use there was need for a new signal type so that
KVM can inject the machine check into the guest with the proper
address. This in theory allows other applications to handle
memory failures too. The expectation is that nearly all applications
won't do that, but some very specialized ones might.

---

There are two (actually three) modes memory failure recovery can be in:

vm.memory_failure_recovery sysctl set to zero:
        All memory failures cause a panic. Do not attempt recovery.
        (on x86 this can also be affected by the tolerant level of the
        MCE subsystem)

early kill
        (can be controlled globally and per process)
        Send SIGBUS to the application as soon as the error is detected.
        This allows applications that can handle memory errors in a
        gentle way (e.g. by dropping the affected object) to do so; a
        sketch of such a handler follows below.
        This is the mode used by KVM qemu.

late kill
        Send SIGBUS when the application runs into the corrupted page.
        This is best for memory error unaware applications and is the
        default.
        Note some pages are always handled as late kill.
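As an illustration (not part of the patchkit itself), here is a minimal
sketch of an early-kill-aware application. It assumes the 2.6.32-era
siginfo extensions, where a hwpoison SIGBUS carries si_code
BUS_MCEERR_AO ("action optional": the error was found in the background)
or BUS_MCEERR_AR ("action required": poisoned data was actually
touched), with the failing address in si_addr:

    #include <signal.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    static void memory_error_handler(int sig, siginfo_t *si, void *ctx)
    {
        (void)sig;
        (void)ctx;
        if (si->si_code == BUS_MCEERR_AO) {
            /* Early kill report: the page at si->si_addr is lost, but
             * no corrupted data has been consumed yet. A gentle
             * application would drop the affected object and go on. */
            static const char msg[] = "dropping poisoned page\n";
            write(STDERR_FILENO, msg, sizeof(msg) - 1);
        } else {
            /* BUS_MCEERR_AR: the poisoned page was consumed, nothing
             * gentle is left to do. */
            _exit(EXIT_FAILURE);
        }
    }

    int main(void)
    {
        struct sigaction sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_sigaction = memory_error_handler;
        sa.sa_flags = SA_SIGINFO;
        sigaction(SIGBUS, &sa, NULL);
        /* ... map data and opt in to early kill via prctl(), below ... */
        return 0;
    }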

---

User control:

vm.memory_failure_recovery
        See sysctl.txt

vm.memory_failure_early_kill
        Enable early kill mode globally

PR_MCE_KILL
        Set early/late kill mode/revert to system default
        arg1: PR_MCE_KILL_CLEAR: Revert to system default
        arg1: PR_MCE_KILL_SET: arg2 defines thread specific mode
                PR_MCE_KILL_EARLY: Early kill
                PR_MCE_KILL_LATE: Late kill
                PR_MCE_KILL_DEFAULT: Use system global default

PR_MCE_KILL_GET
        Return current mode
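To make the prctl interface concrete, a minimal sketch of a thread
opting in to early kill and reading the mode back (the constants match
include/linux/prctl.h from this commit; the fallback definitions are
only needed when the installed headers predate the interface):

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_MCE_KILL
    # define PR_MCE_KILL            33
    # define PR_MCE_KILL_CLEAR      0
    # define PR_MCE_KILL_SET        1
    # define PR_MCE_KILL_LATE       0
    # define PR_MCE_KILL_EARLY      1
    # define PR_MCE_KILL_DEFAULT    2
    # define PR_MCE_KILL_GET        34
    #endif

    int main(void)
    {
        /* Opt this thread in to early kill. */
        prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);

        /* Reads back PR_MCE_KILL_EARLY at this point. */
        printf("mode: %d\n", prctl(PR_MCE_KILL_GET, 0, 0, 0, 0));

        /* Revert to the system default. */
        prctl(PR_MCE_KILL, PR_MCE_KILL_CLEAR, 0, 0, 0);
        return 0;
    }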

---

Testing:

madvise(MADV_HWPOISON, ....)
        (as root)
        Poison a page in the process for testing
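A hedged sketch of such a test, assuming a kernel with
CONFIG_MEMORY_FAILURE and headers that define MADV_HWPOISON (value 100
in asm-generic/mman-common.h of this era, hence the fallback):

    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #ifndef MADV_HWPOISON
    # define MADV_HWPOISON 100
    #endif

    int main(void)
    {
        long pagesize = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        p[0] = 1;   /* fault the page in so there is something to poison */

        if (madvise(p, pagesize, MADV_HWPOISON) != 0)
            perror("madvise");  /* needs root */

        /* Touching p[0] again now runs into the poisoned page; in late
         * kill mode this raises SIGBUS at the faulting access. */
        return 0;
    }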
hwpoison-inject module through debugfs
        /sys/debug/hwpoison/corrupt-pfn

        Inject hwpoison fault at PFN echoed into this file
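For instance, a small helper that injects a fault at the PFN given on
the command line (the path is as documented above; note that debugfs is
commonly mounted at /sys/kernel/debug instead, and the hwpoison-inject
module must be loaded first):

    #include <stdio.h>

    int main(int argc, char **argv)
    {
        FILE *f;

        if (argc < 2) {
            fprintf(stderr, "usage: %s <pfn>\n", argv[0]);
            return 1;
        }
        f = fopen("/sys/debug/hwpoison/corrupt-pfn", "w");
        if (!f) {
            perror("corrupt-pfn");
            return 1;
        }
        /* The kernel side parses the PFN; pass it through verbatim. */
        fprintf(f, "%s\n", argv[1]);
        fclose(f);
        return 0;
    }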
Architecture specific MCE injector

x86 has mce-inject, mce-test

Some portable hwpoison test programs in mce-test, see below.

---

References:

http://halobates.de/mce-lc09-2.pdf
        Overview presentation from LinuxCon 09

git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
        Test suite (hwpoison specific portable tests in tsrc)

git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
        x86 specific injector

---

Limitations:

- Not all page types are supported, and some never will be. Most kernel
  internal objects cannot be recovered; only LRU pages are handled for
  now.
- Right now hugepage support is missing.

---
Andi Kleen, Oct 2009
include/linux/prctl.h

@@ -88,6 +88,18 @@
 #define PR_TASK_PERF_EVENTS_DISABLE     31
 #define PR_TASK_PERF_EVENTS_ENABLE      32
 
+/*
+ * Set early/late kill mode for hwpoison memory corruption.
+ * This influences when the process gets killed on a memory corruption.
+ */
 #define PR_MCE_KILL     33
+# define PR_MCE_KILL_CLEAR      0
+# define PR_MCE_KILL_SET        1
+
+# define PR_MCE_KILL_LATE       0
+# define PR_MCE_KILL_EARLY      1
+# define PR_MCE_KILL_DEFAULT    2
+
+#define PR_MCE_KILL_GET 34
 
 #endif /* _LINUX_PRCTL_H */
kernel/sys.c (23 lines changed)

@@ -1548,24 +1548,37 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
         if (arg4 | arg5)
             return -EINVAL;
         switch (arg2) {
-        case 0:
+        case PR_MCE_KILL_CLEAR:
             if (arg3 != 0)
                 return -EINVAL;
             current->flags &= ~PF_MCE_PROCESS;
             break;
-        case 1:
+        case PR_MCE_KILL_SET:
             current->flags |= PF_MCE_PROCESS;
-            if (arg3 != 0)
+            if (arg3 == PR_MCE_KILL_EARLY)
                 current->flags |= PF_MCE_EARLY;
-            else
+            else if (arg3 == PR_MCE_KILL_LATE)
                 current->flags &= ~PF_MCE_EARLY;
+            else if (arg3 == PR_MCE_KILL_DEFAULT)
+                current->flags &=
+                    ~(PF_MCE_EARLY|PF_MCE_PROCESS);
+            else
+                return -EINVAL;
             break;
         default:
             return -EINVAL;
         }
         error = 0;
         break;
+    case PR_MCE_KILL_GET:
+        if (arg2 | arg3 | arg4 | arg5)
+            return -EINVAL;
+        if (current->flags & PF_MCE_PROCESS)
+            error = (current->flags & PF_MCE_EARLY) ?
+                PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
+        else
+            error = PR_MCE_KILL_DEFAULT;
+        break;
     default:
         error = -EINVAL;
         break;
kernel/workqueue.c

@@ -685,21 +685,38 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
 int schedule_on_each_cpu(work_func_t func)
 {
     int cpu;
+    int orig = -1;
     struct work_struct *works;
 
     works = alloc_percpu(struct work_struct);
     if (!works)
         return -ENOMEM;
 
+    /*
+     * when running in keventd don't schedule a work item on itself.
+     * Can just call directly because the work queue is already bound.
+     * This also is faster.
+     * Make this a generic parameter for other workqueues?
+     */
+    if (current_is_keventd()) {
+        orig = raw_smp_processor_id();
+        INIT_WORK(per_cpu_ptr(works, orig), func);
+        func(per_cpu_ptr(works, orig));
+    }
+
     get_online_cpus();
     for_each_online_cpu(cpu) {
         struct work_struct *work = per_cpu_ptr(works, cpu);
 
+        if (cpu == orig)
+            continue;
         INIT_WORK(work, func);
         schedule_work_on(cpu, work);
     }
-    for_each_online_cpu(cpu)
-        flush_work(per_cpu_ptr(works, cpu));
+    for_each_online_cpu(cpu) {
+        if (cpu != orig)
+            flush_work(per_cpu_ptr(works, cpu));
+    }
     put_online_cpus();
     free_percpu(works);
     return 0;
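For context: schedule_on_each_cpu() runs a work function once on every
online CPU and blocks until all of them have finished. Before this
change, calling it from keventd deadlocked, because the work item queued
on the local CPU could only be run by the very keventd thread that was
waiting for it; now that CPU's work is simply called directly. A hedged
sketch of a caller (hypothetical code, built into the kernel since
schedule_on_each_cpu() is not exported to modules in this era):

    #include <linux/kernel.h>
    #include <linux/smp.h>
    #include <linux/workqueue.h>

    /* Runs once on each online CPU, possibly directly from keventd. */
    static void drain_local_state(struct work_struct *work)
    {
        printk(KERN_INFO "draining on CPU %d\n", raw_smp_processor_id());
    }

    static int drain_all_cpus(void)
    {
        /* Blocks until drain_local_state() has run everywhere;
         * returns -ENOMEM if the per-cpu work items cannot be
         * allocated. */
        return schedule_on_each_cpu(drain_local_state);
    }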
mm/memory-failure.c

@@ -371,9 +371,6 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
     int ret = FAILED;
     struct address_space *mapping;
 
-    if (!isolate_lru_page(p))
-        page_cache_release(p);
-
     /*
      * For anonymous pages we're done the only reference left
      * should be the one m_f() holds.
@@ -499,30 +496,18 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
  */
 static int me_swapcache_dirty(struct page *p, unsigned long pfn)
 {
-    int ret = FAILED;
-
     ClearPageDirty(p);
     /* Trigger EIO in shmem: */
     ClearPageUptodate(p);
 
-    if (!isolate_lru_page(p)) {
-        page_cache_release(p);
-        ret = DELAYED;
-    }
-
-    return ret;
+    return DELAYED;
 }
 
 static int me_swapcache_clean(struct page *p, unsigned long pfn)
 {
-    int ret = FAILED;
-
-    if (!isolate_lru_page(p)) {
-        page_cache_release(p);
-        ret = RECOVERED;
-    }
     delete_from_swap_cache(p);
-    return ret;
+
+    return RECOVERED;
 }
 
 /*
@@ -612,8 +597,6 @@ static struct page_state {
     { 0,        0,        "unknown page state",    me_unknown },
 };
 
-#undef lru
-
 static void action_result(unsigned long pfn, char *msg, int result)
 {
     struct page *page = NULL;
@@ -630,13 +613,16 @@ static int page_action(struct page_state *ps, struct page *p,
             unsigned long pfn, int ref)
 {
     int result;
+    int count;
 
     result = ps->action(p, pfn);
     action_result(pfn, ps->msg, result);
-    if (page_count(p) != 1 + ref)
+
+    count = page_count(p) - 1 - ref;
+    if (count != 0)
         printk(KERN_ERR
                "MCE %#lx: %s page still referenced by %d users\n",
-               pfn, ps->msg, page_count(p) - 1);
+               pfn, ps->msg, count);
 
     /* Could do more checks here if page looks ok */
     /*
@@ -665,9 +651,6 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
     if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p))
         return;
 
-    if (!PageLRU(p))
-        lru_add_drain_all();
-
     /*
      * This check implies we don't kill processes if their pages
      * are in the swap cache early. Those are always late kills.
@@ -739,6 +722,7 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
 
 int __memory_failure(unsigned long pfn, int trapno, int ref)
 {
+    unsigned long lru_flag;
     struct page_state *ps;
     struct page *p;
     int res;
@@ -775,6 +759,24 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
         return PageBuddy(compound_head(p)) ? 0 : -EBUSY;
     }
 
+    /*
+     * We ignore non-LRU pages for good reasons.
+     * - PG_locked is only well defined for LRU pages and a few others
+     * - to avoid races with __set_page_locked()
+     * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
+     * The check (unnecessarily) ignores LRU pages being isolated and
+     * walked by the page reclaim code, however that's not a big loss.
+     */
+    if (!PageLRU(p))
+        lru_add_drain_all();
+    lru_flag = p->flags & lru;
+    if (isolate_lru_page(p)) {
+        action_result(pfn, "non LRU", IGNORED);
+        put_page(p);
+        return -EBUSY;
+    }
+    page_cache_release(p);
+
     /*
      * Lock the page and wait for writeback to finish.
      * It's very difficult to mess with pages currently under IO
@@ -791,7 +793,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
     /*
      * Torn down by someone else?
      */
-    if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
+    if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) {
         action_result(pfn, "already truncated LRU", IGNORED);
         res = 0;
         goto out;
@@ -799,7 +801,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
 
     res = -EBUSY;
     for (ps = error_states;; ps++) {
-        if ((p->flags & ps->mask) == ps->res) {
+        if (((p->flags | lru_flag) & ps->mask) == ps->res) {
             res = page_action(ps, p, pfn, ref);
             break;
         }
mm/memory.c

@@ -2542,7 +2542,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
     } else if (PageHWPoison(page)) {
         ret = VM_FAULT_HWPOISON;
         delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
-        goto out;
+        goto out_release;
     }
 
     lock_page(page);
@@ -2614,6 +2614,7 @@ out_nomap:
     pte_unmap_unlock(page_table, ptl);
 out_page:
     unlock_page(page);
+out_release:
     page_cache_release(page);
     return ret;
 }