Merge commit 'v2.6.36-rc7' into core/memblock

Merge reason: Update from -rc3 to -rc7.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
This commit is contained in:
Ingo Molnar 2010-10-08 09:14:51 +02:00
commit 153db80f8c
964 changed files with 10076 additions and 5634 deletions

View file

@ -189,7 +189,7 @@ config COMPACTION
config MIGRATION
bool "Page migration"
def_bool y
depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE
depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION
help
Allows the migration of the physical location of pages of processes
while the virtual addresses are not changed. This is useful in

View file

@ -30,6 +30,7 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info);
struct backing_dev_info noop_backing_dev_info = {
.name = "noop",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
@ -243,6 +244,7 @@ static int __init default_bdi_init(void)
err = bdi_init(&default_backing_dev_info);
if (!err)
bdi_register(&default_backing_dev_info, NULL, "default");
err = bdi_init(&noop_backing_dev_info);
return err;
}
@ -445,8 +447,8 @@ static int bdi_forker_thread(void *ptr)
switch (action) {
case FORK_THREAD:
__set_current_state(TASK_RUNNING);
task = kthread_run(bdi_writeback_thread, &bdi->wb, "flush-%s",
dev_name(bdi->dev));
task = kthread_create(bdi_writeback_thread, &bdi->wb,
"flush-%s", dev_name(bdi->dev));
if (IS_ERR(task)) {
/*
* If thread creation fails, force writeout of
@ -457,10 +459,13 @@ static int bdi_forker_thread(void *ptr)
/*
* The spinlock makes sure we do not lose
* wake-ups when racing with 'bdi_queue_work()'.
* And as soon as the bdi thread is visible, we
* can start it.
*/
spin_lock_bh(&bdi->wb_lock);
bdi->wb.task = task;
spin_unlock_bh(&bdi->wb_lock);
wake_up_process(task);
}
break;

View file

@ -116,8 +116,8 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
*/
vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
flush_dcache_page(tovec->bv_page);
bounce_copy_vec(tovec, vfrom);
flush_dcache_page(tovec->bv_page);
}
}

View file

@ -214,15 +214,16 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct zone *zone)
{
unsigned long inactive, isolated;
unsigned long active, inactive, isolated;
inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
zone_page_state(zone, NR_INACTIVE_ANON);
active = zone_page_state(zone, NR_ACTIVE_FILE) +
zone_page_state(zone, NR_ACTIVE_ANON);
isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
zone_page_state(zone, NR_ISOLATED_ANON);
return isolated > inactive;
return isolated > (inactive + active) / 2;
}
/*

View file

@ -125,7 +125,6 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
{
struct mm_struct *mm = current->mm;
struct address_space *mapping;
unsigned long end = start + size;
struct vm_area_struct *vma;
int err = -EINVAL;
int has_write_lock = 0;
@ -142,6 +141,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (start + size <= start)
return err;
/* Does pgoff wrap? */
if (pgoff + (size >> PAGE_SHIFT) < pgoff)
return err;
/* Can we represent this offset inside this architecture's pte's? */
#if PTE_FILE_MAX_BITS < BITS_PER_LONG
if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
@ -168,7 +171,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (!(vma->vm_flags & VM_CAN_NONLINEAR))
goto out;
if (end <= start || start < vma->vm_start || end > vma->vm_end)
if (start < vma->vm_start || start + size > vma->vm_end)
goto out;
/* Must set VM_NONLINEAR before any pages are populated. */

View file

@ -2324,11 +2324,8 @@ retry_avoidcopy:
* and just make the page writable */
avoidcopy = (page_mapcount(old_page) == 1);
if (avoidcopy) {
if (!trylock_page(old_page)) {
if (PageAnon(old_page))
page_move_anon_rmap(old_page, vma, address);
} else
unlock_page(old_page);
if (PageAnon(old_page))
page_move_anon_rmap(old_page, vma, address);
set_huge_ptep_writable(vma, address, ptep);
return 0;
}
@ -2404,7 +2401,7 @@ retry_avoidcopy:
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page);
hugepage_add_anon_rmap(new_page, vma, address);
hugepage_add_new_anon_rmap(new_page, vma, address);
/* Make the old page be freed below */
new_page = old_page;
mmu_notifier_invalidate_range_end(mm,
@ -2631,10 +2628,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vma, address);
}
if (!pagecache_page) {
page = pte_page(entry);
/*
* hugetlb_cow() requires page locks of pte_page(entry) and
* pagecache_page, so here we need take the former one
* when page != pagecache_page or !pagecache_page.
* Note that locking order is always pagecache_page -> page,
* so no worry about deadlock.
*/
page = pte_page(entry);
if (page != pagecache_page)
lock_page(page);
}
spin_lock(&mm->page_table_lock);
/* Check for a racing update before calling hugetlb_cow */
@ -2661,9 +2664,8 @@ out_page_table_lock:
if (pagecache_page) {
unlock_page(pagecache_page);
put_page(pagecache_page);
} else {
unlock_page(page);
}
unlock_page(page);
out_mutex:
mutex_unlock(&hugetlb_instantiation_mutex);

View file

@ -712,7 +712,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
if (!ptep)
goto out;
if (pte_write(*ptep)) {
if (pte_write(*ptep) || pte_dirty(*ptep)) {
pte_t entry;
swapped = PageSwapCache(page);
@ -735,7 +735,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
set_pte_at(mm, addr, ptep, entry);
goto out_unlock;
}
entry = pte_wrprotect(entry);
if (pte_dirty(entry))
set_page_dirty(page);
entry = pte_mkclean(pte_wrprotect(entry));
set_pte_at_notify(mm, addr, ptep, entry);
}
*orig_pte = *ptep;
@ -1504,8 +1506,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
{
struct page *new_page;
unlock_page(page); /* any racers will COW it, not modify it */
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (new_page) {
copy_user_highpage(new_page, page, address, vma);
@ -1521,7 +1521,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
add_page_to_unevictable_list(new_page);
}
page_cache_release(page);
return new_page;
}

View file

@ -2623,7 +2623,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned int flags, pte_t orig_pte)
{
spinlock_t *ptl;
struct page *page;
struct page *page, *swapcache = NULL;
swp_entry_t entry;
pte_t pte;
struct mem_cgroup *ptr = NULL;
@ -2679,10 +2679,25 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
lock_page(page);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
page = ksm_might_need_to_copy(page, vma, address);
if (!page) {
ret = VM_FAULT_OOM;
goto out;
/*
* Make sure try_to_free_swap or reuse_swap_page or swapoff did not
* release the swapcache from under us. The page pin, and pte_same
* test below, are not enough to exclude that. Even if it is still
* swapcache, we need to check that the page's swap has not changed.
*/
if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
goto out_page;
if (ksm_might_need_to_copy(page, vma, address)) {
swapcache = page;
page = ksm_does_need_to_copy(page, vma, address);
if (unlikely(!page)) {
ret = VM_FAULT_OOM;
page = swapcache;
swapcache = NULL;
goto out_page;
}
}
if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
@ -2735,6 +2750,18 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
if (swapcache) {
/*
* Hold the lock to avoid the swap entry to be reused
* until we take the PT lock for the pte_same() check
* (to avoid false positives from pte_same). For
* further safety release the lock after the swap_free
* so that the swap count won't change under a
* parallel locked swapcache.
*/
unlock_page(swapcache);
page_cache_release(swapcache);
}
if (flags & FAULT_FLAG_WRITE) {
ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
@ -2756,6 +2783,10 @@ out_page:
unlock_page(page);
out_release:
page_cache_release(page);
if (swapcache) {
unlock_page(swapcache);
page_cache_release(swapcache);
}
return ret;
}

View file

@ -584,19 +584,19 @@ static inline int pageblock_free(struct page *page)
/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
int pageblocks_stride;
/* Ensure the starting page is pageblock-aligned */
BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
/* Move forward by at least 1 * pageblock_nr_pages */
pageblocks_stride = 1;
/* If the entire pageblock is free, move to the end of free page */
if (pageblock_free(page))
pageblocks_stride += page_order(page) - pageblock_order;
if (pageblock_free(page)) {
int order;
/* be careful. we don't have locks, page_order can be changed.*/
order = page_order(page);
if ((order < MAX_ORDER) && (order >= pageblock_order))
return page + (1 << order);
}
return page + (pageblocks_stride * pageblock_nr_pages);
return page + pageblock_nr_pages;
}
/* Checks if this range of memory is likely to be hot-removable. */

View file

@ -135,12 +135,6 @@ void munlock_vma_page(struct page *page)
}
}
/* Is the vma a continuation of the stack vma above it? */
static inline int vma_stack_continue(struct vm_area_struct *vma, unsigned long addr)
{
return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
}
static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
{
return (vma->vm_flags & VM_GROWSDOWN) &&

View file

@ -2009,6 +2009,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
removed_exe_file_vma(mm);
fput(new->vm_file);
}
unlink_anon_vmas(new);
out_free_mpol:
mpol_put(pol);
out_free_vma:

View file

@ -87,3 +87,24 @@ int memmap_valid_within(unsigned long pfn,
return 1;
}
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
#ifdef CONFIG_SMP
/* Called when a more accurate view of NR_FREE_PAGES is needed */
unsigned long zone_nr_free_pages(struct zone *zone)
{
unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
/*
* While kswapd is awake, it is considered the zone is under some
* memory pressure. Under pressure, there is a risk that
* per-cpu-counter-drift will allow the min watermark to be breached
* potentially causing a live-lock. While kswapd is awake and
* free pages are low, get a better estimate for free pages
*/
if (nr_free_pages < zone->percpu_drift_mark &&
!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
return zone_page_state_snapshot(zone, NR_FREE_PAGES);
return nr_free_pages;
}
#endif /* CONFIG_SMP */

View file

@ -121,8 +121,8 @@ struct task_struct *find_lock_task_mm(struct task_struct *p)
}
/* return true if the task is not adequate as candidate victim task. */
static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem,
const nodemask_t *nodemask)
static bool oom_unkillable_task(struct task_struct *p,
const struct mem_cgroup *mem, const nodemask_t *nodemask)
{
if (is_global_init(p))
return true;
@ -208,8 +208,13 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
*/
points += p->signal->oom_score_adj;
if (points < 0)
return 0;
/*
* Never return 0 for an eligible task that may be killed since it's
* possible that no single user task uses more than 0.1% of memory and
* no single admin tasks uses more than 3.0%.
*/
if (points <= 0)
return 1;
return (points < 1000) ? points : 1000;
}
@ -339,26 +344,24 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
/**
* dump_tasks - dump current memory state of all system tasks
* @mem: current's memory controller, if constrained
* @nodemask: nodemask passed to page allocator for mempolicy ooms
*
* Dumps the current memory state of all system tasks, excluding kernel threads.
* Dumps the current memory state of all eligible tasks. Tasks not in the same
* memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
* are not shown.
* State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
* value, oom_score_adj value, and name.
*
* If the actual is non-NULL, only tasks that are a member of the mem_cgroup are
* shown.
*
* Call with tasklist_lock read-locked.
*/
static void dump_tasks(const struct mem_cgroup *mem)
static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask)
{
struct task_struct *p;
struct task_struct *task;
pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
for_each_process(p) {
if (p->flags & PF_KTHREAD)
continue;
if (mem && !task_in_mem_cgroup(p, mem))
if (oom_unkillable_task(p, mem, nodemask))
continue;
task = find_lock_task_mm(p);
@ -381,7 +384,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
}
static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
struct mem_cgroup *mem)
struct mem_cgroup *mem, const nodemask_t *nodemask)
{
task_lock(current);
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
@ -394,7 +397,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
mem_cgroup_print_oom_info(mem, p);
show_mem();
if (sysctl_oom_dump_tasks)
dump_tasks(mem);
dump_tasks(mem, nodemask);
}
#define K(x) ((x) << (PAGE_SHIFT-10))
@ -436,7 +439,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
unsigned int victim_points = 0;
if (printk_ratelimit())
dump_header(p, gfp_mask, order, mem);
dump_header(p, gfp_mask, order, mem, nodemask);
/*
* If the task is already exiting, don't alarm the sysadmin or kill
@ -482,7 +485,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
int order)
int order, const nodemask_t *nodemask)
{
if (likely(!sysctl_panic_on_oom))
return;
@ -496,7 +499,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
return;
}
read_lock(&tasklist_lock);
dump_header(NULL, gfp_mask, order, NULL);
dump_header(NULL, gfp_mask, order, NULL, nodemask);
read_unlock(&tasklist_lock);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
@ -509,7 +512,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
unsigned int points = 0;
struct task_struct *p;
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0);
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
read_lock(&tasklist_lock);
retry:
@ -641,6 +644,7 @@ static void clear_system_oom(void)
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *nodemask)
{
const nodemask_t *mpol_mask;
struct task_struct *p;
unsigned long totalpages;
unsigned long freed = 0;
@ -670,7 +674,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
*/
constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
&totalpages);
check_panic_on_oom(constraint, gfp_mask, order);
mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
read_lock(&tasklist_lock);
if (sysctl_oom_kill_allocating_task &&
@ -688,15 +693,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
}
retry:
p = select_bad_process(&points, totalpages, NULL,
constraint == CONSTRAINT_MEMORY_POLICY ? nodemask :
NULL);
p = select_bad_process(&points, totalpages, NULL, mpol_mask);
if (PTR_ERR(p) == -1UL)
goto out;
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
dump_header(NULL, gfp_mask, order, NULL);
dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
read_unlock(&tasklist_lock);
panic("Out of memory and no killable processes...\n");
}

View file

@ -589,13 +589,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
{
int migratetype = 0;
int batch_free = 0;
int to_free = count;
spin_lock(&zone->lock);
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
__mod_zone_page_state(zone, NR_FREE_PAGES, count);
while (count) {
while (to_free) {
struct page *page;
struct list_head *list;
@ -620,8 +620,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
__free_one_page(page, zone, 0, page_private(page));
trace_mm_page_pcpu_drain(page, 0, page_private(page));
} while (--count && --batch_free && !list_empty(list));
} while (--to_free && --batch_free && !list_empty(list));
}
__mod_zone_page_state(zone, NR_FREE_PAGES, count);
spin_unlock(&zone->lock);
}
@ -632,8 +633,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
__free_one_page(page, zone, order, migratetype);
__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
spin_unlock(&zone->lock);
}
@ -1462,7 +1463,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
{
/* free_pages my go negative - that's OK */
long min = mark;
long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
int o;
if (alloc_flags & ALLOC_HIGH)
@ -1847,6 +1848,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
struct page *page = NULL;
struct reclaim_state reclaim_state;
struct task_struct *p = current;
bool drained = false;
cond_resched();
@ -1865,14 +1867,25 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
cond_resched();
if (order != 0)
drain_all_pages();
if (unlikely(!(*did_some_progress)))
return NULL;
if (likely(*did_some_progress))
page = get_page_from_freelist(gfp_mask, nodemask, order,
retry:
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx,
alloc_flags, preferred_zone,
migratetype);
/*
* If an allocation failed after direct reclaim, it could be because
* pages are pinned on the per-cpu lists. Drain them and try again
*/
if (!page && !drained) {
drain_all_pages();
drained = true;
goto retry;
}
return page;
}
@ -2424,7 +2437,7 @@ void show_free_areas(void)
" all_unreclaimable? %s"
"\n",
zone->name,
K(zone_page_state(zone, NR_FREE_PAGES)),
K(zone_nr_free_pages(zone)),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),

View file

@ -393,7 +393,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
goto out_unlock;
old_size = chunk->map_alloc * sizeof(chunk->map[0]);
memcpy(new, chunk->map, old_size);
old = chunk->map;
memcpy(new, old, old_size);
chunk->map_alloc = new_alloc;
chunk->map = new;
@ -1162,7 +1164,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
}
/*
* Don't accept if wastage is over 25%. The
* Don't accept if wastage is over 1/3. The
* greater-than comparison ensures upa==1 always
* passes the following check.
*/
@ -1399,9 +1401,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
if (pcpu_first_unit_cpu == NR_CPUS)
pcpu_first_unit_cpu = cpu;
pcpu_last_unit_cpu = cpu;
}
}
pcpu_last_unit_cpu = cpu;
pcpu_nr_units = unit;
for_each_possible_cpu(cpu)

View file

@ -14,13 +14,13 @@ void __percpu *__alloc_percpu(size_t size, size_t align)
* percpu sections on SMP for which this path isn't used.
*/
WARN_ON_ONCE(align > SMP_CACHE_BYTES);
return kzalloc(size, GFP_KERNEL);
return (void __percpu __force *)kzalloc(size, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(__alloc_percpu);
void free_percpu(void __percpu *p)
{
kfree(p);
kfree(this_cpu_ptr(p));
}
EXPORT_SYMBOL_GPL(free_percpu);

View file

@ -381,7 +381,13 @@ vma_address(struct page *page, struct vm_area_struct *vma)
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
if (PageAnon(page)) {
if (vma->anon_vma->root != page_anon_vma(page)->root)
struct anon_vma *page__anon_vma = page_anon_vma(page);
/*
* Note: swapoff's unuse_vma() is more efficient with this
* check, and needs it to match anon_vma when KSM is active.
*/
if (!vma->anon_vma || !page__anon_vma ||
vma->anon_vma->root != page__anon_vma->root)
return -EFAULT;
} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
if (!vma->vm_file ||
@ -1564,13 +1570,14 @@ static void __hugepage_set_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address, int exclusive)
{
struct anon_vma *anon_vma = vma->anon_vma;
BUG_ON(!anon_vma);
if (!exclusive) {
struct anon_vma_chain *avc;
avc = list_entry(vma->anon_vma_chain.prev,
struct anon_vma_chain, same_vma);
anon_vma = avc->anon_vma;
}
if (PageAnon(page))
return;
if (!exclusive)
anon_vma = anon_vma->root;
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
page->index = linear_page_index(vma, address);
@ -1581,6 +1588,8 @@ void hugepage_add_anon_rmap(struct page *page,
{
struct anon_vma *anon_vma = vma->anon_vma;
int first;
BUG_ON(!PageLocked(page));
BUG_ON(!anon_vma);
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
first = atomic_inc_and_test(&page->_mapcount);

View file

@ -47,8 +47,6 @@ long nr_swap_pages;
long total_swap_pages;
static int least_priority;
static bool swap_for_hibernation;
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
@ -141,8 +139,7 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
if (nr_blocks) {
err = blkdev_issue_discard(si->bdev, start_block,
nr_blocks, GFP_KERNEL,
BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT);
if (err)
return err;
cond_resched();
@ -153,8 +150,7 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
err = blkdev_issue_discard(si->bdev, start_block,
nr_blocks, GFP_KERNEL,
BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT);
if (err)
break;
@ -193,8 +189,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
start_block <<= PAGE_SHIFT - 9;
nr_blocks <<= PAGE_SHIFT - 9;
if (blkdev_issue_discard(si->bdev, start_block,
nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT |
BLKDEV_IFL_BARRIER))
nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT))
break;
}
@ -320,10 +315,8 @@ checks:
if (offset > si->highest_bit)
scan_base = offset = si->lowest_bit;
/* reuse swap entry of cache-only swap if not hibernation. */
if (vm_swap_full()
&& usage == SWAP_HAS_CACHE
&& si->swap_map[offset] == SWAP_HAS_CACHE) {
/* reuse swap entry of cache-only swap if not busy. */
if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
int swap_was_freed;
spin_unlock(&swap_lock);
swap_was_freed = __try_to_reclaim_swap(si, offset);
@ -453,8 +446,6 @@ swp_entry_t get_swap_page(void)
spin_lock(&swap_lock);
if (nr_swap_pages <= 0)
goto noswap;
if (swap_for_hibernation)
goto noswap;
nr_swap_pages--;
for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
@ -487,6 +478,28 @@ noswap:
return (swp_entry_t) {0};
}
/* The only caller of this function is now susupend routine */
swp_entry_t get_swap_page_of_type(int type)
{
struct swap_info_struct *si;
pgoff_t offset;
spin_lock(&swap_lock);
si = swap_info[type];
if (si && (si->flags & SWP_WRITEOK)) {
nr_swap_pages--;
/* This is called for allocating swap entry, not cache */
offset = scan_swap_map(si, 1);
if (offset) {
spin_unlock(&swap_lock);
return swp_entry(type, offset);
}
nr_swap_pages++;
}
spin_unlock(&swap_lock);
return (swp_entry_t) {0};
}
static struct swap_info_struct *swap_info_get(swp_entry_t entry)
{
struct swap_info_struct *p;
@ -670,6 +683,24 @@ int try_to_free_swap(struct page *page)
if (page_swapcount(page))
return 0;
/*
* Once hibernation has begun to create its image of memory,
* there's a danger that one of the calls to try_to_free_swap()
* - most probably a call from __try_to_reclaim_swap() while
* hibernation is allocating its own swap pages for the image,
* but conceivably even a call from memory reclaim - will free
* the swap from a page which has already been recorded in the
* image as a clean swapcache page, and then reuse its swap for
* another page of the image. On waking from hibernation, the
* original page might be freed under memory pressure, then
* later read back in from swap, now with the wrong data.
*
* Hibernation clears bits from gfp_allowed_mask to prevent
* memory reclaim from writing to disk, so check that here.
*/
if (!(gfp_allowed_mask & __GFP_IO))
return 0;
delete_from_swap_cache(page);
SetPageDirty(page);
return 1;
@ -746,74 +777,6 @@ int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
#endif
#ifdef CONFIG_HIBERNATION
static pgoff_t hibernation_offset[MAX_SWAPFILES];
/*
* Once hibernation starts to use swap, we freeze swap_map[]. Otherwise,
* saved swap_map[] image to the disk will be an incomplete because it's
* changing without synchronization with hibernation snap shot.
* At resume, we just make swap_for_hibernation=false. We can forget
* used maps easily.
*/
void hibernation_freeze_swap(void)
{
int i;
spin_lock(&swap_lock);
printk(KERN_INFO "PM: Freeze Swap\n");
swap_for_hibernation = true;
for (i = 0; i < MAX_SWAPFILES; i++)
hibernation_offset[i] = 1;
spin_unlock(&swap_lock);
}
void hibernation_thaw_swap(void)
{
spin_lock(&swap_lock);
if (swap_for_hibernation) {
printk(KERN_INFO "PM: Thaw Swap\n");
swap_for_hibernation = false;
}
spin_unlock(&swap_lock);
}
/*
* Because updateing swap_map[] can make not-saved-status-change,
* we use our own easy allocator.
* Please see kernel/power/swap.c, Used swaps are recorded into
* RB-tree.
*/
swp_entry_t get_swap_for_hibernation(int type)
{
pgoff_t off;
swp_entry_t val = {0};
struct swap_info_struct *si;
spin_lock(&swap_lock);
si = swap_info[type];
if (!si || !(si->flags & SWP_WRITEOK))
goto done;
for (off = hibernation_offset[type]; off < si->max; ++off) {
if (!si->swap_map[off])
break;
}
if (off < si->max) {
val = swp_entry(type, off);
hibernation_offset[type] = off + 1;
}
done:
spin_unlock(&swap_lock);
return val;
}
void swap_free_for_hibernation(swp_entry_t ent)
{
/* Nothing to do */
}
/*
* Find the swap type that corresponds to given device (if any).
*
@ -2084,7 +2047,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
p->flags |= SWP_SOLIDSTATE;
p->cluster_next = 1 + (random32() % p->highest_bit);
}
if (discard_swap(p) == 0)
if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD))
p->flags |= SWP_DISCARDABLE;
}

View file

@ -1804,12 +1804,11 @@ static void shrink_zone(int priority, struct zone *zone,
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
static bool shrink_zones(int priority, struct zonelist *zonelist,
static void shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
bool all_unreclaimable = true;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
@ -1827,8 +1826,38 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
}
shrink_zone(priority, zone, sc);
all_unreclaimable = false;
}
}
static bool zone_reclaimable(struct zone *zone)
{
return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
}
/*
* As hibernation is going on, kswapd is freezed so that it can't mark
* the zone into all_unreclaimable. It can't handle OOM during hibernation.
* So let's check zone's unreclaimable in direct reclaim as well as kswapd.
*/
static bool all_unreclaimable(struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
bool all_unreclaimable = true;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
if (!populated_zone(zone))
continue;
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
if (zone_reclaimable(zone)) {
all_unreclaimable = false;
break;
}
}
return all_unreclaimable;
}
@ -1852,7 +1881,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
{
int priority;
bool all_unreclaimable;
unsigned long total_scanned = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct zoneref *z;
@ -1869,7 +1897,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
sc->nr_scanned = 0;
if (!priority)
disable_swap_token();
all_unreclaimable = shrink_zones(priority, zonelist, sc);
shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
@ -1931,7 +1959,7 @@ out:
return sc->nr_reclaimed;
/* top priority shrink_zones still had more to do? don't OOM, then */
if (scanning_global_lru(sc) && !all_unreclaimable)
if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
return 1;
return 0;
@ -2197,8 +2225,7 @@ loop_again:
total_scanned += sc.nr_scanned;
if (zone->all_unreclaimable)
continue;
if (nr_slab == 0 &&
zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6))
if (nr_slab == 0 && !zone_reclaimable(zone))
zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and

View file

@ -138,11 +138,24 @@ static void refresh_zone_stat_thresholds(void)
int threshold;
for_each_populated_zone(zone) {
unsigned long max_drift, tolerate_drift;
threshold = calculate_threshold(zone);
for_each_online_cpu(cpu)
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
= threshold;
/*
* Only set percpu_drift_mark if there is a danger that
* NR_FREE_PAGES reports the low watermark is ok when in fact
* the min watermark could be breached by an allocation
*/
tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
max_drift = num_online_cpus() * threshold;
if (max_drift > tolerate_drift)
zone->percpu_drift_mark = high_wmark_pages(zone) +
max_drift;
}
}
@ -813,7 +826,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n scanned %lu"
"\n spanned %lu"
"\n present %lu",
zone_page_state(zone, NR_FREE_PAGES),
zone_nr_free_pages(zone),
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
@ -998,6 +1011,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
refresh_zone_stat_thresholds();
start_cpu_timer(cpu);
node_set_state(cpu_to_node(cpu), N_CPU);
break;