Mirror of https://github.com/Fishwaldo/Star64_linux.git (synced 2025-03-17 20:54:10 +00:00)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:
 "Incoming:

   - a small number of updates to scripts/, ocfs2 and fs/buffer.c

   - most of MM

  I still have quite a lot of material (mostly not MM) staged after
  linux-next due to -next dependencies. I'll send those across next week
  as the prerequisites get merged up"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (135 commits)
  mm/page_io.c: annotate refault stalls from swap_readpage
  mm/Kconfig: fix trivial help text punctuation
  mm/Kconfig: fix indentation
  mm/memory_hotplug.c: remove __online_page_set_limits()
  mm: fix typos in comments when calling __SetPageUptodate()
  mm: fix struct member name in function comments
  mm/shmem.c: cast the type of unmap_start to u64
  mm: shmem: use proper gfp flags for shmem_writepage()
  mm/shmem.c: make array 'values' static const, makes object smaller
  userfaultfd: require CAP_SYS_PTRACE for UFFD_FEATURE_EVENT_FORK
  fs/userfaultfd.c: wp: clear VM_UFFD_MISSING or VM_UFFD_WP during userfaultfd_register()
  userfaultfd: wrap the common dst_vma check into an inlined function
  userfaultfd: remove unnecessary WARN_ON() in __mcopy_atomic_hugetlb()
  userfaultfd: use vma_pagesize for all huge page size calculation
  mm/madvise.c: use PAGE_ALIGN[ED] for range checking
  mm/madvise.c: replace with page_size() in madvise_inject_error()
  mm/mmap.c: make vma_merge() comment more easy to understand
  mm/hwpoison-inject: use DEFINE_DEBUGFS_ATTRIBUTE to define debugfs fops
  autonuma: reduce cache footprint when scanning page tables
  autonuma: fix watermark checking in migrate_balanced_pgdat()
  ...
commit 596cf45cbf
95 changed files with 2679 additions and 1524 deletions
@@ -1288,7 +1288,12 @@ PAGE_SIZE multiple when read back.
 	  inactive_anon, active_anon, inactive_file, active_file, unevictable
 		Amount of memory, swap-backed and filesystem-backed,
 		on the internal memory management lists used by the
-		page reclaim algorithm
+		page reclaim algorithm.
+
+		As these represent internal list state (eg. shmem pages are on anon
+		memory management lists), inactive_foo + active_foo may not be equal to
+		the value for the foo counter, since the foo counter is type-based, not
+		list-based.
 
 	  slab_reclaimable
 		Part of "slab" that might be reclaimed, such as
@@ -218,3 +218,66 @@ brk handler is used to print bug reports.
 A potential expansion of this mode is a hardware tag-based mode, which would
 use hardware memory tagging support instead of compiler instrumentation and
 manual shadow memory manipulation.
+
+What memory accesses are sanitised by KASAN?
+--------------------------------------------
+
+The kernel maps memory in a number of different parts of the address
+space. This poses something of a problem for KASAN, which requires
+that all addresses accessed by instrumented code have a valid shadow
+region.
+
+The range of kernel virtual addresses is large: there is not enough
+real memory to support a real shadow region for every address that
+could be accessed by the kernel.
+
+By default
+~~~~~~~~~~
+
+By default, architectures only map real memory over the shadow region
+for the linear mapping (and potentially other small areas). For all
+other areas - such as vmalloc and vmemmap space - a single read-only
+page is mapped over the shadow area. This read-only shadow page
+declares all memory accesses as permitted.
+
+This presents a problem for modules: they do not live in the linear
+mapping, but in a dedicated module space. By hooking in to the module
+allocator, KASAN can temporarily map real shadow memory to cover
+them. This allows detection of invalid accesses to module globals, for
+example.
+
+This also creates an incompatibility with ``VMAP_STACK``: if the stack
+lives in vmalloc space, it will be shadowed by the read-only page, and
+the kernel will fault when trying to set up the shadow data for stack
+variables.
+
+CONFIG_KASAN_VMALLOC
+~~~~~~~~~~~~~~~~~~~~
+
+With ``CONFIG_KASAN_VMALLOC``, KASAN can cover vmalloc space at the
+cost of greater memory usage. Currently this is only supported on x86.
+
+This works by hooking into vmalloc and vmap, and dynamically
+allocating real shadow memory to back the mappings.
+
+Most mappings in vmalloc space are small, requiring less than a full
+page of shadow space. Allocating a full shadow page per mapping would
+therefore be wasteful. Furthermore, to ensure that different mappings
+use different shadow pages, mappings would have to be aligned to
+``KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE``.
+
+Instead, we share backing space across multiple mappings. We allocate
+a backing page when a mapping in vmalloc space uses a particular page
+of the shadow region. This page can be shared by other vmalloc
+mappings later on.
+
+We hook in to the vmap infrastructure to lazily clean up unused shadow
+memory.
+
+To avoid the difficulties around swapping mappings around, we expect
+that the part of the shadow region that covers the vmalloc space will
+not be covered by the early shadow page, but will be left
+unmapped. This will require changes in arch-specific code.
+
+This allows ``VMAP_STACK`` support on x86, and can simplify support of
+architectures that do not have a fixed module region.
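For reference, the shadow translation that the documentation above keeps referring to is a fixed linear mapping: one shadow byte tracks the state of eight bytes of kernel memory. A rough sketch of the arithmetic follows (not part of this commit; the real helper is kasan_mem_to_shadow() in include/linux/kasan.h, the offset is chosen per architecture at build time, and the function name below is only illustrative):

/*
 * Sketch only: generic KASAN maps every 8 bytes of kernel memory to one
 * shadow byte at a fixed offset. shadow_offset is a placeholder for the
 * per-architecture KASAN_SHADOW_OFFSET.
 */
#define KASAN_SHADOW_SCALE_SHIFT	3

static inline void *mem_to_shadow_sketch(const void *addr,
					 unsigned long shadow_offset)
{
	return (void *)(((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT) +
			shadow_offset);
}

Every address an instrumented access touches must have a usable page behind the result of this calculation; CONFIG_KASAN_VMALLOC arranges for such backing pages to appear on demand for vmalloc addresses instead of pointing the whole range at the single read-only early shadow page.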
@@ -836,16 +836,17 @@ config HAVE_ARCH_VMAP_STACK
 config VMAP_STACK
 	default y
 	bool "Use a virtually-mapped stack"
-	depends on HAVE_ARCH_VMAP_STACK && !KASAN
+	depends on HAVE_ARCH_VMAP_STACK
+	depends on !KASAN || KASAN_VMALLOC
 	---help---
 	  Enable this if you want the use virtually-mapped kernel stacks
 	  with guard pages. This causes kernel stack overflows to be
 	  caught immediately rather than causing difficult-to-diagnose
 	  corruption.
 
-	  This is presently incompatible with KASAN because KASAN expects
-	  the stack to map directly to the KASAN shadow map using a formula
-	  that is incorrect if the stack is in vmalloc space.
+	  To use this with KASAN, the architecture must support backing
+	  virtual mappings with real shadow memory, and KASAN_VMALLOC must
+	  be enabled.
 
 config ARCH_OPTIONAL_KERNEL_RWX
 	def_bool n
@@ -33,7 +33,6 @@
 #define _ASM_ARC_PGTABLE_H
 
 #include <linux/bits.h>
-#define __ARCH_USE_5LEVEL_HACK
 #include <asm-generic/pgtable-nopmd.h>
 #include <asm/page.h>
 #include <asm/mmu.h>	/* to propagate CONFIG_ARC_MMU_VER <n> */
@@ -30,6 +30,7 @@ noinline static int handle_kernel_vaddr_fault(unsigned long address)
 	 * with the 'reference' page table.
 	 */
 	pgd_t *pgd, *pgd_k;
+	p4d_t *p4d, *p4d_k;
 	pud_t *pud, *pud_k;
 	pmd_t *pmd, *pmd_k;
 
@@ -39,8 +40,13 @@ noinline static int handle_kernel_vaddr_fault(unsigned long address)
 	if (!pgd_present(*pgd_k))
 		goto bad_area;
 
-	pud = pud_offset(pgd, address);
-	pud_k = pud_offset(pgd_k, address);
+	p4d = p4d_offset(pgd, address);
+	p4d_k = p4d_offset(pgd_k, address);
+	if (!p4d_present(*p4d_k))
+		goto bad_area;
+
+	pud = pud_offset(p4d, address);
+	pud_k = pud_offset(p4d_k, address);
 	if (!pud_present(*pud_k))
 		goto bad_area;
 
@@ -111,12 +111,14 @@ EXPORT_SYMBOL(__kunmap_atomic);
 static noinline pte_t * __init alloc_kmap_pgtable(unsigned long kvaddr)
 {
 	pgd_t *pgd_k;
+	p4d_t *p4d_k;
 	pud_t *pud_k;
 	pmd_t *pmd_k;
 	pte_t *pte_k;
 
 	pgd_k = pgd_offset_k(kvaddr);
-	pud_k = pud_offset(pgd_k, kvaddr);
+	p4d_k = p4d_offset(pgd_k, kvaddr);
+	pud_k = pud_offset(p4d_k, kvaddr);
 	pmd_k = pmd_offset(pud_k, kvaddr);
 
 	pte_k = (pte_t *)memblock_alloc_low(PAGE_SIZE, PAGE_SIZE);
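The ARC hunks above follow the same mechanical pattern as every other __ARCH_USE_5LEVEL_HACK removal in this series: a p4d level is threaded between pgd and pud in each page-table walk. As a rough, self-contained illustration of the full walk these call sites now perform (this helper is not part of the patch, it only restates the pattern using the generic page-table API):

/* Sketch only: descend all five levels, bailing out on a missing entry. */
static pte_t *walk_kernel_address(unsigned long addr)
{
	pgd_t *pgd = pgd_offset_k(addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	if (!pgd_present(*pgd))
		return NULL;
	p4d = p4d_offset(pgd, addr);
	if (!p4d_present(*p4d))
		return NULL;
	pud = pud_offset(p4d, addr);
	if (!pud_present(*pud))
		return NULL;
	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		return NULL;
	return pte_offset_kernel(pmd, addr);
}

On configurations where the upper levels are folded, the p4d_* (and pud_*) accessors reduce to trivial pass-throughs, so the conversion should not change behaviour on two- or three-level page tables.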
@@ -70,9 +70,6 @@ static inline int get_hugepd_cache_index(int index)
 	/* should not reach */
 }
 
-#else /* !CONFIG_HUGETLB_PAGE */
-static inline int pmd_huge(pmd_t pmd) { return 0; }
-static inline int pud_huge(pud_t pud) { return 0; }
 #endif /* CONFIG_HUGETLB_PAGE */
 
 #endif /* __ASSEMBLY__ */
@@ -59,9 +59,6 @@ static inline int get_hugepd_cache_index(int index)
 	BUG();
 }
 
-#else /* !CONFIG_HUGETLB_PAGE */
-static inline int pmd_huge(pmd_t pmd) { return 0; }
-static inline int pud_huge(pud_t pud) { return 0; }
 #endif /* CONFIG_HUGETLB_PAGE */
 
 static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr,
@@ -13,6 +13,7 @@
 #include <linux/memblock.h>
 #include <linux/of_fdt.h>
 #include <linux/mm.h>
+#include <linux/hugetlb.h>
 #include <linux/string_helpers.h>
 #include <linux/stop_machine.h>
 
@@ -134,6 +134,7 @@ config X86
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_JUMP_LABEL_RELATIVE
 	select HAVE_ARCH_KASAN			if X86_64
+	select HAVE_ARCH_KASAN_VMALLOC		if X86_64
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_MMAP_RND_BITS		if MMU
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS	if MMU && COMPAT
@@ -245,6 +245,49 @@ static void __init kasan_map_early_shadow(pgd_t *pgd)
 	} while (pgd++, addr = next, addr != end);
 }
 
+static void __init kasan_shallow_populate_p4ds(pgd_t *pgd,
+					       unsigned long addr,
+					       unsigned long end)
+{
+	p4d_t *p4d;
+	unsigned long next;
+	void *p;
+
+	p4d = p4d_offset(pgd, addr);
+	do {
+		next = p4d_addr_end(addr, end);
+
+		if (p4d_none(*p4d)) {
+			p = early_alloc(PAGE_SIZE, NUMA_NO_NODE, true);
+			p4d_populate(&init_mm, p4d, p);
+		}
+	} while (p4d++, addr = next, addr != end);
+}
+
+static void __init kasan_shallow_populate_pgds(void *start, void *end)
+{
+	unsigned long addr, next;
+	pgd_t *pgd;
+	void *p;
+
+	addr = (unsigned long)start;
+	pgd = pgd_offset_k(addr);
+	do {
+		next = pgd_addr_end(addr, (unsigned long)end);
+
+		if (pgd_none(*pgd)) {
+			p = early_alloc(PAGE_SIZE, NUMA_NO_NODE, true);
+			pgd_populate(&init_mm, pgd, p);
+		}
+
+		/*
+		 * we need to populate p4ds to be synced when running in
+		 * four level mode - see sync_global_pgds_l4()
+		 */
+		kasan_shallow_populate_p4ds(pgd, addr, next);
+	} while (pgd++, addr = next, addr != (unsigned long)end);
+}
+
 #ifdef CONFIG_KASAN_INLINE
 static int kasan_die_handler(struct notifier_block *self,
 			     unsigned long val,
@@ -354,6 +397,24 @@ void __init kasan_init(void)
 
 	kasan_populate_early_shadow(
 		kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
+		kasan_mem_to_shadow((void *)VMALLOC_START));
+
+	/*
+	 * If we're in full vmalloc mode, don't back vmalloc space with early
+	 * shadow pages. Instead, prepopulate pgds/p4ds so they are synced to
+	 * the global table and we can populate the lower levels on demand.
+	 */
+	if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
+		kasan_shallow_populate_pgds(
+			kasan_mem_to_shadow((void *)VMALLOC_START),
+			kasan_mem_to_shadow((void *)VMALLOC_END));
+	else
+		kasan_populate_early_shadow(
+			kasan_mem_to_shadow((void *)VMALLOC_START),
+			kasan_mem_to_shadow((void *)VMALLOC_END));
+
+	kasan_populate_early_shadow(
+		kasan_mem_to_shadow((void *)VMALLOC_END + 1),
 		shadow_cpu_entry_begin);
 
 	kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
@@ -19,15 +19,12 @@
 #include <linux/memory.h>
 #include <linux/memory_hotplug.h>
 #include <linux/mm.h>
-#include <linux/mutex.h>
 #include <linux/stat.h>
 #include <linux/slab.h>
 
 #include <linux/atomic.h>
 #include <linux/uaccess.h>
 
-static DEFINE_MUTEX(mem_sysfs_mutex);
-
 #define MEMORY_CLASS_NAME "memory"
 
 #define to_memory_block(dev) container_of(dev, struct memory_block, dev)
@@ -538,12 +535,7 @@ static ssize_t soft_offline_page_store(struct device *dev,
 	if (kstrtoull(buf, 0, &pfn) < 0)
 		return -EINVAL;
 	pfn >>= PAGE_SHIFT;
-	if (!pfn_valid(pfn))
-		return -ENXIO;
-	/* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
-	if (!pfn_to_online_page(pfn))
-		return -EIO;
-	ret = soft_offline_page(pfn_to_page(pfn), 0);
+	ret = soft_offline_page(pfn, 0);
 	return ret == 0 ? count : ret;
 }
 
@@ -705,6 +697,8 @@ static void unregister_memory(struct memory_block *memory)
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
+ *
+ * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size)
{
@@ -718,7 +712,6 @@ int create_memory_block_devices(unsigned long start, unsigned long size)
 			 !IS_ALIGNED(size, memory_block_size_bytes())))
 		return -EINVAL;
 
-	mutex_lock(&mem_sysfs_mutex);
 	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
 		ret = init_memory_block(&mem, block_id, MEM_OFFLINE);
 		if (ret)
@@ -730,11 +723,12 @@ int create_memory_block_devices(unsigned long start, unsigned long size)
 		for (block_id = start_block_id; block_id != end_block_id;
 		     block_id++) {
 			mem = find_memory_block_by_id(block_id);
+			if (WARN_ON_ONCE(!mem))
+				continue;
 			mem->section_count = 0;
 			unregister_memory(mem);
 		}
 	}
-	mutex_unlock(&mem_sysfs_mutex);
 	return ret;
 }
 
@@ -742,6 +736,8 @@ int create_memory_block_devices(unsigned long start, unsigned long size)
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
+ *
+ * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
@@ -754,7 +750,6 @@ void remove_memory_block_devices(unsigned long start, unsigned long size)
 			 !IS_ALIGNED(size, memory_block_size_bytes())))
 		return;
 
-	mutex_lock(&mem_sysfs_mutex);
 	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
 		mem = find_memory_block_by_id(block_id);
 		if (WARN_ON_ONCE(!mem))
@@ -763,7 +758,6 @@ void remove_memory_block_devices(unsigned long start, unsigned long size)
 		unregister_memory_block_under_nodes(mem);
 		unregister_memory(mem);
 	}
-	mutex_unlock(&mem_sysfs_mutex);
 }
 
 /* return true if the memory block is offlined, otherwise, return false */
@@ -797,12 +791,13 @@ static const struct attribute_group *memory_root_attr_groups[] = {
 };
 
 /*
- * Initialize the sysfs support for memory devices...
+ * Initialize the sysfs support for memory devices. At the time this function
+ * is called, we cannot have concurrent creation/deletion of memory block
+ * devices, the device_hotplug_lock is not needed.
 */
void __init memory_dev_init(void)
{
	int ret;
-	int err;
	unsigned long block_sz, nr;

	/* Validate the configured memory block size */
@@ -813,24 +808,19 @@ void __init memory_dev_init(void)
 
 	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
 	if (ret)
-		goto out;
+		panic("%s() failed to register subsystem: %d\n", __func__, ret);
 
 	/*
 	 * Create entries for memory sections that were found
 	 * during boot and have been initialized
 	 */
-	mutex_lock(&mem_sysfs_mutex);
 	for (nr = 0; nr <= __highest_present_section_nr;
 	     nr += sections_per_block) {
-		err = add_memory_block(nr);
-		if (!ret)
-			ret = err;
+		ret = add_memory_block(nr);
+		if (ret)
+			panic("%s() failed to add memory block: %d\n", __func__,
+			      ret);
 	}
-	mutex_unlock(&mem_sysfs_mutex);
-
-out:
-	if (ret)
-		panic("%s() failed: %d\n", __func__, ret);
 }
 
 /**
@@ -682,9 +682,7 @@ static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg)
 	__ClearPageOffline(pg);
 
 	/* This frame is currently backed; online the page. */
-	__online_page_set_limits(pg);
-	__online_page_increment_counters(pg);
-	__online_page_free(pg);
+	generic_online_page(pg, 0);
 
 	lockdep_assert_held(&dm_device.ha_lock);
 	dm_device.num_pages_onlined++;
@@ -374,7 +374,6 @@ static void xen_online_page(struct page *page, unsigned int order)
 	mutex_lock(&balloon_mutex);
 	for (i = 0; i < size; i++) {
 		p = pfn_to_page(start_pfn + i);
-		__online_page_set_limits(p);
 		balloon_append(p);
 	}
 	mutex_unlock(&balloon_mutex);
@@ -49,6 +49,8 @@
 #include <trace/events/block.h>
 #include <linux/fscrypt.h>
 
+#include "internal.h"
+
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
 			 enum rw_hint hint, struct writeback_control *wbc);
@@ -1423,10 +1425,10 @@ static bool has_bh_in_lru(int cpu, void *dummy)
 
 	for (i = 0; i < BH_LRU_SIZE; i++) {
 		if (b->bhs[i])
-			return 1;
+			return true;
 	}
 
-	return 0;
+	return false;
 }
 
 void invalidate_bh_lrus(void)
@@ -220,27 +220,6 @@ static inline struct page *dio_get_page(struct dio *dio,
 	return dio->pages[sdio->head];
 }
 
-/*
- * Warn about a page cache invalidation failure during a direct io write.
- */
-void dio_warn_stale_pagecache(struct file *filp)
-{
-	static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
-	char pathname[128];
-	struct inode *inode = file_inode(filp);
-	char *path;
-
-	errseq_set(&inode->i_mapping->wb_err, -EIO);
-	if (__ratelimit(&_rs)) {
-		path = file_path(filp, pathname, sizeof(pathname));
-		if (IS_ERR(path))
-			path = "(unknown)";
-		pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n");
-		pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
-			current->comm);
-	}
-}
-
 /*
  * dio_complete() - called when all DIO BIO I/O has been completed
  *
@@ -440,7 +440,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 		u32 hash;
 
 		index = page->index;
-		hash = hugetlb_fault_mutex_hash(h, mapping, index, 0);
+		hash = hugetlb_fault_mutex_hash(mapping, index);
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 		/*
@@ -644,7 +644,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 		addr = index * hpage_size;
 
 		/* mutex taken here, fault path and hole punch */
-		hash = hugetlb_fault_mutex_hash(h, mapping, index, addr);
+		hash = hugetlb_fault_mutex_hash(mapping, index);
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 		/* See if already present in mapping to avoid alloc/free */
@@ -815,8 +815,11 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 /*
  * File creation. Allocate an inode, and we're done..
  */
-static int hugetlbfs_mknod(struct inode *dir,
-			struct dentry *dentry, umode_t mode, dev_t dev)
+static int do_hugetlbfs_mknod(struct inode *dir,
+			struct dentry *dentry,
+			umode_t mode,
+			dev_t dev,
+			bool tmpfile)
 {
 	struct inode *inode;
 	int error = -ENOSPC;
@@ -824,13 +827,23 @@ static int hugetlbfs_mknod(struct inode *dir,
 	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
 	if (inode) {
 		dir->i_ctime = dir->i_mtime = current_time(dir);
-		d_instantiate(dentry, inode);
-		dget(dentry); /* Extra count - pin the dentry in core */
+		if (tmpfile) {
+			d_tmpfile(dentry, inode);
+		} else {
+			d_instantiate(dentry, inode);
+			dget(dentry);/* Extra count - pin the dentry in core */
+		}
 		error = 0;
 	}
 	return error;
 }
 
+static int hugetlbfs_mknod(struct inode *dir,
+			struct dentry *dentry, umode_t mode, dev_t dev)
+{
+	return do_hugetlbfs_mknod(dir, dentry, mode, dev, false);
+}
+
 static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
@@ -844,6 +857,12 @@ static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mo
 	return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
 }
 
+static int hugetlbfs_tmpfile(struct inode *dir,
+			struct dentry *dentry, umode_t mode)
+{
+	return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true);
+}
+
 static int hugetlbfs_symlink(struct inode *dir,
 			struct dentry *dentry, const char *symname)
 {
@@ -1102,6 +1121,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations = {
 	.mknod		= hugetlbfs_mknod,
 	.rename		= simple_rename,
 	.setattr	= hugetlbfs_setattr,
+	.tmpfile	= hugetlbfs_tmpfile,
 };
 
 static const struct inode_operations hugetlbfs_inode_operations = {
@@ -1461,28 +1481,41 @@ static int __init init_hugetlbfs_fs(void)
 					sizeof(struct hugetlbfs_inode_info),
 					0, SLAB_ACCOUNT, init_once);
 	if (hugetlbfs_inode_cachep == NULL)
-		goto out2;
+		goto out;
 
 	error = register_filesystem(&hugetlbfs_fs_type);
 	if (error)
-		goto out;
+		goto out_free;
 
+	/* default hstate mount is required */
+	mnt = mount_one_hugetlbfs(&hstates[default_hstate_idx]);
+	if (IS_ERR(mnt)) {
+		error = PTR_ERR(mnt);
+		goto out_unreg;
+	}
+	hugetlbfs_vfsmount[default_hstate_idx] = mnt;
+
+	/* other hstates are optional */
 	i = 0;
 	for_each_hstate(h) {
+		if (i == default_hstate_idx)
+			continue;
+
 		mnt = mount_one_hugetlbfs(h);
-		if (IS_ERR(mnt) && i == 0) {
-			error = PTR_ERR(mnt);
-			goto out;
-		}
-		hugetlbfs_vfsmount[i] = mnt;
+		if (IS_ERR(mnt))
+			hugetlbfs_vfsmount[i] = NULL;
+		else
+			hugetlbfs_vfsmount[i] = mnt;
 		i++;
 	}
 
 	return 0;
 
- out:
+ out_unreg:
+	(void)unregister_filesystem(&hugetlbfs_fs_type);
+ out_free:
 	kmem_cache_destroy(hugetlbfs_inode_cachep);
- out2:
+ out:
 	return error;
 }
 fs_initcall(init_hugetlbfs_fs)
@@ -327,8 +327,8 @@ int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh)
 	down_read(&OCFS2_I(inode)->ip_xattr_sem);
 	acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh);
 	up_read(&OCFS2_I(inode)->ip_xattr_sem);
-	if (IS_ERR(acl) || !acl)
-		return PTR_ERR(acl);
+	if (IS_ERR_OR_NULL(acl))
+		return PTR_ERR_OR_ZERO(acl);
 	ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
 	if (ret)
 		return ret;
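The ocfs2 change above is a readability cleanup: the open-coded "IS_ERR(acl) || !acl" / "return PTR_ERR(acl)" pair already returned 0 for a NULL acl (PTR_ERR(NULL) is 0), and the err.h helpers make that intent explicit. Roughly, the helpers behave like the sketch below (see include/linux/err.h for the real definitions; the _sketch names are only illustrative):

/* Sketch of the two helpers used above. */
static inline bool is_err_or_null_sketch(const void *ptr)
{
	return !ptr || IS_ERR(ptr);
}

static inline long ptr_err_or_zero_sketch(const void *ptr)
{
	return IS_ERR(ptr) ? PTR_ERR(ptr) : 0;
}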
@@ -1460,7 +1460,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		start = vma->vm_start;
 		vma_end = min(end, vma->vm_end);
 
-		new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
+		new_flags = (vma->vm_flags &
+			     ~(VM_UFFD_MISSING|VM_UFFD_WP)) | vm_flags;
 		prev = vma_merge(mm, prev, start, vma_end, new_flags,
 				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
 				 vma_policy(vma),
@@ -1834,13 +1835,12 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
 		goto out;
 	features = uffdio_api.features;
-	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) {
-		memset(&uffdio_api, 0, sizeof(uffdio_api));
-		if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
-			goto out;
-		ret = -EINVAL;
-		goto out;
-	}
+	ret = -EINVAL;
+	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
+		goto err_out;
+	ret = -EPERM;
+	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
+		goto err_out;
 	/* report all available features and ioctls to userland */
 	uffdio_api.features = UFFD_API_FEATURES;
 	uffdio_api.ioctls = UFFD_API_IOCTLS;
@@ -1853,6 +1853,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 	ret = 0;
 out:
 	return ret;
+err_out:
+	memset(&uffdio_api, 0, sizeof(uffdio_api));
+	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
+		ret = -EFAULT;
+	goto out;
 }
 
 static long userfaultfd_ioctl(struct file *file, unsigned cmd,
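From userspace, the handshake the userfaultfd hunks above tighten looks roughly like the sketch below (illustrative only, not taken from the patch): the caller opens a userfaultfd and negotiates features with UFFDIO_API. After this change, requesting UFFD_FEATURE_EVENT_FORK without CAP_SYS_PTRACE fails with EPERM, and any failed negotiation hands a zeroed uffdio_api back through the new err_out path.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = UFFD_FEATURE_EVENT_FORK,	/* now requires CAP_SYS_PTRACE */
	};
	int fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (fd < 0) {
		perror("userfaultfd");
		return 1;
	}
	if (ioctl(fd, UFFDIO_API, &api) < 0) {
		perror("UFFDIO_API");	/* unprivileged callers now see EPERM */
		return 1;
	}
	printf("granted features: 0x%llx\n", (unsigned long long)api.features);
	return 0;
}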
@@ -30,7 +30,6 @@
 #undef pud_free_tlb
 #define pud_free_tlb(tlb, x, addr) do { } while (0)
 #define pud_free(mm, x) do { } while (0)
-#define __pud_free_tlb(tlb, x, addr) do { } while (0)
 
 #undef pud_addr_end
 #define pud_addr_end(addr, end) (end)
@@ -51,7 +51,6 @@ static inline int p4d_present(p4d_t p4d)
 #undef p4d_free_tlb
 #define p4d_free_tlb(tlb, x, addr) do { } while (0)
 #define p4d_free(mm, x) do { } while (0)
-#define __p4d_free_tlb(tlb, x, addr) do { } while (0)
 
 #undef p4d_addr_end
 #define p4d_addr_end(addr, end) (end)
@@ -50,7 +50,7 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
 */
 #define p4d_alloc_one(mm, address) NULL
 #define p4d_free(mm, x) do { } while (0)
-#define __p4d_free_tlb(tlb, x, a) do { } while (0)
+#define p4d_free_tlb(tlb, x, a) do { } while (0)
 
 #undef p4d_addr_end
 #define p4d_addr_end(addr, end) (end)
@@ -60,7 +60,7 @@ static inline pmd_t * pmd_offset(pud_t * pud, unsigned long address)
 static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
 }
-#define __pmd_free_tlb(tlb, x, a) do { } while (0)
+#define pmd_free_tlb(tlb, x, a) do { } while (0)
 
 #undef pmd_addr_end
 #define pmd_addr_end(addr, end) (end)
@@ -59,7 +59,7 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
 */
 #define pud_alloc_one(mm, address) NULL
 #define pud_free(mm, x) do { } while (0)
-#define __pud_free_tlb(tlb, x, a) do { } while (0)
+#define pud_free_tlb(tlb, x, a) do { } while (0)
 
 #undef pud_addr_end
 #define pud_addr_end(addr, end) (end)
@@ -558,8 +558,19 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
  * Do the tests inline, but report and clear the bad entry in mm/memory.c.
  */
 void pgd_clear_bad(pgd_t *);
+
+#ifndef __PAGETABLE_P4D_FOLDED
 void p4d_clear_bad(p4d_t *);
+#else
+#define p4d_clear_bad(p4d)        do { } while (0)
+#endif
+
+#ifndef __PAGETABLE_PUD_FOLDED
 void pud_clear_bad(pud_t *);
+#else
+#define pud_clear_bad(p4d)        do { } while (0)
+#endif
+
 void pmd_clear_bad(pmd_t *);
 
 static inline int pgd_none_or_clear_bad(pgd_t *pgd)
@@ -903,6 +914,21 @@ static inline int pud_write(pud_t pud)
 }
 #endif /* pud_write */
 
+#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
+static inline int pmd_devmap(pmd_t pmd)
+{
+	return 0;
+}
+static inline int pud_devmap(pud_t pud)
+{
+	return 0;
+}
+static inline int pgd_devmap(pgd_t pgd)
+{
+	return 0;
+}
+#endif
+
 #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
 	(defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
 	 !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
@@ -912,6 +938,31 @@ static inline int pud_trans_huge(pud_t pud)
 }
 #endif
 
+/* See pmd_none_or_trans_huge_or_clear_bad for discussion. */
+static inline int pud_none_or_trans_huge_or_dev_or_clear_bad(pud_t *pud)
+{
+	pud_t pudval = READ_ONCE(*pud);
+
+	if (pud_none(pudval) || pud_trans_huge(pudval) || pud_devmap(pudval))
+		return 1;
+	if (unlikely(pud_bad(pudval))) {
+		pud_clear_bad(pud);
+		return 1;
+	}
+	return 0;
+}
+
+/* See pmd_trans_unstable for discussion. */
+static inline int pud_trans_unstable(pud_t *pud)
+{
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+	return pud_none_or_trans_huge_or_dev_or_clear_bad(pud);
+#else
+	return 0;
+#endif
+}
+
 #ifndef pmd_read_atomic
 static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
 {
@@ -584,7 +584,6 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
 	} while (0)
 #endif
 
-#ifndef __ARCH_HAS_4LEVEL_HACK
 #ifndef pud_free_tlb
 #define pud_free_tlb(tlb, pudp, address) \
 	do { \
@@ -594,9 +593,7 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
 		__pud_free_tlb(tlb, pudp, address); \
 	} while (0)
 #endif
-#endif
 
-#ifndef __ARCH_HAS_5LEVEL_HACK
 #ifndef p4d_free_tlb
 #define p4d_free_tlb(tlb, pudp, address) \
 	do { \
@@ -605,7 +602,6 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
 		__p4d_free_tlb(tlb, pudp, address); \
 	} while (0)
 #endif
-#endif
 
 #endif /* CONFIG_MMU */
 
@@ -3156,7 +3156,6 @@ enum {
 };
 
 void dio_end_io(struct bio *bio);
-void dio_warn_stale_pagecache(struct file *filp);
 
 ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 			     struct block_device *bdev, struct iov_iter *iter,
@@ -3201,6 +3200,11 @@ static inline void inode_dio_end(struct inode *inode)
 	wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
 }
 
+/*
+ * Warn about a page cache invalidation failure diring a direct I/O write.
+ */
+void dio_warn_stale_pagecache(struct file *filp);
+
 extern void inode_set_flags(struct inode *inode, unsigned int flags,
 			    unsigned int mask);
 
@@ -612,6 +612,8 @@ static inline bool pm_suspended_storage(void)
 /* The below functions must be run on a range from a single zone. */
 extern int alloc_contig_range(unsigned long start, unsigned long end,
 			      unsigned migratetype, gfp_t gfp_mask);
+extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
+				       int nid, nodemask_t *nodemask);
 #endif
 void free_contig_range(unsigned long pfn, unsigned int nr_pages);
 
@@ -105,8 +105,7 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
 void free_huge_page(struct page *page);
 void hugetlb_fix_reserve_counts(struct inode *inode);
 extern struct mutex *hugetlb_fault_mutex_table;
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
-				pgoff_t idx, unsigned long address);
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);
 
 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
 
@@ -164,38 +163,130 @@ static inline void adjust_range_if_pmd_sharing_possible(
 {
 }
 
-#define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n) ({ BUG(); 0; })
-#define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
-#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
+static inline long follow_hugetlb_page(struct mm_struct *mm,
+			struct vm_area_struct *vma, struct page **pages,
+			struct vm_area_struct **vmas, unsigned long *position,
+			unsigned long *nr_pages, long i, unsigned int flags,
+			int *nonblocking)
+{
+	BUG();
+	return 0;
+}
+
+static inline struct page *follow_huge_addr(struct mm_struct *mm,
+					unsigned long address, int write)
+{
+	return ERR_PTR(-EINVAL);
+}
+
+static inline int copy_hugetlb_page_range(struct mm_struct *dst,
+			struct mm_struct *src, struct vm_area_struct *vma)
+{
+	BUG();
+	return 0;
+}
+
 static inline void hugetlb_report_meminfo(struct seq_file *m)
 {
 }
-#define hugetlb_report_node_meminfo(n, buf) 0
+
+static inline int hugetlb_report_node_meminfo(int nid, char *buf)
+{
+	return 0;
+}
+
 static inline void hugetlb_show_meminfo(void)
 {
 }
-#define follow_huge_pd(vma, addr, hpd, flags, pdshift) NULL
-#define follow_huge_pmd(mm, addr, pmd, flags) NULL
-#define follow_huge_pud(mm, addr, pud, flags) NULL
-#define follow_huge_pgd(mm, addr, pgd, flags) NULL
-#define prepare_hugepage_range(file, addr, len) (-EINVAL)
-#define pmd_huge(x) 0
-#define pud_huge(x) 0
-#define is_hugepage_only_range(mm, addr, len) 0
-#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
-#define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
-				src_addr, pagep) ({ BUG(); 0; })
-#define huge_pte_offset(mm, address, sz) 0
+
+static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
+				unsigned long address, hugepd_t hpd, int flags,
+				int pdshift)
+{
+	return NULL;
+}
+
+static inline struct page *follow_huge_pmd(struct mm_struct *mm,
+				unsigned long address, pmd_t *pmd, int flags)
+{
+	return NULL;
+}
+
+static inline struct page *follow_huge_pud(struct mm_struct *mm,
+				unsigned long address, pud_t *pud, int flags)
+{
+	return NULL;
+}
+
+static inline struct page *follow_huge_pgd(struct mm_struct *mm,
+				unsigned long address, pgd_t *pgd, int flags)
+{
+	return NULL;
+}
+
+static inline int prepare_hugepage_range(struct file *file,
+				unsigned long addr, unsigned long len)
+{
+	return -EINVAL;
+}
+
+static inline int pmd_huge(pmd_t pmd)
+{
+	return 0;
+}
+
+static inline int pud_huge(pud_t pud)
+{
+	return 0;
+}
+
+static inline int is_hugepage_only_range(struct mm_struct *mm,
+				unsigned long addr, unsigned long len)
+{
+	return 0;
+}
+
+static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
+				unsigned long addr, unsigned long end,
+				unsigned long floor, unsigned long ceiling)
+{
+	BUG();
+}
+
+static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
+						pte_t *dst_pte,
+						struct vm_area_struct *dst_vma,
+						unsigned long dst_addr,
+						unsigned long src_addr,
+						struct page **pagep)
+{
+	BUG();
+	return 0;
+}
+
+static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
+					unsigned long sz)
+{
+	return NULL;
+}
 
 static inline bool isolate_huge_page(struct page *page, struct list_head *list)
 {
 	return false;
 }
-#define putback_active_hugepage(p) do {} while (0)
-#define move_hugetlb_state(old, new, reason) do {} while (0)
 
-static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
-		unsigned long address, unsigned long end, pgprot_t newprot)
+static inline void putback_active_hugepage(struct page *page)
+{
+}
+
+static inline void move_hugetlb_state(struct page *oldpage,
+					struct page *newpage, int reason)
+{
+}
+
+static inline unsigned long hugetlb_change_protection(
+			struct vm_area_struct *vma, unsigned long address,
+			unsigned long end, pgprot_t newprot)
 {
 	return 0;
 }
@@ -213,9 +304,10 @@ static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
 {
 	BUG();
 }
+
 static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
 			struct vm_area_struct *vma, unsigned long address,
 			unsigned int flags)
 {
 	BUG();
 	return 0;
@@ -70,8 +70,18 @@ struct kasan_cache {
 	int free_meta_offset;
 };
 
+/*
+ * These functions provide a special case to support backing module
+ * allocations with real shadow memory. With KASAN vmalloc, the special
+ * case is unnecessary, as the work is handled in the generic case.
+ */
+#ifndef CONFIG_KASAN_VMALLOC
 int kasan_module_alloc(void *addr, size_t size);
 void kasan_free_shadow(const struct vm_struct *vm);
+#else
+static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
+static inline void kasan_free_shadow(const struct vm_struct *vm) {}
+#endif
 
 int kasan_add_zero_shadow(void *start, unsigned long size);
 void kasan_remove_zero_shadow(void *start, unsigned long size);
@@ -194,4 +204,25 @@ static inline void *kasan_reset_tag(const void *addr)
 
 #endif /* CONFIG_KASAN_SW_TAGS */
 
+#ifdef CONFIG_KASAN_VMALLOC
+int kasan_populate_vmalloc(unsigned long requested_size,
+			   struct vm_struct *area);
+void kasan_poison_vmalloc(void *start, unsigned long size);
+void kasan_release_vmalloc(unsigned long start, unsigned long end,
+			   unsigned long free_region_start,
+			   unsigned long free_region_end);
+#else
+static inline int kasan_populate_vmalloc(unsigned long requested_size,
+					 struct vm_struct *area)
+{
+	return 0;
+}
+
+static inline void kasan_poison_vmalloc(void *start, unsigned long size) {}
+static inline void kasan_release_vmalloc(unsigned long start,
+					 unsigned long end,
+					 unsigned long free_region_start,
+					 unsigned long free_region_end) {}
+#endif
+
 #endif /* LINUX_KASAN_H */
@@ -358,6 +358,9 @@ static inline phys_addr_t memblock_phys_alloc(phys_addr_t size,
 				     MEMBLOCK_ALLOC_ACCESSIBLE);
 }
 
+void *memblock_alloc_exact_nid_raw(phys_addr_t size, phys_addr_t align,
+				   phys_addr_t min_addr, phys_addr_t max_addr,
+				   int nid);
 void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align,
 				 phys_addr_t min_addr, phys_addr_t max_addr,
 				 int nid);
@ -58,7 +58,6 @@ enum mem_cgroup_protection {
|
||||||
|
|
||||||
struct mem_cgroup_reclaim_cookie {
|
struct mem_cgroup_reclaim_cookie {
|
||||||
pg_data_t *pgdat;
|
pg_data_t *pgdat;
|
||||||
int priority;
|
|
||||||
unsigned int generation;
|
unsigned int generation;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -81,7 +80,6 @@ struct mem_cgroup_id {
|
||||||
enum mem_cgroup_events_target {
|
enum mem_cgroup_events_target {
|
||||||
MEM_CGROUP_TARGET_THRESH,
|
MEM_CGROUP_TARGET_THRESH,
|
||||||
MEM_CGROUP_TARGET_SOFTLIMIT,
|
MEM_CGROUP_TARGET_SOFTLIMIT,
|
||||||
MEM_CGROUP_TARGET_NUMAINFO,
|
|
||||||
MEM_CGROUP_NTARGETS,
|
MEM_CGROUP_NTARGETS,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -112,7 +110,7 @@ struct memcg_shrinker_map {
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* per-zone information in memory controller.
|
* per-node information in memory controller.
|
||||||
*/
|
*/
|
||||||
struct mem_cgroup_per_node {
|
struct mem_cgroup_per_node {
|
||||||
struct lruvec lruvec;
|
struct lruvec lruvec;
|
||||||
|
@ -126,7 +124,7 @@ struct mem_cgroup_per_node {
|
||||||
|
|
||||||
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
|
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
|
||||||
|
|
||||||
struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
|
struct mem_cgroup_reclaim_iter iter;
|
||||||
|
|
||||||
struct memcg_shrinker_map __rcu *shrinker_map;
|
struct memcg_shrinker_map __rcu *shrinker_map;
|
||||||
|
|
||||||
|
@ -134,9 +132,6 @@ struct mem_cgroup_per_node {
|
||||||
unsigned long usage_in_excess;/* Set to the value by which */
|
unsigned long usage_in_excess;/* Set to the value by which */
|
||||||
/* the soft limit is exceeded*/
|
/* the soft limit is exceeded*/
|
||||||
bool on_tree;
|
bool on_tree;
|
||||||
bool congested; /* memcg has many dirty pages */
|
|
||||||
/* backed by a congested BDI */
|
|
||||||
|
|
||||||
struct mem_cgroup *memcg; /* Back pointer, we cannot */
|
struct mem_cgroup *memcg; /* Back pointer, we cannot */
|
||||||
/* use container_of */
|
/* use container_of */
|
||||||
};
|
};
|
||||||
|
@ -313,13 +308,6 @@ struct mem_cgroup {
|
||||||
struct list_head kmem_caches;
|
struct list_head kmem_caches;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int last_scanned_node;
|
|
||||||
#if MAX_NUMNODES > 1
|
|
||||||
nodemask_t scan_nodes;
|
|
||||||
atomic_t numainfo_events;
|
|
||||||
atomic_t numainfo_updating;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef CONFIG_CGROUP_WRITEBACK
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
||||||
struct list_head cgwb_list;
|
struct list_head cgwb_list;
|
||||||
struct wb_domain cgwb_domain;
|
struct wb_domain cgwb_domain;
|
||||||
|
@ -394,25 +382,27 @@ mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* mem_cgroup_lruvec - get the lru list vector for a node or a memcg zone
|
* mem_cgroup_lruvec - get the lru list vector for a memcg & node
|
||||||
* @node: node of the wanted lruvec
|
|
||||||
* @memcg: memcg of the wanted lruvec
|
* @memcg: memcg of the wanted lruvec
|
||||||
*
|
*
|
||||||
* Returns the lru list vector holding pages for a given @node or a given
|
* Returns the lru list vector holding pages for a given @memcg &
|
||||||
* @memcg and @zone. This can be the node lruvec, if the memory controller
|
* @node combination. This can be the node lruvec, if the memory
|
||||||
* is disabled.
|
* controller is disabled.
|
||||||
*/
|
*/
|
||||||
static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
|
static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
|
||||||
struct mem_cgroup *memcg)
|
struct pglist_data *pgdat)
|
||||||
{
|
{
|
||||||
struct mem_cgroup_per_node *mz;
|
struct mem_cgroup_per_node *mz;
|
||||||
struct lruvec *lruvec;
|
struct lruvec *lruvec;
|
||||||
|
|
||||||
if (mem_cgroup_disabled()) {
|
if (mem_cgroup_disabled()) {
|
||||||
lruvec = node_lruvec(pgdat);
|
lruvec = &pgdat->__lruvec;
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!memcg)
|
||||||
|
memcg = root_mem_cgroup;
|
||||||
|
|
||||||
mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
|
mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
|
||||||
lruvec = &mz->lruvec;
|
lruvec = &mz->lruvec;
|
||||||
out:
|
out:
|
||||||
|
@ -728,7 +718,7 @@ static inline void __mod_lruvec_page_state(struct page *page,
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
lruvec = mem_cgroup_lruvec(pgdat, page->mem_cgroup);
|
lruvec = mem_cgroup_lruvec(page->mem_cgroup, pgdat);
|
||||||
__mod_lruvec_state(lruvec, idx, val);
|
__mod_lruvec_state(lruvec, idx, val);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -899,16 +889,21 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
|
static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
|
||||||
struct mem_cgroup *memcg)
|
struct pglist_data *pgdat)
|
||||||
{
|
{
|
||||||
return node_lruvec(pgdat);
|
return &pgdat->__lruvec;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
|
static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
|
||||||
struct pglist_data *pgdat)
|
struct pglist_data *pgdat)
|
||||||
{
|
{
|
||||||
return &pgdat->lruvec;
|
return &pgdat->__lruvec;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
|
||||||
|
{
|
||||||
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline bool mm_match_cgroup(struct mm_struct *mm,
|
static inline bool mm_match_cgroup(struct mm_struct *mm,
|
||||||
|
|
|
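
The swapped argument order above ripples through every caller of mem_cgroup_lruvec(). As a rough sketch only (a hypothetical caller, not part of this series; WORKINGSET_REFAULT is used merely as an example stat index), a reclaim-side user would now look something like this, relying on the NULL-memcg fallback shown in the hunk above:

/* Hypothetical caller - sketch only, using helpers visible in this header. */
static void example_account_refault(struct mem_cgroup *memcg,
                                    struct pglist_data *pgdat)
{
        /* Old call: mem_cgroup_lruvec(pgdat, memcg). */
        struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);

        /*
         * Passing memcg == NULL now resolves to root_mem_cgroup inside
         * the helper, so callers can drop their own NULL checks.
         */
        __mod_lruvec_state(lruvec, WORKINGSET_REFAULT, 1);
}
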
@@ -102,13 +102,10 @@ extern unsigned long __offline_isolated_pages(unsigned long start_pfn,

 typedef void (*online_page_callback_t)(struct page *page, unsigned int order);

+extern void generic_online_page(struct page *page, unsigned int order);
 extern int set_online_page_callback(online_page_callback_t callback);
 extern int restore_online_page_callback(online_page_callback_t callback);

-extern void __online_page_set_limits(struct page *page);
-extern void __online_page_increment_counters(struct page *page);
-extern void __online_page_free(struct page *page);
-
 extern int try_online_node(int nid);

 extern int arch_add_memory(int nid, u64 start, u64 size,
@@ -229,9 +226,6 @@ void put_online_mems(void);
 void mem_hotplug_begin(void);
 void mem_hotplug_done(void);

-extern void set_zone_contiguous(struct zone *zone);
-extern void clear_zone_contiguous(struct zone *zone);
-
 #else /* ! CONFIG_MEMORY_HOTPLUG */
 #define pfn_to_online_page(pfn) \
 ({ \
@@ -339,6 +333,9 @@ static inline int remove_memory(int nid, u64 start, u64 size)
 static inline void __remove_memory(int nid, u64 start, u64 size) {}
 #endif /* CONFIG_MEMORY_HOTREMOVE */

+extern void set_zone_contiguous(struct zone *zone);
+extern void clear_zone_contiguous(struct zone *zone);
+
 extern void __ref free_area_init_core_hotplug(int nid);
 extern int __add_memory(int nid, u64 start, u64 size);
 extern int add_memory(int nid, u64 start, u64 size);
@@ -564,21 +564,6 @@ int vma_is_stack_for_current(struct vm_area_struct *vma);
 struct mmu_gather;
 struct inode;

-#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
-static inline int pmd_devmap(pmd_t pmd)
-{
-    return 0;
-}
-static inline int pud_devmap(pud_t pud)
-{
-    return 0;
-}
-static inline int pgd_devmap(pgd_t pgd)
-{
-    return 0;
-}
-#endif
-
 /*
  * FIXME: take this include out, include page-flags.h in
  * files which need it (119 of them)
@@ -1643,19 +1628,27 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
     return (unsigned long)val;
 }

+void mm_trace_rss_stat(struct mm_struct *mm, int member, long count);
+
 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
 {
-    atomic_long_add(value, &mm->rss_stat.count[member]);
+    long count = atomic_long_add_return(value, &mm->rss_stat.count[member]);
+
+    mm_trace_rss_stat(mm, member, count);
 }

 static inline void inc_mm_counter(struct mm_struct *mm, int member)
 {
-    atomic_long_inc(&mm->rss_stat.count[member]);
+    long count = atomic_long_inc_return(&mm->rss_stat.count[member]);
+
+    mm_trace_rss_stat(mm, member, count);
 }

 static inline void dec_mm_counter(struct mm_struct *mm, int member)
 {
-    atomic_long_dec(&mm->rss_stat.count[member]);
+    long count = atomic_long_dec_return(&mm->rss_stat.count[member]);
+
+    mm_trace_rss_stat(mm, member, count);
 }

 /* Optimized variant when page is already known not to be PageAnon */
@@ -2214,9 +2207,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);

 extern void setup_per_cpu_pageset(void);

-extern void zone_pcp_update(struct zone *zone);
-extern void zone_pcp_reset(struct zone *zone);
-
 /* page_alloc.c */
 extern int min_free_kbytes;
 extern int watermark_boost_factor;
@@ -2780,7 +2770,7 @@ extern int sysctl_memory_failure_early_kill;
 extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p, int access);
 extern atomic_long_t num_poisoned_pages __read_mostly;
-extern int soft_offline_page(struct page *page, int flags);
+extern int soft_offline_page(unsigned long pfn, int flags);


 /*
@@ -273,12 +273,12 @@ enum lru_list {

 #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)

-static inline int is_file_lru(enum lru_list lru)
+static inline bool is_file_lru(enum lru_list lru)
 {
     return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE);
 }

-static inline int is_active_lru(enum lru_list lru)
+static inline bool is_active_lru(enum lru_list lru)
 {
     return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
 }
@@ -296,6 +296,12 @@ struct zone_reclaim_stat {
     unsigned long recent_scanned[2];
 };

+enum lruvec_flags {
+    LRUVEC_CONGESTED,       /* lruvec has many dirty pages
+                             * backed by a congested BDI
+                             */
+};
+
 struct lruvec {
     struct list_head lists[NR_LRU_LISTS];
     struct zone_reclaim_stat reclaim_stat;
@@ -303,12 +309,14 @@ struct lruvec {
     atomic_long_t inactive_age;
     /* Refaults at the time of last reclaim cycle */
     unsigned long refaults;
+    /* Various lruvec state flags (enum lruvec_flags) */
+    unsigned long flags;
 #ifdef CONFIG_MEMCG
     struct pglist_data *pgdat;
 #endif
 };

-/* Isolate unmapped file */
+/* Isolate unmapped pages */
 #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2)
 /* Isolate for asynchronous migration */
 #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4)
@@ -572,9 +580,6 @@ struct zone {
 } ____cacheline_internodealigned_in_smp;

 enum pgdat_flags {
-    PGDAT_CONGESTED,        /* pgdat has many dirty pages backed by
-                             * a congested BDI
-                             */
     PGDAT_DIRTY,            /* reclaim scanning has recently found
                              * many dirty file pages at the tail
                              * of the LRU.
@@ -777,7 +782,13 @@ typedef struct pglist_data {
 #endif

     /* Fields commonly accessed by the page reclaim scanner */
-    struct lruvec lruvec;
+
+    /*
+     * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
+     *
+     * Use mem_cgroup_lruvec() to look up lruvecs.
+     */
+    struct lruvec __lruvec;

     unsigned long flags;

@@ -800,11 +811,6 @@ typedef struct pglist_data {
 #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))

-static inline struct lruvec *node_lruvec(struct pglist_data *pgdat)
-{
-    return &pgdat->lruvec;
-}
-
 static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
 {
     return pgdat->node_start_pfn + pgdat->node_spanned_pages;
@@ -842,7 +848,7 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
 #ifdef CONFIG_MEMCG
     return lruvec->pgdat;
 #else
-    return container_of(lruvec, struct pglist_data, lruvec);
+    return container_of(lruvec, struct pglist_data, __lruvec);
 #endif
 }

@@ -1079,7 +1085,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
 /**
  * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
  * @zone - The current zone in the iterator
- * @z - The current pointer within zonelist->zones being iterated
+ * @z - The current pointer within zonelist->_zonerefs being iterated
  * @zlist - The zonelist being iterated
  * @highidx - The zone index of the highest zone to return
  * @nodemask - Nodemask allowed by the allocator
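
The new per-lruvec flags word replaces the pgdat-wide PGDAT_CONGESTED bit removed above, so cgroup reclaim can throttle just the congested memcg/node pair rather than the whole node. A minimal sketch of how the bit would be set and tested (illustrative only; the real users of this flag live in the reclaim code, which is not part of this excerpt):

/* Sketch only: mark and test congestion on a single lruvec. */
static void example_mark_congested(struct lruvec *lruvec)
{
        set_bit(LRUVEC_CONGESTED, &lruvec->flags);
}

static bool example_is_congested(struct lruvec *lruvec)
{
        return test_bit(LRUVEC_CONGESTED, &lruvec->flags);
}
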
@@ -91,7 +91,7 @@ void module_arch_cleanup(struct module *mod);
 /* Any cleanup before freeing mod->module_init */
 void module_arch_freeing_init(struct module *mod);

-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN) && !defined(CONFIG_KASAN_VMALLOC)
 #include <linux/kasan.h>
 #define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT)
 #else
@@ -30,7 +30,7 @@ static inline bool is_migrate_isolate(int migratetype)
 }
 #endif

-#define SKIP_HWPOISON  0x1
+#define MEMORY_OFFLINE 0x1
 #define REPORT_FAILURE 0x2

 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
@@ -58,7 +58,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
  * Test all pages in [start_pfn, end_pfn) are isolated or not.
  */
 int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
-                        bool skip_hwpoisoned_pages);
+                        int isol_flags);

 struct page *alloc_migrate_target(struct page *page, unsigned long private);

@@ -561,26 +561,6 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags)
     return __kmalloc(size, flags);
 }

-/*
- * Determine size used for the nth kmalloc cache.
- * return size or 0 if a kmalloc cache for that
- * size does not exist
- */
-static __always_inline unsigned int kmalloc_size(unsigned int n)
-{
-#ifndef CONFIG_SLOB
-    if (n > 2)
-        return 1U << n;
-
-    if (n == 1 && KMALLOC_MIN_SIZE <= 32)
-        return 96;
-
-    if (n == 2 && KMALLOC_MIN_SIZE <= 64)
-        return 192;
-#endif
-    return 0;
-}
-
 static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 {
 #ifndef CONFIG_SLOB
@@ -216,6 +216,8 @@ int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4);
 extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
                                        const void *from, size_t available);

+int ptr_to_hashval(const void *ptr, unsigned long *hashval_out);
+
 /**
  * strstarts - does @str start with @prefix?
  * @str: string to examine
@@ -307,7 +307,7 @@ struct vma_swap_readahead {
 };

 /* linux/mm/workingset.c */
-void *workingset_eviction(struct page *page);
+void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg);
 void workingset_refault(struct page *page, void *shadow);
 void workingset_activation(struct page *page);

@@ -22,6 +22,18 @@ struct notifier_block; /* in notifier.h */
 #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */
 #define VM_NO_GUARD      0x00000040 /* don't add guard page */
 #define VM_KASAN         0x00000080 /* has allocated kasan shadow memory */
+
+/*
+ * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC.
+ *
+ * If IS_ENABLED(CONFIG_KASAN_VMALLOC), VM_KASAN is set on a vm_struct after
+ * shadow memory has been mapped. It's used to handle allocation errors so that
+ * we don't try to poision shadow on free if it was never allocated.
+ *
+ * Otherwise, VM_KASAN is set for kasan_module_alloc() allocations and used to
+ * determine which allocations need the module shadow freed.
+ */
+
 /*
  * Memory with VM_FLUSH_RESET_PERMS cannot be freed in an interrupt or with
  * vfree_atomic().
@@ -316,6 +316,53 @@ TRACE_EVENT(mm_page_alloc_extfrag,
         __entry->change_ownership)
 );

+/*
+ * Required for uniquely and securely identifying mm in rss_stat tracepoint.
+ */
+#ifndef __PTR_TO_HASHVAL
+static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr)
+{
+    int ret;
+    unsigned long hashval;
+
+    ret = ptr_to_hashval(ptr, &hashval);
+    if (ret)
+        return 0;
+
+    /* The hashed value is only 32-bit */
+    return (unsigned int)hashval;
+}
+#define __PTR_TO_HASHVAL
+#endif
+
+TRACE_EVENT(rss_stat,
+
+    TP_PROTO(struct mm_struct *mm,
+         int member,
+         long count),
+
+    TP_ARGS(mm, member, count),
+
+    TP_STRUCT__entry(
+        __field(unsigned int, mm_id)
+        __field(unsigned int, curr)
+        __field(int, member)
+        __field(long, size)
+    ),
+
+    TP_fast_assign(
+        __entry->mm_id = mm_ptr_to_hash(mm);
+        __entry->curr = !!(current->mm == mm);
+        __entry->member = member;
+        __entry->size = (count << PAGE_SHIFT);
+    ),
+
+    TP_printk("mm_id=%u curr=%d member=%d size=%ldB",
+        __entry->mm_id,
+        __entry->curr,
+        __entry->member,
+        __entry->size)
+    );
 #endif /* _TRACE_KMEM_H */

 /* This part must be outside protection */
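
The mm.h hunk earlier in this diff declares mm_trace_rss_stat() and calls it from the counter helpers; the rss_stat event added above is what it feeds. The out-of-line definition is not part of this excerpt, so the following is only a sketch of the plumbing under that assumption, not the actual definition added by the series:

#include <linux/mm.h>
#include <trace/events/kmem.h>

/* Sketch only: forward the updated counter value to the rss_stat event. */
void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
{
        trace_rss_stat(mm, member, count);
}

trace_rss_stat() is the probe stub generated from TRACE_EVENT(rss_stat) above; since TP_fast_assign stores count << PAGE_SHIFT, the event reports bytes rather than pages.
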
@@ -1457,7 +1457,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
     /* Try to map as high as possible, this is only a hint. */
     area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
                                     PAGE_SIZE, 0, 0);
-    if (area->vaddr & ~PAGE_MASK) {
+    if (IS_ERR_VALUE(area->vaddr)) {
         ret = area->vaddr;
         goto fail;
     }
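
get_unmapped_area() reports failure by returning a negative errno cast to unsigned long. The old test above only rejected values that were not page aligned, which catches errnos by accident (no errno is a multiple of the page size); IS_ERR_VALUE() checks the intended condition directly. A standalone userspace illustration (not kernel code; the constants mirror common kernel values and are assumptions here):

#include <stdio.h>

#define PAGE_SIZE       4096UL
#define PAGE_MASK       (~(PAGE_SIZE - 1))
#define MAX_ERRNO       4095
#define IS_ERR_VALUE(x) ((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

int main(void)
{
        unsigned long addr = 0x7f0000000000UL;   /* page-aligned mapping */
        unsigned long err  = (unsigned long)-12; /* -ENOMEM from the kernel */

        /* Both checks reject the errno, but only one says what it means. */
        printf("addr: alignment-check=%d IS_ERR_VALUE=%d\n",
               (addr & ~PAGE_MASK) != 0, (int)IS_ERR_VALUE(addr));
        printf("err:  alignment-check=%d IS_ERR_VALUE=%d\n",
               (err & ~PAGE_MASK) != 0, (int)IS_ERR_VALUE(err));
        return 0;
}
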
@@ -93,6 +93,7 @@
 #include <linux/livepatch.h>
 #include <linux/thread_info.h>
 #include <linux/stackleak.h>
+#include <linux/kasan.h>

 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -223,6 +224,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
         if (!s)
             continue;

+        /* Clear the KASAN shadow of the stack. */
+        kasan_unpoison_shadow(s->addr, THREAD_SIZE);
+
         /* Clear stale pointers from reused stack. */
         memset(s->addr, 0, THREAD_SIZE);

@@ -1466,7 +1466,7 @@ static struct ctl_table vm_table[] = {
         .procname       = "drop_caches",
         .data           = &sysctl_drop_caches,
         .maxlen         = sizeof(int),
-        .mode           = 0644,
+        .mode           = 0200,
         .proc_handler   = drop_caches_sysctl_handler,
         .extra1         = SYSCTL_ONE,
         .extra2         = &four,
@@ -6,6 +6,9 @@ config HAVE_ARCH_KASAN
 config HAVE_ARCH_KASAN_SW_TAGS
     bool

+config HAVE_ARCH_KASAN_VMALLOC
+    bool
+
 config CC_HAS_KASAN_GENERIC
     def_bool $(cc-option, -fsanitize=kernel-address)

@@ -142,6 +145,19 @@ config KASAN_SW_TAGS_IDENTIFY
       (use-after-free or out-of-bounds) at the cost of increased
       memory consumption.

+config KASAN_VMALLOC
+    bool "Back mappings in vmalloc space with real shadow memory"
+    depends on KASAN && HAVE_ARCH_KASAN_VMALLOC
+    help
+      By default, the shadow region for vmalloc space is the read-only
+      zero page. This means that KASAN cannot detect errors involving
+      vmalloc space.
+
+      Enabling this option will hook in to vmap/vmalloc and back those
+      mappings with real shadow memory allocated on demand. This allows
+      for KASAN to detect more sorts of errors (and to support vmapped
+      stacks), but at the cost of higher memory usage.
+
 config TEST_KASAN
     tristate "Module for testing KASAN for bug detection"
     depends on m && KASAN
@@ -19,6 +19,7 @@
 #include <linux/string.h>
 #include <linux/uaccess.h>
 #include <linux/io.h>
+#include <linux/vmalloc.h>

 #include <asm/page.h>

@@ -748,6 +749,30 @@ static noinline void __init kmalloc_double_kzfree(void)
     kzfree(ptr);
 }

+#ifdef CONFIG_KASAN_VMALLOC
+static noinline void __init vmalloc_oob(void)
+{
+    void *area;
+
+    pr_info("vmalloc out-of-bounds\n");
+
+    /*
+     * We have to be careful not to hit the guard page.
+     * The MMU will catch that and crash us.
+     */
+    area = vmalloc(3000);
+    if (!area) {
+        pr_err("Allocation failed\n");
+        return;
+    }
+
+    ((volatile char *)area)[3100];
+    vfree(area);
+}
+#else
+static void __init vmalloc_oob(void) {}
+#endif
+
 static int __init kmalloc_tests_init(void)
 {
     /*
@@ -793,6 +818,7 @@ static int __init kmalloc_tests_init(void)
     kasan_strings();
     kasan_bitops();
     kmalloc_double_kzfree();
+    vmalloc_oob();

     kasan_restore_multi_shot(multishot);

@@ -761,23 +761,12 @@ static int __init initialize_ptr_random(void)
 early_initcall(initialize_ptr_random);

 /* Maps a pointer to a 32 bit unique identifier. */
-static char *ptr_to_id(char *buf, char *end, const void *ptr,
-                       struct printf_spec spec)
+static inline int __ptr_to_hashval(const void *ptr, unsigned long *hashval_out)
 {
-    const char *str = sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)";
     unsigned long hashval;

-    /* When debugging early boot use non-cryptographically secure hash. */
-    if (unlikely(debug_boot_weak_hash)) {
-        hashval = hash_long((unsigned long)ptr, 32);
-        return pointer_string(buf, end, (const void *)hashval, spec);
-    }
-
-    if (static_branch_unlikely(&not_filled_random_ptr_key)) {
-        spec.field_width = 2 * sizeof(ptr);
-        /* string length must be less than default_width */
-        return error_string(buf, end, str, spec);
-    }
+    if (static_branch_unlikely(&not_filled_random_ptr_key))
+        return -EAGAIN;

 #ifdef CONFIG_64BIT
     hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key);
@@ -789,6 +778,35 @@ static char *ptr_to_id(char *buf, char *end, const void *ptr,
 #else
     hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key);
 #endif
+    *hashval_out = hashval;
+    return 0;
+}
+
+int ptr_to_hashval(const void *ptr, unsigned long *hashval_out)
+{
+    return __ptr_to_hashval(ptr, hashval_out);
+}
+
+static char *ptr_to_id(char *buf, char *end, const void *ptr,
+                       struct printf_spec spec)
+{
+    const char *str = sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)";
+    unsigned long hashval;
+    int ret;
+
+    /* When debugging early boot use non-cryptographically secure hash. */
+    if (unlikely(debug_boot_weak_hash)) {
+        hashval = hash_long((unsigned long)ptr, 32);
+        return pointer_string(buf, end, (const void *)hashval, spec);
+    }
+
+    ret = __ptr_to_hashval(ptr, &hashval);
+    if (ret) {
+        spec.field_width = 2 * sizeof(ptr);
+        /* string length must be less than default_width */
+        return error_string(buf, end, str, spec);
+    }
+
     return pointer_string(buf, end, (const void *)hashval, spec);
 }

mm/Kconfig (40 changed lines)

@@ -29,7 +29,7 @@ config FLATMEM_MANUAL

       For systems that have holes in their physical address
       spaces and for features like NUMA and memory hotplug,
-      choose "Sparse Memory"
+      choose "Sparse Memory".

       If unsure, choose this option (Flat Memory) over any other.

@@ -122,9 +122,9 @@ config SPARSEMEM_VMEMMAP
     depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
     default y
     help
       SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
       pfn_to_page and page_to_pfn operations. This is the most
       efficient option when sufficient kernel resources are available.

 config HAVE_MEMBLOCK_NODE_MAP
     bool
@@ -160,9 +160,9 @@ config MEMORY_HOTPLUG_SPARSE
     depends on SPARSEMEM && MEMORY_HOTPLUG

 config MEMORY_HOTPLUG_DEFAULT_ONLINE
     bool "Online the newly added memory blocks by default"
     depends on MEMORY_HOTPLUG
     help
       This option sets the default policy setting for memory hotplug
       onlining policy (/sys/devices/system/memory/auto_online_blocks) which
       determines what happens to newly added memory regions. Policy setting
@@ -227,14 +227,14 @@ config COMPACTION
     select MIGRATION
     depends on MMU
     help
       Compaction is the only memory management component to form
       high order (larger physically contiguous) memory blocks
       reliably. The page allocator relies on compaction heavily and
       the lack of the feature can lead to unexpected OOM killer
       invocations for high order memory requests. You shouldn't
       disable this option unless there really is a strong reason for
       it and then we would be really interested to hear about that at
       linux-mm@kvack.org.

 #
 # support for page migration
@@ -258,7 +258,7 @@ config ARCH_ENABLE_THP_MIGRATION
     bool

 config CONTIG_ALLOC
     def_bool (MEMORY_ISOLATION && COMPACTION) || CMA

 config PHYS_ADDR_T_64BIT
     def_bool 64BIT
@@ -302,10 +302,10 @@ config KSM
       root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).

 config DEFAULT_MMAP_MIN_ADDR
     int "Low address space to protect from user allocation"
     depends on MMU
     default 4096
     help
       This is the portion of low virtual memory which should be protected
       from userspace allocation. Keeping a user from writing to low pages
       can help reduce the impact of kernel NULL pointer bugs.
@@ -408,7 +408,7 @@ choice
 endchoice

 config ARCH_WANTS_THP_SWAP
     def_bool n

 config THP_SWAP
     def_bool y
mm/cma.c (6 changed lines)

@@ -95,13 +95,11 @@ static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,

 static int __init cma_activate_area(struct cma *cma)
 {
-    int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long);
     unsigned long base_pfn = cma->base_pfn, pfn = base_pfn;
     unsigned i = cma->count >> pageblock_order;
     struct zone *zone;

-    cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
-
+    cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL);
     if (!cma->bitmap) {
         cma->count = 0;
         return -ENOMEM;
@@ -139,7 +137,7 @@ static int __init cma_activate_area(struct cma *cma)

 not_in_zone:
     pr_err("CMA area %s could not be activated\n", cma->name);
-    kfree(cma->bitmap);
+    bitmap_free(cma->bitmap);
     cma->count = 0;
     return -EINVAL;
 }
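
bitmap_zalloc() folds the BITS_TO_LONGS() sizing arithmetic into the allocator and pairs with bitmap_free(), which is the whole point of the two hunks above. A generic sketch of the pattern (hypothetical helpers, not from this file):

/* Sketch only: allocate and release a zeroed bitmap of 'nbits' bits. */
static int example_alloc_bitmap(unsigned int nbits, unsigned long **out)
{
        unsigned long *map = bitmap_zalloc(nbits, GFP_KERNEL);

        if (!map)
                return -ENOMEM;
        *out = map;
        return 0;
}

static void example_free_bitmap(unsigned long *map)
{
        bitmap_free(map);       /* NULL-safe, like kfree() */
}
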
@@ -29,7 +29,7 @@ static int cma_debugfs_get(void *data, u64 *val)

     return 0;
 }
-DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n");

 static int cma_used_get(void *data, u64 *val)
 {
@@ -44,7 +44,7 @@ static int cma_used_get(void *data, u64 *val)

     return 0;
 }
-DEFINE_SIMPLE_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n");

 static int cma_maxchunk_get(void *data, u64 *val)
 {
@@ -66,7 +66,7 @@ static int cma_maxchunk_get(void *data, u64 *val)

     return 0;
 }
-DEFINE_SIMPLE_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n");

 static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem)
 {
@@ -126,7 +126,7 @@ static int cma_free_write(void *data, u64 val)

     return cma_free_mem(cma, pages);
 }
-DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n");

 static int cma_alloc_mem(struct cma *cma, int count)
 {
@@ -158,7 +158,7 @@ static int cma_alloc_write(void *data, u64 val)

     return cma_alloc_mem(cma, pages);
 }
-DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");

 static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
 {
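
DEFINE_DEBUGFS_ATTRIBUTE() generates accessors that take the debugfs removal protection themselves (via debugfs_file_get()/debugfs_file_put()), so such files are normally registered with debugfs_create_file_unsafe() to skip the generic proxy wrapper. A sketch of the usual pairing (hypothetical attribute, not from this file; the matching registration side is outside this excerpt):

static u64 example_value;

static int example_get(void *data, u64 *val)
{
        *val = *(u64 *)data;
        return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(example_fops, example_get, NULL, "%llu\n");

static void example_register(struct dentry *parent)
{
        debugfs_create_file_unsafe("example", 0444, parent,
                                   &example_value, &example_fops);
}
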
mm/filemap.c (54 changed lines)

@@ -2329,27 +2329,6 @@ EXPORT_SYMBOL(generic_file_read_iter);

 #ifdef CONFIG_MMU
 #define MMAP_LOTSAMISS  (100)
-static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
-                                             struct file *fpin)
-{
-    int flags = vmf->flags;
-
-    if (fpin)
-        return fpin;
-
-    /*
-     * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
-     * anything, so we only pin the file and drop the mmap_sem if only
-     * FAULT_FLAG_ALLOW_RETRY is set.
-     */
-    if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
-        FAULT_FLAG_ALLOW_RETRY) {
-        fpin = get_file(vmf->vma->vm_file);
-        up_read(&vmf->vma->vm_mm->mmap_sem);
-    }
-    return fpin;
-}
-
 /*
  * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
  * @vmf - the vm_fault for this fault.
@@ -3161,6 +3140,27 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
 }
 EXPORT_SYMBOL(pagecache_write_end);

+/*
+ * Warn about a page cache invalidation failure during a direct I/O write.
+ */
+void dio_warn_stale_pagecache(struct file *filp)
+{
+    static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
+    char pathname[128];
+    struct inode *inode = file_inode(filp);
+    char *path;
+
+    errseq_set(&inode->i_mapping->wb_err, -EIO);
+    if (__ratelimit(&_rs)) {
+        path = file_path(filp, pathname, sizeof(pathname));
+        if (IS_ERR(path))
+            path = "(unknown)";
+        pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n");
+        pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
+            current->comm);
+    }
+}
+
 ssize_t
 generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
 {
@@ -3218,11 +3218,15 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
      * Most of the time we do not need this since dio_complete() will do
      * the invalidation for us. However there are some file systems that
      * do not end up with dio_complete() being called, so let's not break
-     * them by removing it completely
+     * them by removing it completely.
+     *
+     * Noticeable example is a blkdev_direct_IO().
+     *
+     * Skip invalidation for async writes or if mapping has no pages.
      */
-    if (mapping->nrpages)
-        invalidate_inode_pages2_range(mapping,
-                    pos >> PAGE_SHIFT, end);
+    if (written > 0 && mapping->nrpages &&
+        invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end))
+        dio_warn_stale_pagecache(file);

     if (written > 0) {
         pos += written;
mm/gup.c (40 changed lines)

@@ -734,11 +734,17 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
  *             Or NULL if the caller does not require them.
  * @nonblocking: whether waiting for disk IO or mmap_sem contention
  *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno. Each page returned must be released
- * with a put_page() call when it is finished with. vmas will only
- * remain valid while mmap_sem is held.
+ * Returns either number of pages pinned (which may be less than the
+ * number requested), or an error. Details about the return value:
+ *
+ * -- If nr_pages is 0, returns 0.
+ * -- If nr_pages is >0, but no pages were pinned, returns -errno.
+ * -- If nr_pages is >0, and some pages were pinned, returns the number of
+ *    pages pinned. Again, this may be less than nr_pages.
+ *
+ * The caller is responsible for releasing returned @pages, via put_page().
+ *
+ * @vmas are valid only as long as mmap_sem is held.
  *
  * Must be called with mmap_sem held. It may be released. See below.
  *
@@ -1107,11 +1113,17 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
  * subsequently whether VM_FAULT_RETRY functionality can be
  * utilised. Lock must initially be held.
  *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno. Each page returned must be released
- * with a put_page() call when it is finished with. vmas will only
- * remain valid while mmap_sem is held.
+ * Returns either number of pages pinned (which may be less than the
+ * number requested), or an error. Details about the return value:
+ *
+ * -- If nr_pages is 0, returns 0.
+ * -- If nr_pages is >0, but no pages were pinned, returns -errno.
+ * -- If nr_pages is >0, and some pages were pinned, returns the number of
+ *    pages pinned. Again, this may be less than nr_pages.
+ *
+ * The caller is responsible for releasing returned @pages, via put_page().
+ *
+ * @vmas are valid only as long as mmap_sem is held.
  *
  * Must be called with mmap_sem held for read or write.
  *
@@ -1443,6 +1455,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
     bool drain_allow = true;
     bool migrate_allow = true;
     LIST_HEAD(cma_page_list);
+    long ret = nr_pages;

 check_again:
     for (i = 0; i < nr_pages;) {
@@ -1504,17 +1517,18 @@ check_again:
          * again migrating any new CMA pages which we failed to isolate
          * earlier.
          */
-        nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages,
-                                           pages, vmas, NULL,
-                                           gup_flags);
+        ret = __get_user_pages_locked(tsk, mm, start, nr_pages,
+                                      pages, vmas, NULL,
+                                      gup_flags);

-        if ((nr_pages > 0) && migrate_allow) {
+        if ((ret > 0) && migrate_allow) {
+            nr_pages = ret;
             drain_allow = true;
             goto check_again;
         }
     }

-    return nr_pages;
+    return ret;
 }
 #else
 static long check_and_migrate_cma_pages(struct task_struct *tsk,
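
As a rough sketch of the contract spelled out in the rewritten comments above (a hypothetical caller using the public get_user_pages() API, written under the assumption that mmap_sem is already held for read):

/* Sketch only: handle both the error case and a short pin count. */
static long example_pin_and_release(unsigned long start, unsigned long nr_pages,
                                    struct page **pages)
{
        long got, i;

        got = get_user_pages(start, nr_pages, FOLL_WRITE, pages, NULL);
        if (got < 0)
                return got;             /* nothing was pinned */

        /* Only pages[0..got-1] are valid; got may be < nr_pages. */

        for (i = 0; i < got; i++)
                put_page(pages[i]);

        return got;
}
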
@@ -3003,7 +3003,7 @@ next:

     return 0;
 }
-DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
+DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
         "%llu\n");

 static int __init split_huge_pages_debugfs(void)
mm/hugetlb.c (288 changed lines)

@@ -244,16 +244,66 @@ struct file_region {
     long to;
 };

+/* Must be called with resv->lock held. Calling this with count_only == true
+ * will count the number of pages to be added but will not modify the linked
+ * list.
+ */
+static long add_reservation_in_range(struct resv_map *resv, long f, long t,
+                                     bool count_only)
+{
+    long chg = 0;
+    struct list_head *head = &resv->regions;
+    struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
+
+    /* Locate the region we are before or in. */
+    list_for_each_entry(rg, head, link)
+        if (f <= rg->to)
+            break;
+
+    /* Round our left edge to the current segment if it encloses us. */
+    if (f > rg->from)
+        f = rg->from;
+
+    chg = t - f;
+
+    /* Check for and consume any regions we now overlap with. */
+    nrg = rg;
+    list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+        if (&rg->link == head)
+            break;
+        if (rg->from > t)
+            break;
+
+        /* We overlap with this area, if it extends further than
+         * us then we must extend ourselves. Account for its
+         * existing reservation.
+         */
+        if (rg->to > t) {
+            chg += rg->to - t;
+            t = rg->to;
+        }
+        chg -= rg->to - rg->from;
+
+        if (!count_only && rg != nrg) {
+            list_del(&rg->link);
+            kfree(rg);
+        }
+    }
+
+    if (!count_only) {
+        nrg->from = f;
+        nrg->to = t;
+    }
+
+    return chg;
+}
+
 /*
  * Add the huge page range represented by [f, t) to the reserve
- * map. In the normal case, existing regions will be expanded
- * to accommodate the specified range. Sufficient regions should
- * exist for expansion due to the previous call to region_chg
- * with the same range. However, it is possible that region_del
- * could have been called after region_chg and modifed the map
- * in such a way that no region exists to be expanded. In this
- * case, pull a region descriptor from the cache associated with
- * the map and use that for the new range.
+ * map. Existing regions will be expanded to accommodate the specified
+ * range, or a region will be taken from the cache. Sufficient regions
+ * must exist in the cache due to the previous call to region_chg with
+ * the same range.
  *
  * Return the number of new huge pages added to the map. This
  * number is greater than or equal to zero.
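
To make the count_only arithmetic above concrete: with one existing reservation [2, 5) and a request for [0, 10), the helper reports 10 - (5 - 2) = 7 pages that still need reserving. A standalone arithmetic check (userspace C, not kernel code, simplified to a single pre-existing region):

#include <stdio.h>

struct region { long from, to; };

/* Mirrors the count_only path for one overlapping region. */
static long count_needed(const struct region *rg, long f, long t)
{
        long chg;

        /* Round our left edge to the region if it encloses f. */
        if (f > rg->from)
                f = rg->from;
        chg = t - f;

        /* Extend past the region if it reaches further, then subtract
         * the pages it already reserves. */
        if (rg->to > t) {
                chg += rg->to - t;
                t = rg->to;
        }
        chg -= rg->to - rg->from;

        return chg;
}

int main(void)
{
        struct region existing = { 2, 5 };

        printf("pages to add: %ld\n", count_needed(&existing, 0, 10)); /* 7 */
        return 0;
}
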
@ -261,7 +311,7 @@ struct file_region {
|
||||||
static long region_add(struct resv_map *resv, long f, long t)
|
static long region_add(struct resv_map *resv, long f, long t)
|
||||||
{
|
{
|
||||||
struct list_head *head = &resv->regions;
|
struct list_head *head = &resv->regions;
|
||||||
struct file_region *rg, *nrg, *trg;
|
struct file_region *rg, *nrg;
|
||||||
long add = 0;
|
long add = 0;
|
||||||
|
|
||||||
spin_lock(&resv->lock);
|
spin_lock(&resv->lock);
|
||||||
|
@ -272,9 +322,8 @@ static long region_add(struct resv_map *resv, long f, long t)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If no region exists which can be expanded to include the
|
* If no region exists which can be expanded to include the
|
||||||
* specified range, the list must have been modified by an
|
* specified range, pull a region descriptor from the cache
|
||||||
* interleving call to region_del(). Pull a region descriptor
|
* and use it for this range.
|
||||||
* from the cache and use it for this range.
|
|
||||||
*/
|
*/
|
||||||
if (&rg->link == head || t < rg->from) {
|
if (&rg->link == head || t < rg->from) {
|
||||||
VM_BUG_ON(resv->region_cache_count <= 0);
|
VM_BUG_ON(resv->region_cache_count <= 0);
|
||||||
|
@ -292,38 +341,7 @@ static long region_add(struct resv_map *resv, long f, long t)
|
||||||
goto out_locked;
|
goto out_locked;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Round our left edge to the current segment if it encloses us. */
|
add = add_reservation_in_range(resv, f, t, false);
|
||||||
if (f > rg->from)
|
|
||||||
f = rg->from;
|
|
||||||
|
|
||||||
/* Check for and consume any regions we now overlap with. */
|
|
||||||
nrg = rg;
|
|
||||||
list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
|
|
||||||
if (&rg->link == head)
|
|
||||||
break;
|
|
||||||
if (rg->from > t)
|
|
||||||
break;
|
|
||||||
|
|
||||||
/* If this area reaches higher then extend our area to
|
|
||||||
* include it completely. If this is not the first area
|
|
||||||
* which we intend to reuse, free it. */
|
|
||||||
if (rg->to > t)
|
|
||||||
t = rg->to;
|
|
||||||
if (rg != nrg) {
|
|
||||||
/* Decrement return value by the deleted range.
|
|
||||||
* Another range will span this area so that by
|
|
||||||
* end of routine add will be >= zero
|
|
||||||
*/
|
|
||||||
add -= (rg->to - rg->from);
|
|
||||||
list_del(&rg->link);
|
|
||||||
kfree(rg);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
add += (nrg->from - f); /* Added to beginning of region */
|
|
||||||
nrg->from = f;
|
|
||||||
add += t - nrg->to; /* Added to end of region */
|
|
||||||
nrg->to = t;
|
|
||||||
|
|
||||||
out_locked:
|
out_locked:
|
||||||
resv->adds_in_progress--;
|
resv->adds_in_progress--;
|
||||||
|
@ -339,15 +357,9 @@ out_locked:
|
||||||
* call to region_add that will actually modify the reserve
|
* call to region_add that will actually modify the reserve
|
||||||
* map to add the specified range [f, t). region_chg does
|
* map to add the specified range [f, t). region_chg does
|
||||||
* not change the number of huge pages represented by the
|
* not change the number of huge pages represented by the
|
||||||
* map. However, if the existing regions in the map can not
|
* map. A new file_region structure is added to the cache
|
||||||
* be expanded to represent the new range, a new file_region
|
* as a placeholder, so that the subsequent region_add
|
||||||
* structure is added to the map as a placeholder. This is
|
* call will have all the regions it needs and will not fail.
|
||||||
* so that the subsequent region_add call will have all the
|
|
||||||
* regions it needs and will not fail.
|
|
||||||
*
|
|
||||||
* Upon entry, region_chg will also examine the cache of region descriptors
|
|
||||||
* associated with the map. If there are not enough descriptors cached, one
|
|
||||||
* will be allocated for the in progress add operation.
|
|
||||||
*
|
*
|
||||||
* Returns the number of huge pages that need to be added to the existing
|
* Returns the number of huge pages that need to be added to the existing
|
||||||
* reservation map for the range [f, t). This number is greater or equal to
|
* reservation map for the range [f, t). This number is greater or equal to
|
||||||
|
@ -356,11 +368,8 @@ out_locked:
|
||||||
*/
|
*/
|
||||||
static long region_chg(struct resv_map *resv, long f, long t)
|
static long region_chg(struct resv_map *resv, long f, long t)
|
||||||
{
|
{
|
||||||
struct list_head *head = &resv->regions;
|
|
||||||
struct file_region *rg, *nrg = NULL;
|
|
||||||
long chg = 0;
|
long chg = 0;
|
||||||
|
|
||||||
retry:
|
|
||||||
spin_lock(&resv->lock);
|
spin_lock(&resv->lock);
|
||||||
retry_locked:
|
retry_locked:
|
||||||
resv->adds_in_progress++;
|
resv->adds_in_progress++;
|
||||||
|
@ -378,10 +387,8 @@ retry_locked:
|
||||||
spin_unlock(&resv->lock);
|
spin_unlock(&resv->lock);
|
||||||
|
|
||||||
trg = kmalloc(sizeof(*trg), GFP_KERNEL);
|
trg = kmalloc(sizeof(*trg), GFP_KERNEL);
|
||||||
if (!trg) {
|
if (!trg)
|
||||||
kfree(nrg);
|
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
}
|
|
||||||
|
|
||||||
spin_lock(&resv->lock);
|
spin_lock(&resv->lock);
|
||||||
list_add(&trg->link, &resv->region_cache);
|
list_add(&trg->link, &resv->region_cache);
|
||||||
|
@ -389,61 +396,8 @@ retry_locked:
|
||||||
goto retry_locked;
|
goto retry_locked;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Locate the region we are before or in. */
|
chg = add_reservation_in_range(resv, f, t, true);
|
||||||
list_for_each_entry(rg, head, link)
|
|
||||||
if (f <= rg->to)
|
|
||||||
break;
|
|
||||||
|
|
||||||
/* If we are below the current region then a new region is required.
|
|
||||||
* Subtle, allocate a new region at the position but make it zero
|
|
||||||
* size such that we can guarantee to record the reservation. */
|
|
||||||
if (&rg->link == head || t < rg->from) {
|
|
||||||
if (!nrg) {
|
|
||||||
resv->adds_in_progress--;
|
|
||||||
spin_unlock(&resv->lock);
|
|
||||||
nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
|
|
||||||
if (!nrg)
|
|
||||||
return -ENOMEM;
|
|
||||||
|
|
||||||
nrg->from = f;
|
|
||||||
nrg->to = f;
|
|
||||||
INIT_LIST_HEAD(&nrg->link);
|
|
||||||
goto retry;
|
|
||||||
}
|
|
||||||
|
|
||||||
list_add(&nrg->link, rg->link.prev);
|
|
||||||
chg = t - f;
|
|
||||||
goto out_nrg;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Round our left edge to the current segment if it encloses us. */
|
|
||||||
if (f > rg->from)
|
|
||||||
f = rg->from;
|
|
||||||
chg = t - f;
|
|
||||||
|
|
||||||
/* Check for and consume any regions we now overlap with. */
|
|
||||||
list_for_each_entry(rg, rg->link.prev, link) {
|
|
||||||
if (&rg->link == head)
|
|
||||||
break;
|
|
||||||
if (rg->from > t)
|
|
||||||
goto out;
|
|
||||||
|
|
||||||
/* We overlap with this area, if it extends further than
|
|
||||||
* us then we must extend ourselves. Account for its
|
|
||||||
* existing reservation. */
|
|
||||||
if (rg->to > t) {
|
|
||||||
chg += rg->to - t;
|
|
||||||
t = rg->to;
|
|
||||||
}
|
|
||||||
chg -= rg->to - rg->from;
|
|
||||||
}
|
|
||||||
|
|
||||||
out:
|
|
||||||
spin_unlock(&resv->lock);
|
|
||||||
/* We already know we raced and no longer need the new region */
|
|
||||||
kfree(nrg);
|
|
||||||
return chg;
|
|
||||||
out_nrg:
|
|
||||||
spin_unlock(&resv->lock);
|
spin_unlock(&resv->lock);
|
||||||
return chg;
|
return chg;
|
||||||
}
|
}
|
||||||
|
@@ -1069,85 +1023,12 @@ static void free_gigantic_page(struct page *page, unsigned int order)
 }
 
 #ifdef CONFIG_CONTIG_ALLOC
-static int __alloc_gigantic_page(unsigned long start_pfn,
-unsigned long nr_pages, gfp_t gfp_mask)
-{
-unsigned long end_pfn = start_pfn + nr_pages;
-return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
-gfp_mask);
-}
-
-static bool pfn_range_valid_gigantic(struct zone *z,
-unsigned long start_pfn, unsigned long nr_pages)
-{
-unsigned long i, end_pfn = start_pfn + nr_pages;
-struct page *page;
-
-for (i = start_pfn; i < end_pfn; i++) {
-page = pfn_to_online_page(i);
-if (!page)
-return false;
-
-if (page_zone(page) != z)
-return false;
-
-if (PageReserved(page))
-return false;
-
-if (page_count(page) > 0)
-return false;
-
-if (PageHuge(page))
-return false;
-}
-
-return true;
-}
-
-static bool zone_spans_last_pfn(const struct zone *zone,
-unsigned long start_pfn, unsigned long nr_pages)
-{
-unsigned long last_pfn = start_pfn + nr_pages - 1;
-return zone_spans_pfn(zone, last_pfn);
-}
-
 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
 int nid, nodemask_t *nodemask)
 {
-unsigned int order = huge_page_order(h);
-unsigned long nr_pages = 1 << order;
-unsigned long ret, pfn, flags;
-struct zonelist *zonelist;
-struct zone *zone;
-struct zoneref *z;
+unsigned long nr_pages = 1UL << huge_page_order(h);
 
-zonelist = node_zonelist(nid, gfp_mask);
-for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
-spin_lock_irqsave(&zone->lock, flags);
-
-pfn = ALIGN(zone->zone_start_pfn, nr_pages);
-while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
-if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
-/*
-* We release the zone lock here because
-* alloc_contig_range() will also lock the zone
-* at some point. If there's an allocation
-* spinning on this lock, it may win the race
-* and cause alloc_contig_range() to fail...
-*/
-spin_unlock_irqrestore(&zone->lock, flags);
-ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
-if (!ret)
-return pfn_to_page(pfn);
-spin_lock_irqsave(&zone->lock, flags);
-}
-pfn += nr_pages;
-}
-
-spin_unlock_irqrestore(&zone->lock, flags);
-}
-
-return NULL;
+return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
@@ -3915,7 +3796,7 @@ retry:
 * handling userfault. Reacquire after handling
 * fault to make calling code simpler.
 */
-hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
+hash = hugetlb_fault_mutex_hash(mapping, idx);
 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 ret = handle_userfault(&vmf, VM_UFFD_MISSING);
 mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -4042,8 +3923,7 @@ backout_unlocked:
 }
 
 #ifdef CONFIG_SMP
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
-pgoff_t idx, unsigned long address)
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
 {
 unsigned long key[2];
 u32 hash;
@@ -4051,7 +3931,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
 key[0] = (unsigned long) mapping;
 key[1] = idx;
 
-hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
+hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
 
 return hash & (num_fault_mutexes - 1);
 }
@@ -4060,8 +3940,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
 * For uniprocesor systems we always use a single mutex, so just
 * return 0 and avoid the hashing overhead.
 */
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
-pgoff_t idx, unsigned long address)
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
 {
 return 0;
 }
@@ -4105,7 +3984,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 * get spurious allocation failures if two CPUs race to instantiate
 * the same page in the page cache.
 */
-hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
+hash = hugetlb_fault_mutex_hash(mapping, idx);
 mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 entry = huge_ptep_get(ptep);
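The hugetlb_fault_mutex_hash() hunks above drop the unused hstate and address arguments, so the mutex is selected purely from the (mapping, idx) pair, and the final mask relies on num_fault_mutexes being a power of two. Below is a stand-alone sketch of that selection step; mix32() is only a stand-in for the kernel's jhash2(), and NUM_FAULT_MUTEXES is an arbitrary example value, neither taken from the kernel source.

/*
 * Stand-alone sketch: map a (mapping, idx) key onto one of a
 * power-of-two pool of fault mutexes, as in the simplified
 * hugetlb_fault_mutex_hash() above.
 */
#include <stdint.h>
#include <stdio.h>

#define NUM_FAULT_MUTEXES 64	/* must be a power of two for the mask trick */

static uint32_t mix32(const uint32_t *key, size_t words)
{
	uint32_t h = 0;

	while (words--) {
		h ^= *key++;
		h *= 0x9e3779b1u;	/* simple multiplicative mix, NOT jhash2 */
	}
	return h;
}

static uint32_t fault_mutex_hash(const void *mapping, unsigned long idx)
{
	unsigned long key[2] = { (unsigned long)mapping, idx };
	uint32_t hash = mix32((const uint32_t *)key,
			      sizeof(key) / sizeof(uint32_t));

	/* power-of-two table size lets us mask instead of taking a modulo */
	return hash & (NUM_FAULT_MUTEXES - 1);
}

int main(void)
{
	int dummy_mapping;

	for (unsigned long idx = 0; idx < 4; idx++)
		printf("idx %lu -> mutex %u\n", idx,
		       fault_mutex_hash(&dummy_mapping, idx));
	return 0;
}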
@@ -4459,6 +4338,21 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 break;
 }
 }
+
+/*
+* If subpage information not requested, update counters
+* and skip the same_page loop below.
+*/
+if (!pages && !vmas && !pfn_offset &&
+(vaddr + huge_page_size(h) < vma->vm_end) &&
+(remainder >= pages_per_huge_page(h))) {
+vaddr += huge_page_size(h);
+remainder -= pages_per_huge_page(h);
+i += pages_per_huge_page(h);
+spin_unlock(ptl);
+continue;
+}
+
 same_page:
 if (pages) {
 pages[i] = mem_map_offset(page, pfn_offset);
@@ -4842,7 +4736,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 if (!vma_shareable(vma, addr))
 return (pte_t *)pmd_alloc(mm, pud, addr);
 
-i_mmap_lock_write(mapping);
+i_mmap_lock_read(mapping);
 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
 if (svma == vma)
 continue;
@@ -4872,7 +4766,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 spin_unlock(ptl);
 out:
 pte = (pte_t *)pmd_alloc(mm, pud, addr);
-i_mmap_unlock_write(mapping);
+i_mmap_unlock_read(mapping);
 return pte;
 }
 
@@ -67,8 +67,8 @@ static int hwpoison_unpoison(void *data, u64 val)
 return unpoison_memory(val);
 }
 
-DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
-DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
+DEFINE_DEBUGFS_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
+DEFINE_DEBUGFS_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
 
 static void pfn_inject_exit(void)
 {
@@ -165,6 +165,9 @@ extern void post_alloc_hook(struct page *page, unsigned int order,
 gfp_t gfp_flags);
 extern int user_min_free_kbytes;
 
+extern void zone_pcp_update(struct zone *zone);
+extern void zone_pcp_reset(struct zone *zone);
+
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
 
 /*
@@ -290,7 +293,8 @@ static inline bool is_data_mapping(vm_flags_t flags)
 
 /* mm/util.c */
 void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
-struct vm_area_struct *prev, struct rb_node *rb_parent);
+struct vm_area_struct *prev);
+void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
 
 #ifdef CONFIG_MMU
 extern long populate_vma_page_range(struct vm_area_struct *vma,
@@ -362,6 +366,27 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 return max(start, vma->vm_start);
 }
 
+static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
+struct file *fpin)
+{
+int flags = vmf->flags;
+
+if (fpin)
+return fpin;
+
+/*
+* FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
+* anything, so we only pin the file and drop the mmap_sem if only
+* FAULT_FLAG_ALLOW_RETRY is set.
+*/
+if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
+FAULT_FLAG_ALLOW_RETRY) {
+fpin = get_file(vmf->vma->vm_file);
+up_read(&vmf->vma->vm_mm->mmap_sem);
+}
+return fpin;
+}
+
 #else /* !CONFIG_MMU */
 static inline void clear_page_mlock(struct page *page) { }
 static inline void mlock_vma_page(struct page *page) { }
@@ -36,6 +36,8 @@
 #include <linux/bug.h>
 #include <linux/uaccess.h>
 
+#include <asm/tlbflush.h>
+
 #include "kasan.h"
 #include "../slab.h"
 
@@ -590,6 +592,7 @@ void kasan_kfree_large(void *ptr, unsigned long ip)
 /* The object will be poisoned by page_alloc. */
 }
 
+#ifndef CONFIG_KASAN_VMALLOC
 int kasan_module_alloc(void *addr, size_t size)
 {
 void *ret;
@@ -625,6 +628,7 @@ void kasan_free_shadow(const struct vm_struct *vm)
 if (vm->flags & VM_KASAN)
 vfree(kasan_mem_to_shadow(vm->addr));
 }
+#endif
 
 extern void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip);
 
@@ -744,3 +748,232 @@ static int __init kasan_memhotplug_init(void)
 
 core_initcall(kasan_memhotplug_init);
 #endif
+
+#ifdef CONFIG_KASAN_VMALLOC
+static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
+void *unused)
+{
+unsigned long page;
+pte_t pte;
+
+if (likely(!pte_none(*ptep)))
+return 0;
+
+page = __get_free_page(GFP_KERNEL);
+if (!page)
+return -ENOMEM;
+
+memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
+pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
+
+spin_lock(&init_mm.page_table_lock);
+if (likely(pte_none(*ptep))) {
+set_pte_at(&init_mm, addr, ptep, pte);
+page = 0;
+}
+spin_unlock(&init_mm.page_table_lock);
+if (page)
+free_page(page);
+return 0;
+}
+
+int kasan_populate_vmalloc(unsigned long requested_size, struct vm_struct *area)
+{
+unsigned long shadow_start, shadow_end;
+int ret;
+
+shadow_start = (unsigned long)kasan_mem_to_shadow(area->addr);
+shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE);
+shadow_end = (unsigned long)kasan_mem_to_shadow(area->addr +
+area->size);
+shadow_end = ALIGN(shadow_end, PAGE_SIZE);
+
+ret = apply_to_page_range(&init_mm, shadow_start,
+shadow_end - shadow_start,
+kasan_populate_vmalloc_pte, NULL);
+if (ret)
+return ret;
+
+flush_cache_vmap(shadow_start, shadow_end);
+
+kasan_unpoison_shadow(area->addr, requested_size);
+
+area->flags |= VM_KASAN;
+
+/*
+* We need to be careful about inter-cpu effects here. Consider:
+*
+* CPU#0 CPU#1
+* WRITE_ONCE(p, vmalloc(100)); while (x = READ_ONCE(p)) ;
+* p[99] = 1;
+*
+* With compiler instrumentation, that ends up looking like this:
+*
+* CPU#0 CPU#1
+* // vmalloc() allocates memory
+* // let a = area->addr
+* // we reach kasan_populate_vmalloc
+* // and call kasan_unpoison_shadow:
+* STORE shadow(a), unpoison_val
+* ...
+* STORE shadow(a+99), unpoison_val x = LOAD p
+* // rest of vmalloc process <data dependency>
+* STORE p, a LOAD shadow(x+99)
+*
+* If there is no barrier between the end of unpoisioning the shadow
+* and the store of the result to p, the stores could be committed
+* in a different order by CPU#0, and CPU#1 could erroneously observe
+* poison in the shadow.
+*
+* We need some sort of barrier between the stores.
+*
+* In the vmalloc() case, this is provided by a smp_wmb() in
+* clear_vm_uninitialized_flag(). In the per-cpu allocator and in
+* get_vm_area() and friends, the caller gets shadow allocated but
+* doesn't have any pages mapped into the virtual address space that
+* has been reserved. Mapping those pages in will involve taking and
+* releasing a page-table lock, which will provide the barrier.
+*/
+
+return 0;
+}
+
+/*
+* Poison the shadow for a vmalloc region. Called as part of the
+* freeing process at the time the region is freed.
+*/
+void kasan_poison_vmalloc(void *start, unsigned long size)
+{
+size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
+kasan_poison_shadow(start, size, KASAN_VMALLOC_INVALID);
+}
+
+static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
+void *unused)
+{
+unsigned long page;
+
+page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT);
+
+spin_lock(&init_mm.page_table_lock);
+
+if (likely(!pte_none(*ptep))) {
+pte_clear(&init_mm, addr, ptep);
+free_page(page);
+}
+spin_unlock(&init_mm.page_table_lock);
+
+return 0;
+}
+
+/*
+* Release the backing for the vmalloc region [start, end), which
+* lies within the free region [free_region_start, free_region_end).
+*
+* This can be run lazily, long after the region was freed. It runs
+* under vmap_area_lock, so it's not safe to interact with the vmalloc/vmap
+* infrastructure.
+*
+* How does this work?
+* -------------------
+*
+* We have a region that is page aligned, labelled as A.
+* That might not map onto the shadow in a way that is page-aligned:
+*
+* start end
+* v v
+* |????????|????????|AAAAAAAA|AA....AA|AAAAAAAA|????????| < vmalloc
+* -------- -------- -------- -------- --------
+* | | | | |
+* | | | /-------/ |
+* \-------\|/------/ |/---------------/
+* ||| ||
+* |??AAAAAA|AAAAAAAA|AA??????| < shadow
+* (1) (2) (3)
+*
+* First we align the start upwards and the end downwards, so that the
+* shadow of the region aligns with shadow page boundaries. In the
+* example, this gives us the shadow page (2). This is the shadow entirely
+* covered by this allocation.
+*
+* Then we have the tricky bits. We want to know if we can free the
+* partially covered shadow pages - (1) and (3) in the example. For this,
+* we are given the start and end of the free region that contains this
+* allocation. Extending our previous example, we could have:
+*
+* free_region_start free_region_end
+* | start end |
+* v v v v
+* |FFFFFFFF|FFFFFFFF|AAAAAAAA|AA....AA|AAAAAAAA|FFFFFFFF| < vmalloc
+* -------- -------- -------- -------- --------
+* | | | | |
+* | | | /-------/ |
+* \-------\|/------/ |/---------------/
+* ||| ||
+* |FFAAAAAA|AAAAAAAA|AAF?????| < shadow
+* (1) (2) (3)
+*
+* Once again, we align the start of the free region up, and the end of
+* the free region down so that the shadow is page aligned. So we can free
+* page (1) - we know no allocation currently uses anything in that page,
+* because all of it is in the vmalloc free region. But we cannot free
+* page (3), because we can't be sure that the rest of it is unused.
+*
+* We only consider pages that contain part of the original region for
+* freeing: we don't try to free other pages from the free region or we'd
+* end up trying to free huge chunks of virtual address space.
+*
+* Concurrency
+* -----------
+*
+* How do we know that we're not freeing a page that is simultaneously
+* being used for a fresh allocation in kasan_populate_vmalloc(_pte)?
+*
+* We _can_ have kasan_release_vmalloc and kasan_populate_vmalloc running
+* at the same time. While we run under free_vmap_area_lock, the population
+* code does not.
+*
+* free_vmap_area_lock instead operates to ensure that the larger range
+* [free_region_start, free_region_end) is safe: because __alloc_vmap_area and
+* the per-cpu region-finding algorithm both run under free_vmap_area_lock,
+* no space identified as free will become used while we are running. This
+* means that so long as we are careful with alignment and only free shadow
+* pages entirely covered by the free region, we will not run in to any
+* trouble - any simultaneous allocations will be for disjoint regions.
+*/
+void kasan_release_vmalloc(unsigned long start, unsigned long end,
+unsigned long free_region_start,
+unsigned long free_region_end)
+{
+void *shadow_start, *shadow_end;
+unsigned long region_start, region_end;
+
+region_start = ALIGN(start, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
+region_end = ALIGN_DOWN(end, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
+
+free_region_start = ALIGN(free_region_start,
+PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
+
+if (start != region_start &&
+free_region_start < region_start)
+region_start -= PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE;
+
+free_region_end = ALIGN_DOWN(free_region_end,
+PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
+
+if (end != region_end &&
+free_region_end > region_end)
+region_end += PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE;
+
+shadow_start = kasan_mem_to_shadow((void *)region_start);
+shadow_end = kasan_mem_to_shadow((void *)region_end);
+
+if (shadow_end > shadow_start) {
+apply_to_page_range(&init_mm, (unsigned long)shadow_start,
+(unsigned long)(shadow_end - shadow_start),
+kasan_depopulate_vmalloc_pte, NULL);
+flush_tlb_kernel_range((unsigned long)shadow_start,
+(unsigned long)shadow_end);
+}
+}
+#endif
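The kasan_release_vmalloc() comment above walks through which shadow pages may be freed when a vmalloc region is released. The sketch below replays the same alignment arithmetic in user space, assuming a 4 KiB PAGE_SIZE and a KASAN_SHADOW_SCALE_SIZE of 8 (x86-64 defaults) and redefining ALIGN()/ALIGN_DOWN() locally; it stops short of converting the result to shadow addresses, and none of it is kernel code.

/*
 * Stand-alone sketch of the alignment logic in kasan_release_vmalloc():
 * which shadow pages are entirely covered by a freed region, and which
 * partially covered ones may also be released because the rest of them
 * lies inside the surrounding free region.
 */
#include <stdio.h>

#define PAGE_SIZE		4096UL
#define KASAN_SHADOW_SCALE_SIZE	8UL	/* one shadow byte covers 8 bytes */
#define GRANULE			(PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE)

#define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))

int main(void)
{
	/* Example region being released and the free area containing it. */
	unsigned long start = 0x121000, end = 0x15b000;
	unsigned long free_start = 0x100000, free_end = 0x160000;

	/* Shadow pages fully inside [start, end) can always be freed. */
	unsigned long region_start = ALIGN(start, GRANULE);
	unsigned long region_end = ALIGN_DOWN(end, GRANULE);

	/* A partially covered shadow page is freeable only if the rest of
	 * it falls inside the surrounding free region. */
	if (start != region_start && ALIGN(free_start, GRANULE) < region_start)
		region_start -= GRANULE;
	if (end != region_end && ALIGN_DOWN(free_end, GRANULE) > region_end)
		region_end += GRANULE;

	printf("release shadow backing for [%#lx, %#lx)\n",
	       region_start, region_end);
	return 0;
}

With these example numbers the partially covered shadow pages at both ends are releasable, because the surrounding free region covers the remainder of each of them.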
@@ -86,6 +86,9 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info)
 case KASAN_ALLOCA_RIGHT:
 bug_type = "alloca-out-of-bounds";
 break;
+case KASAN_VMALLOC_INVALID:
+bug_type = "vmalloc-out-of-bounds";
+break;
 }
 
 return bug_type;
@@ -25,6 +25,7 @@
 #endif
 
 #define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */
+#define KASAN_VMALLOC_INVALID 0xF9 /* unallocated space in vmapped page */
 
 /*
 * Stack redzone shadow values
@@ -1602,6 +1602,24 @@ static void collapse_file(struct mm_struct *mm,
 result = SCAN_FAIL;
 goto xa_unlocked;
 }
+} else if (PageDirty(page)) {
+/*
+* khugepaged only works on read-only fd,
+* so this page is dirty because it hasn't
+* been flushed since first write. There
+* won't be new dirty pages.
+*
+* Trigger async flush here and hope the
+* writeback is done when khugepaged
+* revisits this page.
+*
+* This is a one-off situation. We are not
+* forcing writeback in loop.
+*/
+xas_unlock_irq(&xas);
+filemap_flush(mapping);
+result = SCAN_FAIL;
+goto xa_unlocked;
 } else if (trylock_page(page)) {
 get_page(page);
 xas_unlock_irq(&xas);
mm/madvise.c | 14

@@ -864,13 +864,13 @@ static int madvise_inject_error(int behavior,
 {
 struct page *page;
 struct zone *zone;
-unsigned int order;
+unsigned long size;
 
 if (!capable(CAP_SYS_ADMIN))
 return -EPERM;
 
 
-for (; start < end; start += PAGE_SIZE << order) {
+for (; start < end; start += size) {
 unsigned long pfn;
 int ret;
 
@@ -882,9 +882,9 @@ static int madvise_inject_error(int behavior,
 /*
 * When soft offlining hugepages, after migrating the page
 * we dissolve it, therefore in the second loop "page" will
-* no longer be a compound page, and order will be 0.
+* no longer be a compound page.
 */
-order = compound_order(compound_head(page));
+size = page_size(compound_head(page));
 
 if (PageHWPoison(page)) {
 put_page(page);
@@ -895,7 +895,7 @@ static int madvise_inject_error(int behavior,
 pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
 pfn, start);
 
-ret = soft_offline_page(page, MF_COUNT_INCREASED);
+ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
 if (ret)
 return ret;
 continue;
@@ -1059,9 +1059,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 if (!madvise_behavior_valid(behavior))
 return error;
 
-if (start & ~PAGE_MASK)
+if (!PAGE_ALIGNED(start))
 return error;
-len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+len = PAGE_ALIGN(len_in);
 
 /* Check to see whether len was rounded up from small -ve to zero */
 if (len_in && !len)
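The last madvise() hunk above replaces the open-coded page-mask arithmetic with PAGE_ALIGNED() and PAGE_ALIGN(). The sketch below checks that the two forms agree; PAGE_SHIFT and the macro definitions are local stand-ins for a 4 KiB page, not the kernel's per-architecture versions.

/*
 * Stand-alone check that PAGE_ALIGN()/PAGE_ALIGNED() match the
 * open-coded masks replaced in the madvise() hunk above.
 */
#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

#define PAGE_ALIGN(addr)	(((addr) + PAGE_SIZE - 1) & PAGE_MASK)
#define PAGE_ALIGNED(addr)	(((addr) & ~PAGE_MASK) == 0)

int main(void)
{
	unsigned long samples[] = { 0, 1, PAGE_SIZE - 1, PAGE_SIZE,
				    PAGE_SIZE + 1, 123456789 };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		unsigned long v = samples[i];

		/* old open-coded forms from madvise() */
		unsigned long old_align = (v + ~PAGE_MASK) & PAGE_MASK;
		int old_aligned = !(v & ~PAGE_MASK);

		assert(old_align == PAGE_ALIGN(v));
		assert(old_aligned == !!PAGE_ALIGNED(v));
		printf("%lu -> %lu (aligned: %d)\n", v, PAGE_ALIGN(v),
		       old_aligned);
	}
	return 0;
}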
mm/memblock.c | 111

@@ -57,42 +57,38 @@
 * at build time. The region arrays for the "memory" and "reserved"
 * types are initially sized to %INIT_MEMBLOCK_REGIONS and for the
 * "physmap" type to %INIT_PHYSMEM_REGIONS.
-* The :c:func:`memblock_allow_resize` enables automatic resizing of
-* the region arrays during addition of new regions. This feature
-* should be used with care so that memory allocated for the region
-* array will not overlap with areas that should be reserved, for
-* example initrd.
+* The memblock_allow_resize() enables automatic resizing of the region
+* arrays during addition of new regions. This feature should be used
+* with care so that memory allocated for the region array will not
+* overlap with areas that should be reserved, for example initrd.
 *
 * The early architecture setup should tell memblock what the physical
-* memory layout is by using :c:func:`memblock_add` or
-* :c:func:`memblock_add_node` functions. The first function does not
-* assign the region to a NUMA node and it is appropriate for UMA
-* systems. Yet, it is possible to use it on NUMA systems as well and
-* assign the region to a NUMA node later in the setup process using
-* :c:func:`memblock_set_node`. The :c:func:`memblock_add_node`
-* performs such an assignment directly.
+* memory layout is by using memblock_add() or memblock_add_node()
+* functions. The first function does not assign the region to a NUMA
+* node and it is appropriate for UMA systems. Yet, it is possible to
+* use it on NUMA systems as well and assign the region to a NUMA node
+* later in the setup process using memblock_set_node(). The
+* memblock_add_node() performs such an assignment directly.
 *
 * Once memblock is setup the memory can be allocated using one of the
 * API variants:
 *
-* * :c:func:`memblock_phys_alloc*` - these functions return the
-* **physical** address of the allocated memory
-* * :c:func:`memblock_alloc*` - these functions return the **virtual**
-* address of the allocated memory.
+* * memblock_phys_alloc*() - these functions return the **physical**
+* address of the allocated memory
+* * memblock_alloc*() - these functions return the **virtual** address
+* of the allocated memory.
 *
 * Note, that both API variants use implict assumptions about allowed
 * memory ranges and the fallback methods. Consult the documentation
-* of :c:func:`memblock_alloc_internal` and
-* :c:func:`memblock_alloc_range_nid` functions for more elaboarte
-* description.
+* of memblock_alloc_internal() and memblock_alloc_range_nid()
+* functions for more elaborate description.
 *
-* As the system boot progresses, the architecture specific
-* :c:func:`mem_init` function frees all the memory to the buddy page
-* allocator.
+* As the system boot progresses, the architecture specific mem_init()
+* function frees all the memory to the buddy page allocator.
 *
-* Unless an architecure enables %CONFIG_ARCH_KEEP_MEMBLOCK, the
+* Unless an architecture enables %CONFIG_ARCH_KEEP_MEMBLOCK, the
 * memblock data structures will be discarded after the system
-* initialization compltes.
+* initialization completes.
 */
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -1323,12 +1319,13 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
 * @start: the lower bound of the memory region to allocate (phys address)
 * @end: the upper bound of the memory region to allocate (phys address)
 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+* @exact_nid: control the allocation fall back to other nodes
 *
 * The allocation is performed from memory region limited by
-* memblock.current_limit if @max_addr == %MEMBLOCK_ALLOC_ACCESSIBLE.
+* memblock.current_limit if @end == %MEMBLOCK_ALLOC_ACCESSIBLE.
 *
-* If the specified node can not hold the requested memory the
-* allocation falls back to any node in the system
+* If the specified node can not hold the requested memory and @exact_nid
+* is false, the allocation falls back to any node in the system.
 *
 * For systems with memory mirroring, the allocation is attempted first
 * from the regions with mirroring enabled and then retried from any
@@ -1342,7 +1339,8 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
 */
 static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
 phys_addr_t align, phys_addr_t start,
-phys_addr_t end, int nid)
+phys_addr_t end, int nid,
+bool exact_nid)
 {
 enum memblock_flags flags = choose_memblock_flags();
 phys_addr_t found;
@@ -1362,7 +1360,7 @@ again:
 if (found && !memblock_reserve(found, size))
 goto done;
 
-if (nid != NUMA_NO_NODE) {
+if (nid != NUMA_NO_NODE && !exact_nid) {
 found = memblock_find_in_range_node(size, align, start,
 end, NUMA_NO_NODE,
 flags);
@@ -1410,7 +1408,8 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size,
 phys_addr_t start,
 phys_addr_t end)
 {
-return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE);
+return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
+false);
 }
 
 /**
@@ -1429,7 +1428,7 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size,
 phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
 {
 return memblock_alloc_range_nid(size, align, 0,
-MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+MEMBLOCK_ALLOC_ACCESSIBLE, nid, false);
 }
 
 /**
@@ -1439,6 +1438,7 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali
 * @min_addr: the lower bound of the memory region to allocate (phys address)
 * @max_addr: the upper bound of the memory region to allocate (phys address)
 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+* @exact_nid: control the allocation fall back to other nodes
 *
 * Allocates memory block using memblock_alloc_range_nid() and
 * converts the returned physical address to virtual.
@@ -1454,7 +1454,7 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali
 static void * __init memblock_alloc_internal(
 phys_addr_t size, phys_addr_t align,
 phys_addr_t min_addr, phys_addr_t max_addr,
-int nid)
+int nid, bool exact_nid)
 {
 phys_addr_t alloc;
 
@@ -1469,11 +1469,13 @@ static void * __init memblock_alloc_internal(
 if (max_addr > memblock.current_limit)
 max_addr = memblock.current_limit;
 
-alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid);
+alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid,
+exact_nid);
 
 /* retry allocation without lower limit */
 if (!alloc && min_addr)
-alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid);
+alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid,
+exact_nid);
 
 if (!alloc)
 return NULL;
@@ -1481,6 +1483,43 @@ static void * __init memblock_alloc_internal(
 return phys_to_virt(alloc);
 }
 
+/**
+* memblock_alloc_exact_nid_raw - allocate boot memory block on the exact node
+* without zeroing memory
+* @size: size of memory block to be allocated in bytes
+* @align: alignment of the region and block's size
+* @min_addr: the lower bound of the memory region from where the allocation
+* is preferred (phys address)
+* @max_addr: the upper bound of the memory region from where the allocation
+* is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to
+* allocate only from memory limited by memblock.current_limit value
+* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+*
+* Public function, provides additional debug information (including caller
+* info), if enabled. Does not zero allocated memory.
+*
+* Return:
+* Virtual address of allocated memory block on success, NULL on failure.
+*/
+void * __init memblock_alloc_exact_nid_raw(
+phys_addr_t size, phys_addr_t align,
+phys_addr_t min_addr, phys_addr_t max_addr,
+int nid)
+{
+void *ptr;
+
+memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
+__func__, (u64)size, (u64)align, nid, &min_addr,
+&max_addr, (void *)_RET_IP_);
+
+ptr = memblock_alloc_internal(size, align,
+min_addr, max_addr, nid, true);
+if (ptr && size > 0)
+page_init_poison(ptr, size);
+
+return ptr;
+}
+
 /**
 * memblock_alloc_try_nid_raw - allocate boot memory block without zeroing
 * memory and without panicking
@@ -1512,7 +1551,7 @@ void * __init memblock_alloc_try_nid_raw(
 &max_addr, (void *)_RET_IP_);
 
 ptr = memblock_alloc_internal(size, align,
-min_addr, max_addr, nid);
+min_addr, max_addr, nid, false);
 if (ptr && size > 0)
 page_init_poison(ptr, size);
 
@@ -1547,7 +1586,7 @@ void * __init memblock_alloc_try_nid(
 __func__, (u64)size, (u64)align, nid, &min_addr,
 &max_addr, (void *)_RET_IP_);
 ptr = memblock_alloc_internal(size, align,
-min_addr, max_addr, nid);
+min_addr, max_addr, nid, false);
 if (ptr)
 memset(ptr, 0, size);
 
mm/memcontrol.c | 167

@@ -108,7 +108,6 @@ static const char *const mem_cgroup_lru_names[] = {
 
 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
-#define NUMAINFO_EVENTS_TARGET 1024
 
 /*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
@@ -778,7 +777,7 @@ void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
 if (!memcg || memcg == root_mem_cgroup) {
 __mod_node_page_state(pgdat, idx, val);
 } else {
-lruvec = mem_cgroup_lruvec(pgdat, memcg);
+lruvec = mem_cgroup_lruvec(memcg, pgdat);
 __mod_lruvec_state(lruvec, idx, val);
 }
 rcu_read_unlock();
@@ -877,9 +876,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 case MEM_CGROUP_TARGET_SOFTLIMIT:
 next = val + SOFTLIMIT_EVENTS_TARGET;
 break;
-case MEM_CGROUP_TARGET_NUMAINFO:
-next = val + NUMAINFO_EVENTS_TARGET;
-break;
 default:
 break;
 }
@@ -899,21 +895,12 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 if (unlikely(mem_cgroup_event_ratelimit(memcg,
 MEM_CGROUP_TARGET_THRESH))) {
 bool do_softlimit;
-bool do_numainfo __maybe_unused;
 
 do_softlimit = mem_cgroup_event_ratelimit(memcg,
 MEM_CGROUP_TARGET_SOFTLIMIT);
-#if MAX_NUMNODES > 1
-do_numainfo = mem_cgroup_event_ratelimit(memcg,
-MEM_CGROUP_TARGET_NUMAINFO);
-#endif
 mem_cgroup_threshold(memcg);
 if (unlikely(do_softlimit))
 mem_cgroup_update_tree(memcg, page);
-#if MAX_NUMNODES > 1
-if (unlikely(do_numainfo))
-atomic_inc(&memcg->numainfo_events);
-#endif
 }
 }
 
@@ -1052,7 +1039,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 struct mem_cgroup_per_node *mz;
 
 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
-iter = &mz->iter[reclaim->priority];
+iter = &mz->iter;
 
 if (prev && reclaim->generation != iter->generation)
 goto out_unlock;
@@ -1152,15 +1139,11 @@ static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
 struct mem_cgroup_reclaim_iter *iter;
 struct mem_cgroup_per_node *mz;
 int nid;
-int i;
 
 for_each_node(nid) {
 mz = mem_cgroup_nodeinfo(from, nid);
-for (i = 0; i <= DEF_PRIORITY; i++) {
-iter = &mz->iter[i];
-cmpxchg(&iter->position,
-dead_memcg, NULL);
-}
+iter = &mz->iter;
+cmpxchg(&iter->position, dead_memcg, NULL);
 }
 }
 
@@ -1238,7 +1221,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd
 struct lruvec *lruvec;
 
 if (mem_cgroup_disabled()) {
-lruvec = &pgdat->lruvec;
+lruvec = &pgdat->__lruvec;
 goto out;
 }
 
@@ -1595,104 +1578,6 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 return ret;
 }
 
-#if MAX_NUMNODES > 1
-
-/**
-* test_mem_cgroup_node_reclaimable
-* @memcg: the target memcg
-* @nid: the node ID to be checked.
-* @noswap : specify true here if the user wants flle only information.
-*
-* This function returns whether the specified memcg contains any
-* reclaimable pages on a node. Returns true if there are any reclaimable
-* pages in the node.
-*/
-static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
-int nid, bool noswap)
-{
-struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
-
-if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
-lruvec_page_state(lruvec, NR_ACTIVE_FILE))
-return true;
-if (noswap || !total_swap_pages)
-return false;
-if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
-lruvec_page_state(lruvec, NR_ACTIVE_ANON))
-return true;
-return false;
-
-}
-
-/*
-* Always updating the nodemask is not very good - even if we have an empty
-* list or the wrong list here, we can start from some node and traverse all
-* nodes based on the zonelist. So update the list loosely once per 10 secs.
-*
-*/
-static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
-{
-int nid;
-/*
-* numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
-* pagein/pageout changes since the last update.
-*/
-if (!atomic_read(&memcg->numainfo_events))
-return;
-if (atomic_inc_return(&memcg->numainfo_updating) > 1)
-return;
-
-/* make a nodemask where this memcg uses memory from */
-memcg->scan_nodes = node_states[N_MEMORY];
-
-for_each_node_mask(nid, node_states[N_MEMORY]) {
-
-if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
-node_clear(nid, memcg->scan_nodes);
-}
-
-atomic_set(&memcg->numainfo_events, 0);
-atomic_set(&memcg->numainfo_updating, 0);
-}
-
-/*
-* Selecting a node where we start reclaim from. Because what we need is just
-* reducing usage counter, start from anywhere is O,K. Considering
-* memory reclaim from current node, there are pros. and cons.
-*
-* Freeing memory from current node means freeing memory from a node which
-* we'll use or we've used. So, it may make LRU bad. And if several threads
-* hit limits, it will see a contention on a node. But freeing from remote
-* node means more costs for memory reclaim because of memory latency.
-*
-* Now, we use round-robin. Better algorithm is welcomed.
-*/
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
-{
-int node;
-
-mem_cgroup_may_update_nodemask(memcg);
-node = memcg->last_scanned_node;
-
-node = next_node_in(node, memcg->scan_nodes);
-/*
-* mem_cgroup_may_update_nodemask might have seen no reclaimmable pages
-* last time it really checked all the LRUs due to rate limiting.
-* Fallback to the current node in that case for simplicity.
-*/
-if (unlikely(node == MAX_NUMNODES))
-node = numa_node_id();
-
-memcg->last_scanned_node = node;
-return node;
-}
-#else
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
-{
-return 0;
-}
-#endif
-
 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
 pg_data_t *pgdat,
 gfp_t gfp_mask,
@@ -1705,7 +1590,6 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
 unsigned long nr_scanned;
 struct mem_cgroup_reclaim_cookie reclaim = {
 .pgdat = pgdat,
-.priority = 0,
 };
 
 excess = soft_limit_excess(root_memcg);
@@ -3750,7 +3634,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 int nid, unsigned int lru_mask)
 {
-struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
+struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
 unsigned long nr = 0;
 enum lru_list lru;
 
@@ -5078,7 +4962,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 goto fail;
 
 INIT_WORK(&memcg->high_work, high_work_func);
-memcg->last_scanned_node = MAX_NUMNODES;
 INIT_LIST_HEAD(&memcg->oom_notify);
 mutex_init(&memcg->thresholds_lock);
 spin_lock_init(&memcg->move_lock);
@@ -5455,8 +5338,8 @@ static int mem_cgroup_move_account(struct page *page,
 anon = PageAnon(page);
 
 pgdat = page_pgdat(page);
-from_vec = mem_cgroup_lruvec(pgdat, from);
-to_vec = mem_cgroup_lruvec(pgdat, to);
+from_vec = mem_cgroup_lruvec(from, pgdat);
+to_vec = mem_cgroup_lruvec(to, pgdat);
 
 spin_lock_irqsave(&from->move_lock, flags);
 
@@ -6096,7 +5979,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 char *buf, size_t nbytes, loff_t off)
 {
 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
-unsigned long nr_pages;
+unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+bool drained = false;
 unsigned long high;
 int err;
 
@@ -6107,12 +5991,29 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 
 memcg->high = high;
 
-nr_pages = page_counter_read(&memcg->memory);
-if (nr_pages > high)
-try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
-GFP_KERNEL, true);
+for (;;) {
+unsigned long nr_pages = page_counter_read(&memcg->memory);
+unsigned long reclaimed;
+
+if (nr_pages <= high)
+break;
+
+if (signal_pending(current))
+break;
+
+if (!drained) {
+drain_all_stock(memcg);
+drained = true;
+continue;
+}
+
+reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
+GFP_KERNEL, true);
+
+if (!reclaimed && !nr_retries--)
+break;
+}
 
-memcg_wb_domain_size_changed(memcg);
 return nbytes;
 }
 
@@ -6144,10 +6045,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 if (nr_pages <= max)
 break;
 
-if (signal_pending(current)) {
-err = -EINTR;
+if (signal_pending(current))
 break;
-}
 
 if (!drained) {
 drain_all_stock(memcg);
@ -303,30 +303,24 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
|
||||||
/*
|
/*
|
||||||
* Schedule a process for later kill.
|
* Schedule a process for later kill.
|
||||||
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
|
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
|
||||||
* TBD would GFP_NOIO be enough?
|
|
||||||
*/
|
*/
|
||||||
static void add_to_kill(struct task_struct *tsk, struct page *p,
|
static void add_to_kill(struct task_struct *tsk, struct page *p,
|
||||||
struct vm_area_struct *vma,
|
struct vm_area_struct *vma,
|
||||||
struct list_head *to_kill,
|
struct list_head *to_kill)
|
||||||
struct to_kill **tkc)
|
|
||||||
{
|
{
|
||||||
struct to_kill *tk;
|
struct to_kill *tk;
|
||||||
|
|
||||||
if (*tkc) {
|
tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
|
||||||
tk = *tkc;
|
if (!tk) {
|
||||||
*tkc = NULL;
|
pr_err("Memory failure: Out of memory while machine check handling\n");
|
||||||
} else {
|
return;
|
||||||
tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
|
|
||||||
if (!tk) {
|
|
||||||
pr_err("Memory failure: Out of memory while machine check handling\n");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
tk->addr = page_address_in_vma(p, vma);
|
tk->addr = page_address_in_vma(p, vma);
|
||||||
if (is_zone_device_page(p))
|
if (is_zone_device_page(p))
|
||||||
tk->size_shift = dev_pagemap_mapping_shift(p, vma);
|
tk->size_shift = dev_pagemap_mapping_shift(p, vma);
|
||||||
else
|
else
|
||||||
-	tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
+	tk->size_shift = page_shift(compound_head(p));

	/*
	 * Send SIGKILL if "tk->addr == -EFAULT". Also, as
@@ -345,6 +339,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
		kfree(tk);
		return;
	}

	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);

@@ -436,7 +431,7 @@ static struct task_struct *task_early_kill(struct task_struct *tsk,
 * Collect processes when the error hit an anonymous page.
 */
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
-				struct to_kill **tkc, int force_early)
+				int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;

@@ -461,7 +456,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
			if (!page_mapped_in_vma(page, vma))
				continue;
			if (vma->vm_mm == t->mm)
-				add_to_kill(t, page, vma, to_kill, tkc);
+				add_to_kill(t, page, vma, to_kill);
		}
	}
	read_unlock(&tasklist_lock);

@@ -472,7 +467,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 * Collect processes when the error hit a file mapped page.
 */
static void collect_procs_file(struct page *page, struct list_head *to_kill,
-				struct to_kill **tkc, int force_early)
+				int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;

@@ -496,7 +491,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
			 * to be informed of all such data corruptions.
			 */
			if (vma->vm_mm == t->mm)
-				add_to_kill(t, page, vma, to_kill, tkc);
+				add_to_kill(t, page, vma, to_kill);
		}
	}
	read_unlock(&tasklist_lock);

@@ -505,26 +500,17 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,

/*
 * Collect the processes who have the corrupted page mapped to kill.
- * This is done in two steps for locking reasons.
- * First preallocate one tokill structure outside the spin locks,
- * so that we can kill at least one process reasonably reliable.
 */
static void collect_procs(struct page *page, struct list_head *tokill,
				int force_early)
{
-	struct to_kill *tk;
-
	if (!page->mapping)
		return;

-	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
-	if (!tk)
-		return;
	if (PageAnon(page))
-		collect_procs_anon(page, tokill, &tk, force_early);
+		collect_procs_anon(page, tokill, force_early);
	else
-		collect_procs_file(page, tokill, &tk, force_early);
+		collect_procs_file(page, tokill, force_early);
-	kfree(tk);
}

static const char *action_name[] = {

@@ -1490,7 +1476,7 @@ static void memory_failure_work_func(struct work_struct *work)
		if (!gotten)
			break;
		if (entry.flags & MF_SOFT_OFFLINE)
-			soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
+			soft_offline_page(entry.pfn, entry.flags);
		else
			memory_failure(entry.pfn, entry.flags);
	}

@@ -1871,7 +1857,7 @@ static int soft_offline_free_page(struct page *page)

/**
 * soft_offline_page - Soft offline a page.
- * @page: page to offline
+ * @pfn: pfn to soft-offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.

@@ -1891,18 +1877,17 @@ static int soft_offline_free_page(struct page *page)
 * This is not a 100% solution for all memory, but tries to be
 * ``good enough'' for the majority of memory.
 */
-int soft_offline_page(struct page *page, int flags)
+int soft_offline_page(unsigned long pfn, int flags)
{
	int ret;
-	unsigned long pfn = page_to_pfn(page);
+	struct page *page;

-	if (is_zone_device_page(page)) {
-		pr_debug_ratelimited("soft_offline: %#lx page is device page\n",
-					pfn);
-		if (flags & MF_COUNT_INCREASED)
-			put_page(page);
-		return -EIO;
-	}
+	if (!pfn_valid(pfn))
+		return -ENXIO;
+	/* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
+	page = pfn_to_online_page(pfn);
+	if (!page)
+		return -EIO;

	if (PageHWPoison(page)) {
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
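The memory-failure changes above switch soft_offline_page() from taking a struct page to taking a raw pfn, and validate that pfn before anything touches page metadata. The following is a minimal standalone sketch of that validate-then-convert ordering only; toy_pfn_valid() and the online[] table are stand-ins invented for the demo, not the kernel helpers.

/*
 * Illustrative sketch: model the "check the pfn first, only then get a page"
 * ordering used by the new soft_offline_page(pfn, flags).
 */
#include <stdio.h>
#include <stdbool.h>
#include <errno.h>

#define MAX_PFN 16
static bool online[MAX_PFN] = { true, true, true, true, true, true, true, true };

static bool toy_pfn_valid(unsigned long pfn)  { return pfn < MAX_PFN; }
static bool toy_pfn_online(unsigned long pfn) { return toy_pfn_valid(pfn) && online[pfn]; }

static int toy_soft_offline(unsigned long pfn)
{
    if (!toy_pfn_valid(pfn))
        return -ENXIO;      /* no memmap entry at all */
    if (!toy_pfn_online(pfn))
        return -EIO;        /* e.g. offline section or device memory */
    printf("would soft-offline pfn %#lx\n", pfn);
    return 0;
}

int main(void)
{
    printf("%d %d %d\n", toy_soft_offline(3), toy_soft_offline(9), toy_soft_offline(100));
    return 0;
}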
mm/memory.c (56 changed lines)

@@ -72,6 +72,8 @@
#include <linux/oom.h>
#include <linux/numa.h>

+#include <trace/events/kmem.h>
+
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>

@@ -152,6 +154,10 @@ static int __init init_zero_pfn(void)
}
core_initcall(init_zero_pfn);

+void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
+{
+	trace_rss_stat(mm, member, count);
+}

#if defined(SPLIT_RSS_COUNTING)

@@ -2289,10 +2295,11 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
 *
 * The function expects the page to be locked and unlocks it.
 */
-static void fault_dirty_shared_page(struct vm_area_struct *vma,
-				    struct page *page)
+static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
+	struct vm_area_struct *vma = vmf->vma;
	struct address_space *mapping;
+	struct page *page = vmf->page;
	bool dirtied;
	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;

@@ -2307,16 +2314,30 @@ static void fault_dirty_shared_page(struct vm_area_struct *vma,
	mapping = page_rmapping(page);
	unlock_page(page);

-	if ((dirtied || page_mkwrite) && mapping) {
-		/*
-		 * Some device drivers do not set page.mapping
-		 * but still dirty their pages
-		 */
-		balance_dirty_pages_ratelimited(mapping);
-	}
-
	if (!page_mkwrite)
		file_update_time(vma->vm_file);

+	/*
+	 * Throttle page dirtying rate down to writeback speed.
+	 *
+	 * mapping may be NULL here because some device drivers do not
+	 * set page.mapping but still dirty their pages
+	 *
+	 * Drop the mmap_sem before waiting on IO, if we can. The file
+	 * is pinning the mapping, as per above.
+	 */
+	if ((dirtied || page_mkwrite) && mapping) {
+		struct file *fpin;
+
+		fpin = maybe_unlock_mmap_for_io(vmf, NULL);
+		balance_dirty_pages_ratelimited(mapping);
+		if (fpin) {
+			fput(fpin);
+			return VM_FAULT_RETRY;
+		}
+	}
+
+	return 0;
}

/*

@@ -2571,6 +2592,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;
+	vm_fault_t ret = VM_FAULT_WRITE;

	get_page(vmf->page);

@@ -2594,10 +2616,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
		wp_page_reuse(vmf);
		lock_page(vmf->page);
	}
-	fault_dirty_shared_page(vma, vmf->page);
+	ret |= fault_dirty_shared_page(vmf);
	put_page(vmf->page);

-	return VM_FAULT_WRITE;
+	return ret;
}

/*

@@ -3083,7 +3105,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)

	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
-	 * preceeding stores to the page contents become visible before
+	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page);

@@ -3641,7 +3663,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
		return ret;
	}

-	fault_dirty_shared_page(vma, vmf->page);
+	ret |= fault_dirty_shared_page(vmf);
	return ret;
}

@@ -3988,6 +4010,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
	vmf.pud = pud_alloc(mm, p4d, address);
	if (!vmf.pud)
		return VM_FAULT_OOM;
+retry_pud:
	if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
		ret = create_huge_pud(&vmf);
		if (!(ret & VM_FAULT_FALLBACK))

@@ -4014,6 +4037,11 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
	vmf.pmd = pmd_alloc(mm, vmf.pud, address);
	if (!vmf.pmd)
		return VM_FAULT_OOM;
+
+	/* Huge pud page fault raced with pmd_alloc? */
+	if (pud_trans_unstable(vmf.pud))
+		goto retry_pud;
+
	if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
		ret = create_huge_pmd(&vmf);
		if (!(ret & VM_FAULT_FALLBACK))
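The mm/memory.c hunks make fault_dirty_shared_page() report, through its return value, that it dropped mmap_sem for I/O, and the callers merge that bit into their own result. Below is a small standalone sketch of just that pattern; the flag values and the maybe_drop_lock() helper are made up for illustration and only the shape of the code mirrors the patch.

/* Toy model of "return a RETRY bit when a lock was dropped, let the caller
 * OR it into its result" as done with VM_FAULT_RETRY above. */
#include <stdio.h>
#include <stdbool.h>

#define FAULT_WRITE  0x1
#define FAULT_RETRY  0x2

static bool lock_held = true;

static bool maybe_drop_lock(bool allowed)
{
    if (allowed && lock_held) {
        lock_held = false;
        return true;    /* caller must stop relying on the lock */
    }
    return false;
}

static unsigned int dirty_shared_page(bool allowed)
{
    bool dropped = maybe_drop_lock(allowed);
    /* ... throttle dirtying to writeback speed here ... */
    return dropped ? FAULT_RETRY : 0;
}

static unsigned int wp_shared_fault(bool allowed)
{
    unsigned int ret = FAULT_WRITE;          /* default result */
    ret |= dirty_shared_page(allowed);       /* merge, do not overwrite */
    return ret;
}

int main(void)
{
    printf("%#x %#x\n", wp_shared_fault(false), wp_shared_fault(true));
    return 0;
}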
mm/memory_hotplug.c

@@ -49,8 +49,6 @@
 * and restore_online_page_callback() for generic callback restore.
 */

-static void generic_online_page(struct page *page, unsigned int order);
-
static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);

@@ -278,6 +276,22 @@ static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
	return 0;
}

+static int check_hotplug_memory_addressable(unsigned long pfn,
+					    unsigned long nr_pages)
+{
+	const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1;
+
+	if (max_addr >> MAX_PHYSMEM_BITS) {
+		const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1;
+		WARN(1,
+		     "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n",
+		     (u64)PFN_PHYS(pfn), max_addr, max_allowed);
+		return -E2BIG;
+	}
+
+	return 0;
+}
+
/*
 * Reasonably generic function for adding memory. It is
 * expected that archs that support memory hotplug will

@@ -291,6 +305,10 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
	unsigned long nr, start_sec, end_sec;
	struct vmem_altmap *altmap = restrictions->altmap;

+	err = check_hotplug_memory_addressable(pfn, nr_pages);
+	if (err)
+		return err;
+
	if (altmap) {
		/*
		 * Validate altmap is within bounds of the total request

@@ -580,24 +598,7 @@ int restore_online_page_callback(online_page_callback_t callback)
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

-void __online_page_set_limits(struct page *page)
-{
-}
-EXPORT_SYMBOL_GPL(__online_page_set_limits);
-
-void __online_page_increment_counters(struct page *page)
-{
-	adjust_managed_page_count(page, 1);
-}
-EXPORT_SYMBOL_GPL(__online_page_increment_counters);
-
-void __online_page_free(struct page *page)
-{
-	__free_reserved_page(page);
-}
-EXPORT_SYMBOL_GPL(__online_page_free);
-
-static void generic_online_page(struct page *page, unsigned int order)
+void generic_online_page(struct page *page, unsigned int order)
{
	kernel_map_pages(page, 1 << order, 1);
	__free_pages_core(page, order);

@@ -607,6 +608,7 @@ static void generic_online_page(struct page *page, unsigned int order)
	totalhigh_pages_add(1UL << order);
#endif
}
+EXPORT_SYMBOL_GPL(generic_online_page);

static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
		void *arg)

@@ -1180,7 +1182,8 @@ static bool is_pageblock_removable_nolock(unsigned long pfn)
	if (!zone_spans_pfn(zone, pfn))
		return false;

-	return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, SKIP_HWPOISON);
+	return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE,
+				    MEMORY_OFFLINE);
}

/* Checks if this range of memory is likely to be hot-removable. */

@@ -1377,9 +1380,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
	return ret;
}

-/*
- * remove from free_area[] and mark all as Reserved.
- */
+/* Mark all sections offline and remove all free pages from the buddy. */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
			void *data)

@@ -1397,7 +1398,8 @@ static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
			void *data)
{
-	return test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
+	return test_pages_isolated(start_pfn, start_pfn + nr_pages,
+				   MEMORY_OFFLINE);
}

static int __init cmdline_parse_movable_node(char *p)

@@ -1478,10 +1480,19 @@ static void node_states_clear_node(int node, struct memory_notify *arg)
		node_clear_state(node, N_MEMORY);
}

+static int count_system_ram_pages_cb(unsigned long start_pfn,
+				     unsigned long nr_pages, void *data)
+{
+	unsigned long *nr_system_ram_pages = data;
+
+	*nr_system_ram_pages += nr_pages;
+	return 0;
+}
+
static int __ref __offline_pages(unsigned long start_pfn,
		unsigned long end_pfn)
{
-	unsigned long pfn, nr_pages;
+	unsigned long pfn, nr_pages = 0;
	unsigned long offlined_pages = 0;
	int ret, node, nr_isolate_pageblock;
	unsigned long flags;

@@ -1492,6 +1503,22 @@ static int __ref __offline_pages(unsigned long start_pfn,

	mem_hotplug_begin();

+	/*
+	 * Don't allow to offline memory blocks that contain holes.
+	 * Consequently, memory blocks with holes can never get onlined
+	 * via the hotplug path - online_pages() - as hotplugged memory has
+	 * no holes. This way, we e.g., don't have to worry about marking
+	 * memory holes PG_reserved, don't need pfn_valid() checks, and can
+	 * avoid using walk_system_ram_range() later.
+	 */
+	walk_system_ram_range(start_pfn, end_pfn - start_pfn, &nr_pages,
+			      count_system_ram_pages_cb);
+	if (nr_pages != end_pfn - start_pfn) {
+		ret = -EINVAL;
+		reason = "memory holes";
+		goto failed_removal;
+	}
+
	/* This makes hotplug much easier...and readable.
	   we assume this for now. .*/
	if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,

@@ -1503,12 +1530,11 @@ static int __ref __offline_pages(unsigned long start_pfn,

	zone = page_zone(pfn_to_page(valid_start));
	node = zone_to_nid(zone);
-	nr_pages = end_pfn - start_pfn;

	/* set above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE,
-				       SKIP_HWPOISON | REPORT_FAILURE);
+				       MEMORY_OFFLINE | REPORT_FAILURE);
	if (ret < 0) {
		reason = "failure to isolate range";
		goto failed_removal;

@@ -1750,13 +1776,13 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)

	/* remove memmap entry */
	firmware_map_remove(start, start + size, "System RAM");
-	memblock_free(start, size);
-	memblock_remove(start, size);

	/* remove memory block devices before removing memory */
	remove_memory_block_devices(start, size);

	arch_remove_memory(nid, start, size, NULL);
+	memblock_free(start, size);
+	memblock_remove(start, size);
	__release_memory_resource(start, size);

	try_offline_node(nid);
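check_hotplug_memory_addressable() above rejects hotplugged ranges whose last byte lies above what MAX_PHYSMEM_BITS can address. The following standalone sketch reproduces only that arithmetic; the PAGE_SHIFT and MAX_PHYSMEM_BITS values are assumptions picked for the demo (4 KiB pages, 46 physical address bits), not taken from any particular architecture.

/* Standalone sketch of the addressability bounds check. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT        12
#define MAX_PHYSMEM_BITS  46
#define PFN_PHYS(pfn)     ((uint64_t)(pfn) << PAGE_SHIFT)

static int check_addressable(unsigned long pfn, unsigned long nr_pages)
{
    uint64_t max_addr = PFN_PHYS(pfn + nr_pages) - 1;

    if (max_addr >> MAX_PHYSMEM_BITS)
        return -1;   /* the kernel version WARNs and returns -E2BIG */
    return 0;
}

int main(void)
{
    unsigned long ok_pfn  = 1UL << 20;                               /* well below the limit */
    unsigned long bad_pfn = 1UL << (MAX_PHYSMEM_BITS - PAGE_SHIFT);  /* first pfn past it */

    printf("%d %d\n", check_addressable(ok_pfn, 1 << 15),
                      check_addressable(bad_pfn, 1));
    return 0;
}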
mm/mempolicy.c

@@ -410,7 +410,9 @@ struct queue_pages {
	struct list_head *pagelist;
	unsigned long flags;
	nodemask_t *nmask;
-	struct vm_area_struct *prev;
+	unsigned long start;
+	unsigned long end;
+	struct vm_area_struct *first;
};

/*

@@ -618,6 +620,22 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
	unsigned long endvma = vma->vm_end;
	unsigned long flags = qp->flags;

+	/* range check first */
+	VM_BUG_ON((vma->vm_start > start) || (vma->vm_end < end));
+
+	if (!qp->first) {
+		qp->first = vma;
+		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+			(qp->start < vma->vm_start))
+			/* hole at head side of range */
+			return -EFAULT;
+	}
+	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+		((vma->vm_end < qp->end) &&
+		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
+		/* hole at middle or tail of range */
+		return -EFAULT;
+
	/*
	 * Need check MPOL_MF_STRICT to return -EIO if possible
	 * regardless of vma_migratable

@@ -628,17 +646,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,

	if (endvma > end)
		endvma = end;
-	if (vma->vm_start > start)
-		start = vma->vm_start;
-
-	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
-		if (!vma->vm_next && vma->vm_end < end)
-			return -EFAULT;
-		if (qp->prev && qp->prev->vm_end < vma->vm_start)
-			return -EFAULT;
-	}
-
-	qp->prev = vma;

	if (flags & MPOL_MF_LAZY) {
		/* Similar to task_numa_work, skip inaccessible VMAs */

@@ -681,14 +688,23 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		nodemask_t *nodes, unsigned long flags,
		struct list_head *pagelist)
{
+	int err;
	struct queue_pages qp = {
		.pagelist = pagelist,
		.flags = flags,
		.nmask = nodes,
-		.prev = NULL,
+		.start = start,
+		.end = end,
+		.first = NULL,
	};

-	return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+
+	if (!qp.first)
+		/* whole range in hole */
+		err = -EFAULT;
+
+	return err;
}

/*

@@ -740,8 +756,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
	unsigned long vmend;

	vma = find_vma(mm, start);
-	if (!vma || vma->vm_start > start)
-		return -EFAULT;
+	VM_BUG_ON(!vma);

	prev = vma->vm_prev;
	if (start > vma->vm_start)
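The reworked queue_pages_test_walk() rejects mbind() ranges with unmapped holes at the head, middle, or tail by tracking the first VMA seen and the gap to vm_next. A minimal standalone sketch of the same coverage test over sorted [start, end) intervals is below; the toy_vma struct is invented for the demo and is not the kernel's vm_area_struct.

/* Sketch: given VMAs sorted by address, fail if [start, end) is not fully covered. */
#include <stdio.h>

struct toy_vma { unsigned long start, end; };

static int range_covered(const struct toy_vma *v, int n,
                         unsigned long start, unsigned long end)
{
    unsigned long pos = start;

    for (int i = 0; i < n && pos < end; i++) {
        if (v[i].end <= pos)
            continue;          /* entirely before the cursor */
        if (v[i].start > pos)
            return 0;          /* hole at head or middle */
        pos = v[i].end;
    }
    return pos >= end;         /* otherwise: hole at tail */
}

int main(void)
{
    struct toy_vma vmas[] = { {0x1000, 0x3000}, {0x3000, 0x5000}, {0x6000, 0x8000} };

    printf("%d %d\n",
           range_covered(vmas, 3, 0x1000, 0x5000),   /* fully covered -> 1 */
           range_covered(vmas, 3, 0x4000, 0x7000));  /* hole at 0x5000 -> 0 */
    return 0;
}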
mm/migrate.c (16 changed lines)

@@ -1168,15 +1168,11 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
				   enum migrate_reason reason)
{
	int rc = MIGRATEPAGE_SUCCESS;
-	struct page *newpage;
+	struct page *newpage = NULL;

	if (!thp_migration_supported() && PageTransHuge(page))
		return -ENOMEM;

-	newpage = get_new_page(page, private);
-	if (!newpage)
-		return -ENOMEM;
-
	if (page_count(page) == 1) {
		/* page was freed from under us. So we are done. */
		ClearPageActive(page);

@@ -1187,13 +1183,13 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
			__ClearPageIsolated(page);
		unlock_page(page);
	}
-		if (put_new_page)
-			put_new_page(newpage, private);
-		else
-			put_page(newpage);
		goto out;
	}

+	newpage = get_new_page(page, private);
+	if (!newpage)
+		return -ENOMEM;
+
	rc = __unmap_and_move(page, newpage, force, mode);
	if (rc == MIGRATEPAGE_SUCCESS)
		set_page_owner_migrate_reason(newpage, reason);

@@ -1863,7 +1859,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
		if (!zone_watermark_ok(zone, 0,
				       high_wmark_pages(zone) +
				       nr_migrate_pages,
-				       0, 0))
+				       ZONE_MOVABLE, 0))
			continue;
		return true;
	}
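The unmap_and_move() change above moves the destination-page allocation past the cheap "source page was already freed" check, so the old allocate-then-immediately-free round trip disappears. A minimal sketch of that reordering, with made-up stand-ins for the kernel allocation paths, is:

/* Toy illustration: do the cheap early-exit check before the expensive allocation. */
#include <stdio.h>
#include <stdlib.h>

static int migrate_one(int refcount)
{
    char *newbuf;

    if (refcount == 1)         /* source already freed from under us */
        return 0;              /* done, no allocation was wasted */

    newbuf = malloc(4096);     /* allocate only when actually migrating */
    if (!newbuf)
        return -1;
    /* ... copy and switch users over to newbuf ... */
    free(newbuf);
    return 0;
}

int main(void)
{
    printf("%d %d\n", migrate_one(1), migrate_one(3));
    return 0;
}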
mm/mmap.c (63 changed lines)

@@ -641,7 +641,7 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
	struct vm_area_struct *prev, struct rb_node **rb_link,
	struct rb_node *rb_parent)
{
-	__vma_link_list(mm, vma, prev, rb_parent);
+	__vma_link_list(mm, vma, prev);
	__vma_link_rb(mm, vma, rb_link, rb_parent);
}

@@ -684,37 +684,14 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)

static __always_inline void __vma_unlink_common(struct mm_struct *mm,
						struct vm_area_struct *vma,
-						struct vm_area_struct *prev,
-						bool has_prev,
						struct vm_area_struct *ignore)
{
-	struct vm_area_struct *next;
-
	vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
-	next = vma->vm_next;
-	if (has_prev)
-		prev->vm_next = next;
-	else {
-		prev = vma->vm_prev;
-		if (prev)
-			prev->vm_next = next;
-		else
-			mm->mmap = next;
-	}
-	if (next)
-		next->vm_prev = prev;
-
+	__vma_unlink_list(mm, vma);
	/* Kill the cache */
	vmacache_invalidate(mm);
}

-static inline void __vma_unlink_prev(struct mm_struct *mm,
-				     struct vm_area_struct *vma,
-				     struct vm_area_struct *prev)
-{
-	__vma_unlink_common(mm, vma, prev, true, vma);
-}
-
/*
 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
 * is already present in an i_mmap tree without adjusting the tree.

@@ -769,8 +746,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
			remove_next = 1 + (end > next->vm_end);
			VM_WARN_ON(remove_next == 2 &&
				   end != next->vm_next->vm_end);
-			VM_WARN_ON(remove_next == 1 &&
-				   end != next->vm_end);
			/* trim end to next, for case 6 first pass */
			end = next->vm_end;
		}

@@ -889,7 +864,7 @@ again:
		 * us to remove next before dropping the locks.
		 */
		if (remove_next != 3)
-			__vma_unlink_prev(mm, next, vma);
+			__vma_unlink_common(mm, next, next);
		else
			/*
			 * vma is not before next if they've been

@@ -900,7 +875,7 @@ again:
			 * "next" (which is stored in post-swap()
			 * "vma").
			 */
-			__vma_unlink_common(mm, next, NULL, false, vma);
+			__vma_unlink_common(mm, next, vma);
		if (file)
			__remove_shared_vm_struct(next, file, mapping);
	} else if (insert) {

@@ -1116,15 +1091,18 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
 * the area passed down from mprotect_fixup, never extending beyond one
 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
 *
- * AAAA AAAA AAAA AAAA
+ * AAAA AAAA AAAA
- * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX
+ * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN
- * cannot merge might become might become might become
+ * cannot merge might become might become
- * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or
+ * PPNNNNNNNNNN PPPPPPPPPPNN
- * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or
+ * mmap, brk or case 4 below case 5 below
- * mremap move: PPPPXXXXXXXX 8
+ * mremap move:
- * AAAA
+ * AAAA AAAA
- * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN
+ * PPPP NNNN PPPPNNNNXXXX
- * might become case 1 below case 2 below case 3 below
+ * might become might become
+ * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or
+ * PPPPPPPPNNNN 2 or PPPPPPPPXXXX 7 or
+ * PPPPNNNNNNNN 3 PPPPXXXXXXXX 8
 *
 * It is important for case 8 that the vma NNNN overlapping the
 * region AAAA is never going to extended over XXXX. Instead XXXX must

@@ -1442,7 +1420,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
	 * that it represents a valid section of the address space.
	 */
	addr = get_unmapped_area(file, addr, len, pgoff, flags);
-	if (offset_in_page(addr))
+	if (IS_ERR_VALUE(addr))
		return addr;

	if (flags & MAP_FIXED_NOREPLACE) {

@@ -3006,15 +2984,16 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
	struct rb_node **rb_link, *rb_parent;
	pgoff_t pgoff = addr >> PAGE_SHIFT;
	int error;
+	unsigned long mapped_addr;

	/* Until we need other flags, refuse anything except VM_EXEC. */
	if ((flags & (~VM_EXEC)) != 0)
		return -EINVAL;
	flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;

-	error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
-	if (offset_in_page(error))
-		return error;
+	mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+	if (IS_ERR_VALUE(mapped_addr))
+		return mapped_addr;

	error = mlock_future_check(mm, mm->def_flags, len);
	if (error)

mm/mprotect.c

@@ -80,6 +80,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
			if (prot_numa) {
				struct page *page;

+				/* Avoid TLB flush if possible */
+				if (pte_protnone(oldpte))
+					continue;
+
				page = vm_normal_page(vma, addr, oldpte);
				if (!page || PageKsm(page))
					continue;

@@ -97,10 +101,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
				if (page_is_file_cache(page) && PageDirty(page))
					continue;

-				/* Avoid TLB flush if possible */
-				if (pte_protnone(oldpte))
-					continue;
-
				/*
				 * Don't mess with PTEs if page is already on the node
				 * a single-threaded process is running on.

mm/mremap.c

@@ -558,7 +558,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
				((addr - vma->vm_start) >> PAGE_SHIFT),
				map_flags);
-	if (offset_in_page(ret))
+	if (IS_ERR_VALUE(ret))
		goto out1;

	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,

@@ -706,7 +706,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
					vma->vm_pgoff +
					((addr - vma->vm_start) >> PAGE_SHIFT),
					map_flags);
-		if (offset_in_page(new_addr)) {
+		if (IS_ERR_VALUE(new_addr)) {
			ret = new_addr;
			goto out;
		}

mm/nommu.c (10 changed lines)

@@ -648,7 +648,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
	if (rb_prev)
		prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);

-	__vma_link_list(mm, vma, prev, parent);
+	__vma_link_list(mm, vma, prev);
}

/*

@@ -684,13 +684,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
	/* remove from the MM's tree and list */
	rb_erase(&vma->vm_rb, &mm->mm_rb);

-	if (vma->vm_prev)
-		vma->vm_prev->vm_next = vma->vm_next;
-	else
-		mm->mmap = vma->vm_next;
-
-	if (vma->vm_next)
-		vma->vm_next->vm_prev = vma->vm_prev;
+	__vma_unlink_list(mm, vma);
}

/*
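Several hunks above replace offset_in_page() tests on get_unmapped_area() results with IS_ERR_VALUE(). get_unmapped_area() returns either a page-aligned address or a negated errno, so IS_ERR_VALUE() states that contract directly, while the old test only worked because small errno values never happen to be page aligned. The sketch below is a standalone illustration of how the two checks classify the same values; the IS_ERR_VALUE macro here is modeled on, but simplified from, the kernel's definition in include/linux/err.h.

/* Why IS_ERR_VALUE() expresses the intent: -errno lives in the top 4095
 * unsigned long values, independent of page alignment. */
#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO          4095
#define IS_ERR_VALUE(x)    ((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)
#define offset_in_page(p)  ((unsigned long)(p) & 4095)

int main(void)
{
    unsigned long good = 0x7f0000000000UL;        /* plausible mapping address */
    unsigned long err  = (unsigned long)-ENOMEM;  /* encoded error return */

    printf("good: IS_ERR_VALUE=%d offset_in_page=%lu\n",
           (int)IS_ERR_VALUE(good), offset_in_page(good));
    printf("err:  IS_ERR_VALUE=%d offset_in_page=%lu\n",
           (int)IS_ERR_VALUE(err), offset_in_page(err));
    return 0;
}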
mm/page_alloc.c (137 changed lines)

@@ -5354,6 +5354,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
			" min:%lukB"
			" low:%lukB"
			" high:%lukB"
+			" reserved_highatomic:%luKB"
			" active_anon:%lukB"
			" inactive_anon:%lukB"
			" active_file:%lukB"

@@ -5375,6 +5376,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
			K(min_wmark_pages(zone)),
			K(low_wmark_pages(zone)),
			K(high_wmark_pages(zone)),
+			K(zone->nr_reserved_highatomic),
			K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
			K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
			K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),

@@ -6711,7 +6713,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)

	pgdat_page_ext_init(pgdat);
	spin_lock_init(&pgdat->lru_lock);
-	lruvec_init(node_lruvec(pgdat));
+	lruvec_init(&pgdat->__lruvec);
}

static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,

@@ -7988,6 +7990,15 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
	return 0;
}

+static void __zone_pcp_update(struct zone *zone)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu)
+		pageset_set_high_and_batch(zone,
+				per_cpu_ptr(zone->pageset, cpu));
+}
+
/*
 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
 * cpu. It is the fraction of total pages in each zone that a hot per cpu

@@ -8019,13 +8030,8 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
	if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
		goto out;

-	for_each_populated_zone(zone) {
-		unsigned int cpu;
-
-		for_each_possible_cpu(cpu)
-			pageset_set_high_and_batch(zone,
-					per_cpu_ptr(zone->pageset, cpu));
-	}
+	for_each_populated_zone(zone)
+		__zone_pcp_update(zone);
out:
	mutex_unlock(&pcp_batch_high_lock);
	return ret;

@@ -8261,7 +8267,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
		 * The HWPoisoned page may be not in buddy system, and
		 * page_count() is not 0.
		 */
-		if ((flags & SKIP_HWPOISON) && PageHWPoison(page))
+		if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
			continue;

		if (__PageMovable(page))

@@ -8477,7 +8483,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
	}

	/* Make sure the range is really isolated. */
-	if (test_pages_isolated(outer_start, end, false)) {
+	if (test_pages_isolated(outer_start, end, 0)) {
		pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
			__func__, outer_start, end);
		ret = -EBUSY;

@@ -8502,6 +8508,107 @@ done:
				pfn_max_align_up(end), migratetype);
	return ret;
}
+
+static int __alloc_contig_pages(unsigned long start_pfn,
+				unsigned long nr_pages, gfp_t gfp_mask)
+{
+	unsigned long end_pfn = start_pfn + nr_pages;
+
+	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
+				  gfp_mask);
+}
+
+static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
+				   unsigned long nr_pages)
+{
+	unsigned long i, end_pfn = start_pfn + nr_pages;
+	struct page *page;
+
+	for (i = start_pfn; i < end_pfn; i++) {
+		page = pfn_to_online_page(i);
+		if (!page)
+			return false;
+
+		if (page_zone(page) != z)
+			return false;
+
+		if (PageReserved(page))
+			return false;
+
+		if (page_count(page) > 0)
+			return false;
+
+		if (PageHuge(page))
+			return false;
+	}
+	return true;
+}
+
+static bool zone_spans_last_pfn(const struct zone *zone,
+				unsigned long start_pfn, unsigned long nr_pages)
+{
+	unsigned long last_pfn = start_pfn + nr_pages - 1;
+
+	return zone_spans_pfn(zone, last_pfn);
+}
+
+/**
+ * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
+ * @nr_pages:	Number of contiguous pages to allocate
+ * @gfp_mask:	GFP mask to limit search and used during compaction
+ * @nid:	Target node
+ * @nodemask:	Mask for other possible nodes
+ *
+ * This routine is a wrapper around alloc_contig_range(). It scans over zones
+ * on an applicable zonelist to find a contiguous pfn range which can then be
+ * tried for allocation with alloc_contig_range(). This routine is intended
+ * for allocation requests which can not be fulfilled with the buddy allocator.
+ *
+ * The allocated memory is always aligned to a page boundary. If nr_pages is a
+ * power of two then the alignment is guaranteed to be to the given nr_pages
+ * (e.g. 1GB request would be aligned to 1GB).
+ *
+ * Allocated pages can be freed with free_contig_range() or by manually calling
+ * __free_page() on each allocated page.
+ *
+ * Return: pointer to contiguous pages on success, or NULL if not successful.
+ */
+struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
+				int nid, nodemask_t *nodemask)
+{
+	unsigned long ret, pfn, flags;
+	struct zonelist *zonelist;
+	struct zone *zone;
+	struct zoneref *z;
+
+	zonelist = node_zonelist(nid, gfp_mask);
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+					gfp_zone(gfp_mask), nodemask) {
+		spin_lock_irqsave(&zone->lock, flags);
+
+		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
+		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
+			if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
+				/*
+				 * We release the zone lock here because
+				 * alloc_contig_range() will also lock the zone
+				 * at some point. If there's an allocation
+				 * spinning on this lock, it may win the race
+				 * and cause alloc_contig_range() to fail...
+				 */
+				spin_unlock_irqrestore(&zone->lock, flags);
+				ret = __alloc_contig_pages(pfn, nr_pages,
+							   gfp_mask);
+				if (!ret)
+					return pfn_to_page(pfn);
+				spin_lock_irqsave(&zone->lock, flags);
+			}
+			pfn += nr_pages;
+		}
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+	return NULL;
+}
#endif /* CONFIG_CONTIG_ALLOC */

void free_contig_range(unsigned long pfn, unsigned int nr_pages)

@@ -8523,11 +8630,8 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages)
 */
void __meminit zone_pcp_update(struct zone *zone)
{
-	unsigned cpu;
-
	mutex_lock(&pcp_batch_high_lock);
-	for_each_possible_cpu(cpu)
-		pageset_set_high_and_batch(zone,
-				per_cpu_ptr(zone->pageset, cpu));
+	__zone_pcp_update(zone);
	mutex_unlock(&pcp_batch_high_lock);
}

@@ -8560,7 +8664,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	struct page *page;
	struct zone *zone;
-	unsigned int order, i;
+	unsigned int order;
	unsigned long pfn;
	unsigned long flags;
	unsigned long offlined_pages = 0;

@@ -8588,7 +8692,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
		 */
		if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
			pfn++;
-			SetPageReserved(page);
			offlined_pages++;
			continue;
		}

@@ -8602,8 +8705,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
			pfn, 1 << order, end_pfn);
#endif
		del_page_from_free_area(page, &zone->free_area[order]);
-		for (i = 0; i < (1 << order); i++)
-			SetPageReserved((page+i));
		pfn += (1 << order);
	}
	spin_unlock_irqrestore(&zone->lock, flags);
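The new alloc_contig_pages() walks each candidate zone in nr_pages-aligned windows and hands the first plausible window to alloc_contig_range(). The standalone sketch below reproduces only that search loop; the page_busy[] bitmap and zone bounds are invented stand-ins for the per-page checks that pfn_range_valid_contig() performs.

/* Sketch of the aligned-window scan behind alloc_contig_pages(). */
#include <stdio.h>
#include <stdbool.h>

#define ZONE_START 0UL
#define ZONE_END   32UL                 /* exclusive */

static bool page_busy[32] = { [3] = true, [9] = true, [10] = true, [17] = true };

static bool window_free(unsigned long pfn, unsigned long nr)
{
    for (unsigned long i = pfn; i < pfn + nr; i++)
        if (page_busy[i])
            return false;
    return true;
}

static long find_contig(unsigned long nr)
{
    /* start at the first nr-aligned pfn inside the zone */
    unsigned long pfn = (ZONE_START + nr - 1) / nr * nr;

    for (; pfn + nr <= ZONE_END; pfn += nr)   /* whole window must fit */
        if (window_free(pfn, nr))
            return (long)pfn;
    return -1;
}

int main(void)
{
    printf("4 pages -> pfn %ld, 8 pages -> pfn %ld\n", find_contig(4), find_contig(8));
    return 0;
}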
mm/page_io.c (15 changed lines)

@@ -22,6 +22,7 @@
#include <linux/writeback.h>
#include <linux/frontswap.h>
#include <linux/blkdev.h>
+#include <linux/psi.h>
#include <linux/uio.h>
#include <linux/sched/task.h>
#include <asm/pgtable.h>

@@ -354,10 +355,19 @@ int swap_readpage(struct page *page, bool synchronous)
	struct swap_info_struct *sis = page_swap_info(page);
	blk_qc_t qc;
	struct gendisk *disk;
+	unsigned long pflags;

	VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageUptodate(page), page);
+
+	/*
+	 * Count submission time as memory stall. When the device is congested,
+	 * or the submitting cgroup IO-throttled, submission can be a
+	 * significant part of overall IO time.
+	 */
+	psi_memstall_enter(&pflags);

	if (frontswap_load(page) == 0) {
		SetPageUptodate(page);
		unlock_page(page);

@@ -371,7 +381,7 @@ int swap_readpage(struct page *page, bool synchronous)
		ret = mapping->a_ops->readpage(swap_file, page);
		if (!ret)
			count_vm_event(PSWPIN);
-		return ret;
+		goto out;
	}

	ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);

@@ -382,7 +392,7 @@ int swap_readpage(struct page *page, bool synchronous)
		}

		count_vm_event(PSWPIN);
-		return 0;
+		goto out;
	}

	ret = 0;

@@ -418,6 +428,7 @@ int swap_readpage(struct page *page, bool synchronous)
	bio_put(bio);

out:
+	psi_memstall_leave(&pflags);
	return ret;
}
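The swap_readpage() change brackets submission with psi_memstall_enter()/psi_memstall_leave(), which is why the early "return ret" paths become "goto out": every exit has to close the stall window. A toy standalone model of that bracketing follows; the timing helpers here are stand-ins and not the kernel PSI API.

/* Toy model: all exits, including early ones, funnel through the leave call. */
#include <stdio.h>
#include <time.h>

static double stall_start;

static void memstall_enter(void) { stall_start = (double)clock() / CLOCKS_PER_SEC; }
static void memstall_leave(void)
{
    printf("stalled for %.6f s\n", (double)clock() / CLOCKS_PER_SEC - stall_start);
}

static int read_page(int fast_path)
{
    int ret = 0;

    memstall_enter();
    if (fast_path)
        goto out;                        /* early return still crosses "out:" */
    for (volatile long i = 0; i < 10000000; i++)
        ;                                /* pretend to submit and wait for I/O */
out:
    memstall_leave();
    return ret;
}

int main(void)
{
    read_page(1);
    read_page(0);
    return 0;
}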
mm/page_isolation.c

@@ -168,7 +168,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
 * @migratetype:	Migrate type to set in error recovery.
 * @flags:		The following flags are allowed (they can be combined in
 *			a bit mask)
- *			SKIP_HWPOISON - ignore hwpoison pages
+ *			MEMORY_OFFLINE - isolate to offline (!allocate) memory
+ *					 e.g., skip over PageHWPoison() pages
 *			REPORT_FAILURE - report details about the failure to
 *			isolate the range
 *

@@ -257,7 +258,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 */
static unsigned long
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
-				  bool skip_hwpoisoned_pages)
+				  int flags)
{
	struct page *page;

@@ -274,7 +275,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
			 * simple way to verify that as VM_BUG_ON(), though.
			 */
			pfn += 1 << page_order(page);
-		else if (skip_hwpoisoned_pages && PageHWPoison(page))
+		else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
			/* A HWPoisoned page cannot be also PageBuddy */
			pfn++;
		else

@@ -286,7 +287,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,

/* Caller should ensure that requested range is in a single zone */
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
-			bool skip_hwpoisoned_pages)
+			int isol_flags)
{
	unsigned long pfn, flags;
	struct page *page;

@@ -308,8 +309,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
	/* Check all pages are free or marked as ISOLATED */
	zone = page_zone(page);
	spin_lock_irqsave(&zone->lock, flags);
-	pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
-						skip_hwpoisoned_pages);
+	pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, isol_flags);
	spin_unlock_irqrestore(&zone->lock, flags);

	trace_test_pages_isolated(start_pfn, end_pfn, pfn);
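Across page isolation the old skip_hwpoisoned_pages bool becomes an int flag word (MEMORY_OFFLINE, combinable with REPORT_FAILURE), so new behaviours can be added without growing the parameter list again. A minimal sketch of that bool-to-bitmask interface style is below; the numeric flag values are illustrative only, not the kernel's definitions.

/* Sketch: one int of flags instead of a growing list of bools. */
#include <stdio.h>

#define MEMORY_OFFLINE  0x1
#define REPORT_FAILURE  0x2

static void isolate_range(int flags)
{
    if (flags & MEMORY_OFFLINE)
        printf("skipping HWPoison pages (offlining)\n");
    if (flags & REPORT_FAILURE)
        printf("will report isolation failures\n");
}

int main(void)
{
    isolate_range(MEMORY_OFFLINE | REPORT_FAILURE);
    isolate_range(0);
    return 0;
}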
mm/pgtable-generic.c

@@ -24,18 +24,27 @@ void pgd_clear_bad(pgd_t *pgd)
	pgd_clear(pgd);
}

+#ifndef __PAGETABLE_P4D_FOLDED
void p4d_clear_bad(p4d_t *p4d)
{
	p4d_ERROR(*p4d);
	p4d_clear(p4d);
}
+#endif

+#ifndef __PAGETABLE_PUD_FOLDED
void pud_clear_bad(pud_t *pud)
{
	pud_ERROR(*pud);
	pud_clear(pud);
}
+#endif

+/*
+ * Note that the pmd variant below can't be stub'ed out just as for p4d/pud
+ * above. pmd folding is special and typically pmd_* macros refer to upper
+ * level even when folded
+ */
void pmd_clear_bad(pmd_t *pmd)
{
	pmd_ERROR(*pmd);
mm/rmap.c (65 changed lines)

@@ -251,18 +251,37 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
 * Attach the anon_vmas from src to dst.
 * Returns 0 on success, -ENOMEM on failure.
 *
- * If dst->anon_vma is NULL this function tries to find and reuse existing
- * anon_vma which has no vmas and only one child anon_vma. This prevents
- * degradation of anon_vma hierarchy to endless linear chain in case of
- * constantly forking task. On the other hand, an anon_vma with more than one
- * child isn't reused even if there was no alive vma, thus rmap walker has a
- * good chance of avoiding scanning the whole hierarchy when it searches where
- * page is mapped.
+ * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and
+ * anon_vma_fork(). The first three want an exact copy of src, while the last
+ * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
+ * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
+ * we can identify this case by checking (!dst->anon_vma && src->anon_vma).
+ *
+ * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
+ * and reuse existing anon_vma which has no vmas and only one child anon_vma.
+ * This prevents degradation of anon_vma hierarchy to endless linear chain in
+ * case of constantly forking task. On the other hand, an anon_vma with more
+ * than one child isn't reused even if there was no alive vma, thus rmap
+ * walker has a good chance of avoiding scanning the whole hierarchy when it
+ * searches where page is mapped.
 */
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
	struct anon_vma_chain *avc, *pavc;
	struct anon_vma *root = NULL;
+	struct vm_area_struct *prev = dst->vm_prev, *pprev = src->vm_prev;
+
+	/*
+	 * If parent share anon_vma with its vm_prev, keep this sharing in in
+	 * child.
+	 *
+	 * 1. Parent has vm_prev, which implies we have vm_prev.
+	 * 2. Parent and its vm_prev have the same anon_vma.
+	 */
+	if (!dst->anon_vma && src->anon_vma &&
+	    pprev && pprev->anon_vma == src->anon_vma)
+		dst->anon_vma = prev->anon_vma;
+

	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma;

@@ -287,8 +306,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
		 * will always reuse it. Root anon_vma is never reused:
		 * it has self-parent reference and at least one child.
		 */
-		if (!dst->anon_vma && anon_vma != src->anon_vma &&
-		    anon_vma->degree < 2)
+		if (!dst->anon_vma && src->anon_vma &&
+		    anon_vma != src->anon_vma && anon_vma->degree < 2)
			dst->anon_vma = anon_vma;
	}
	if (dst->anon_vma)

@@ -458,9 +477,10 @@ void __init anon_vma_init(void)
 * chain and verify that the page in question is indeed mapped in it
 * [ something equivalent to page_mapped_in_vma() ].
 *
- * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
- * that the anon_vma pointer from page->mapping is valid if there is a
- * mapcount, we can dereference the anon_vma after observing those.
+ * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
+ * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
+ * if there is a mapcount, we can dereference the anon_vma after observing
+ * those.
 */
struct anon_vma *page_get_anon_vma(struct page *page)
{

@@ -1055,7 +1075,6 @@ static void __page_set_anon_rmap(struct page *page,
static void __page_check_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
-#ifdef CONFIG_DEBUG_VM
	/*
	 * The page's anon-rmap details (mapping and index) are guaranteed to
	 * be set up correctly at this point.

@@ -1068,9 +1087,9 @@ static void __page_check_anon_rmap(struct page *page,
	 * are initially only visible via the pagetables, and the pte is locked
	 * over the call to page_add_new_anon_rmap.
	 */
-	BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
-	BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
-#endif
+	VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page);
+	VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
+		       page);
}

/**

@@ -1273,12 +1292,20 @@ static void page_remove_anon_compound_rmap(struct page *page)
	if (TestClearPageDoubleMap(page)) {
		/*
		 * Subpages can be mapped with PTEs too. Check how many of
-		 * themi are still mapped.
+		 * them are still mapped.
		 */
		for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
			if (atomic_add_negative(-1, &page[i]._mapcount))
				nr++;
		}

+		/*
+		 * Queue the page for deferred split if at least one small
+		 * page of the compound page is unmapped, but at least one
+		 * small page is still mapped.
+		 */
+		if (nr && nr < HPAGE_PMD_NR)
+			deferred_split_huge_page(page);
	} else {
		nr = HPAGE_PMD_NR;
	}

@@ -1286,10 +1313,8 @@ static void page_remove_anon_compound_rmap(struct page *page)
	if (unlikely(PageMlocked(page)))
		clear_page_mlock(page);

-	if (nr) {
+	if (nr)
		__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
-		deferred_split_huge_page(page);
-	}
}

/**
|
|
mm/shmem.c
@@ -1369,7 +1369,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	if (list_empty(&info->swaplist))
 		list_add(&info->swaplist, &shmem_swaplist);
 
-	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
+	if (add_to_swap_cache(page, swap,
+			__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN) == 0) {
 		spin_lock_irq(&info->lock);
 		shmem_recalc_inode(inode);
 		info->swapped++;

@@ -2022,16 +2023,14 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
 		    shmem_falloc->waitq &&
 		    vmf->pgoff >= shmem_falloc->start &&
 		    vmf->pgoff < shmem_falloc->next) {
+			struct file *fpin;
 			wait_queue_head_t *shmem_falloc_waitq;
 			DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
 
 			ret = VM_FAULT_NOPAGE;
-			if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
-			   !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
-				/* It's polite to up mmap_sem if we can */
-				up_read(&vma->vm_mm->mmap_sem);
+			fpin = maybe_unlock_mmap_for_io(vmf, NULL);
+			if (fpin)
 				ret = VM_FAULT_RETRY;
-			}
 
 			shmem_falloc_waitq = shmem_falloc->waitq;
 			prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,

@@ -2049,6 +2048,9 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
 			spin_lock(&inode->i_lock);
 			finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
 			spin_unlock(&inode->i_lock);
+
+			if (fpin)
+				fput(fpin);
 			return ret;
 		}
 		spin_unlock(&inode->i_lock);

@@ -2213,11 +2215,14 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 			return -EPERM;
 
 		/*
-		 * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED
-		 * read-only mapping, take care to not allow mprotect to revert
-		 * protections.
+		 * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
+		 * MAP_SHARED and read-only, take care to not allow mprotect to
+		 * revert protections on such mappings. Do this only for shared
+		 * mappings. For private mappings, don't need to mask
+		 * VM_MAYWRITE as we still want them to be COW-writable.
 		 */
-		vma->vm_flags &= ~(VM_MAYWRITE);
+		if (vma->vm_flags & VM_SHARED)
+			vma->vm_flags &= ~(VM_MAYWRITE);
 	}
 
 	file_accessed(file);

@@ -2742,7 +2747,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		}
 
 		shmem_falloc.waitq = &shmem_falloc_waitq;
-		shmem_falloc.start = unmap_start >> PAGE_SHIFT;
+		shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
 		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
 		spin_lock(&inode->i_lock);
 		inode->i_private = &shmem_falloc;

@@ -3928,7 +3933,7 @@ out2:
 static ssize_t shmem_enabled_show(struct kobject *kobj,
 		struct kobj_attribute *attr, char *buf)
 {
-	int values[] = {
+	static const int values[] = {
 		SHMEM_HUGE_ALWAYS,
 		SHMEM_HUGE_WITHIN_SIZE,
 		SHMEM_HUGE_ADVISE,
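One way to read the shmem_mmap() change: only shared mappings of a memfd sealed with F_SEAL_FUTURE_WRITE lose VM_MAYWRITE, so private mappings remain CoW-writable. A minimal restatement of that rule, with a hypothetical helper name:

/* Sketch only: the mmap-time masking rule from the hunk above. */
static void mask_sealed_future_write(struct vm_area_struct *vma)
{
	/* private mappings stay COW-writable, so only mask shared ones */
	if (vma->vm_flags & VM_SHARED)
		vma->vm_flags &= ~VM_MAYWRITE;
}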
mm/slab.c
@@ -1247,9 +1247,10 @@ void __init kmem_cache_init(void)
 	 * structures first.  Without this, further allocations will bug.
 	 */
 	kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache(
-				kmalloc_info[INDEX_NODE].name,
-				kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS,
-				0, kmalloc_size(INDEX_NODE));
+				kmalloc_info[INDEX_NODE].name[KMALLOC_NORMAL],
+				kmalloc_info[INDEX_NODE].size,
+				ARCH_KMALLOC_FLAGS, 0,
+				kmalloc_info[INDEX_NODE].size);
 	slab_state = PARTIAL_NODE;
 	setup_kmalloc_cache_index_table();

mm/slab.h
@@ -139,7 +139,7 @@ extern struct kmem_cache *kmem_cache;
 
 /* A table of kmalloc cache names and sizes */
 extern const struct kmalloc_info_struct {
-	const char *name;
+	const char *name[NR_KMALLOC_TYPES];
 	unsigned int size;
 } kmalloc_info[];
 
@@ -369,7 +369,7 @@ static __always_inline int memcg_charge_slab(struct page *page,
 	if (ret)
 		goto out;
 
-	lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
+	lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
 	mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order);
 
 	/* transer try_charge() page references to kmem_cache */

@@ -393,7 +393,7 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order,
 	rcu_read_lock();
 	memcg = READ_ONCE(s->memcg_params.memcg);
 	if (likely(!mem_cgroup_is_root(memcg))) {
-		lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
+		lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
 		mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order));
 		memcg_kmem_uncharge_memcg(page, order, memcg);
 	} else {

mm/slab_common.c
@@ -1139,26 +1139,56 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
 	return kmalloc_caches[kmalloc_type(flags)][index];
 }
 
+#ifdef CONFIG_ZONE_DMA
+#define INIT_KMALLOC_INFO(__size, __short_size)			\
+{								\
+	.name[KMALLOC_NORMAL]  = "kmalloc-" #__short_size,	\
+	.name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size,	\
+	.name[KMALLOC_DMA]     = "dma-kmalloc-" #__short_size,	\
+	.size = __size,						\
+}
+#else
+#define INIT_KMALLOC_INFO(__size, __short_size)			\
+{								\
+	.name[KMALLOC_NORMAL]  = "kmalloc-" #__short_size,	\
+	.name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size,	\
+	.size = __size,						\
+}
+#endif
+
 /*
  * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
  * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
  * kmalloc-67108864.
  */
 const struct kmalloc_info_struct kmalloc_info[] __initconst = {
-	{NULL, 0},		{"kmalloc-96", 96},
-	{"kmalloc-192", 192},	{"kmalloc-8", 8},
-	{"kmalloc-16", 16},	{"kmalloc-32", 32},
-	{"kmalloc-64", 64},	{"kmalloc-128", 128},
-	{"kmalloc-256", 256},	{"kmalloc-512", 512},
-	{"kmalloc-1k", 1024},	{"kmalloc-2k", 2048},
-	{"kmalloc-4k", 4096},	{"kmalloc-8k", 8192},
-	{"kmalloc-16k", 16384},	{"kmalloc-32k", 32768},
-	{"kmalloc-64k", 65536},	{"kmalloc-128k", 131072},
-	{"kmalloc-256k", 262144},	{"kmalloc-512k", 524288},
-	{"kmalloc-1M", 1048576},	{"kmalloc-2M", 2097152},
-	{"kmalloc-4M", 4194304},	{"kmalloc-8M", 8388608},
-	{"kmalloc-16M", 16777216},	{"kmalloc-32M", 33554432},
-	{"kmalloc-64M", 67108864}
+	INIT_KMALLOC_INFO(0, 0),
+	INIT_KMALLOC_INFO(96, 96),
+	INIT_KMALLOC_INFO(192, 192),
+	INIT_KMALLOC_INFO(8, 8),
+	INIT_KMALLOC_INFO(16, 16),
+	INIT_KMALLOC_INFO(32, 32),
+	INIT_KMALLOC_INFO(64, 64),
+	INIT_KMALLOC_INFO(128, 128),
+	INIT_KMALLOC_INFO(256, 256),
+	INIT_KMALLOC_INFO(512, 512),
+	INIT_KMALLOC_INFO(1024, 1k),
+	INIT_KMALLOC_INFO(2048, 2k),
+	INIT_KMALLOC_INFO(4096, 4k),
+	INIT_KMALLOC_INFO(8192, 8k),
+	INIT_KMALLOC_INFO(16384, 16k),
+	INIT_KMALLOC_INFO(32768, 32k),
+	INIT_KMALLOC_INFO(65536, 64k),
+	INIT_KMALLOC_INFO(131072, 128k),
+	INIT_KMALLOC_INFO(262144, 256k),
+	INIT_KMALLOC_INFO(524288, 512k),
+	INIT_KMALLOC_INFO(1048576, 1M),
+	INIT_KMALLOC_INFO(2097152, 2M),
+	INIT_KMALLOC_INFO(4194304, 4M),
+	INIT_KMALLOC_INFO(8388608, 8M),
+	INIT_KMALLOC_INFO(16777216, 16M),
+	INIT_KMALLOC_INFO(33554432, 32M),
+	INIT_KMALLOC_INFO(67108864, 64M)
 };
 
 /*

@@ -1208,36 +1238,14 @@ void __init setup_kmalloc_cache_index_table(void)
 	}
 }
 
-static const char *
-kmalloc_cache_name(const char *prefix, unsigned int size)
-{
-
-	static const char units[3] = "\0kM";
-	int idx = 0;
-
-	while (size >= 1024 && (size % 1024 == 0)) {
-		size /= 1024;
-		idx++;
-	}
-
-	return kasprintf(GFP_NOWAIT, "%s-%u%c", prefix, size, units[idx]);
-}
-
 static void __init
-new_kmalloc_cache(int idx, int type, slab_flags_t flags)
+new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
 {
-	const char *name;
-
-	if (type == KMALLOC_RECLAIM) {
+	if (type == KMALLOC_RECLAIM)
 		flags |= SLAB_RECLAIM_ACCOUNT;
-		name = kmalloc_cache_name("kmalloc-rcl",
-						kmalloc_info[idx].size);
-		BUG_ON(!name);
-	} else {
-		name = kmalloc_info[idx].name;
-	}
 
-	kmalloc_caches[type][idx] = create_kmalloc_cache(name,
+	kmalloc_caches[type][idx] = create_kmalloc_cache(
+					kmalloc_info[idx].name[type],
 					kmalloc_info[idx].size, flags, 0,
 					kmalloc_info[idx].size);
 }

@@ -1249,7 +1257,8 @@ new_kmalloc_cache(int idx, int type, slab_flags_t flags)
  */
 void __init create_kmalloc_caches(slab_flags_t flags)
 {
-	int i, type;
+	int i;
+	enum kmalloc_cache_type type;
 
 	for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
 		for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {

@@ -1278,12 +1287,10 @@ void __init create_kmalloc_caches(slab_flags_t flags)
 		struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i];
 
 		if (s) {
-			unsigned int size = kmalloc_size(i);
-			const char *n = kmalloc_cache_name("dma-kmalloc", size);
-
-			BUG_ON(!n);
 			kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
-				n, size, SLAB_CACHE_DMA | flags, 0, 0);
+				kmalloc_info[i].name[KMALLOC_DMA],
+				kmalloc_info[i].size,
+				SLAB_CACHE_DMA | flags, 0, 0);
 		}
 	}
 #endif
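For readers less used to designated array initializers, this is what a single entry of the new table expands to when CONFIG_ZONE_DMA is set, taking the 1k size class as an example (expanded by hand for illustration; the real table is generated by the macro above):

/* INIT_KMALLOC_INFO(1024, 1k) with CONFIG_ZONE_DMA, expanded by hand. */
{
	.name[KMALLOC_NORMAL]  = "kmalloc-1k",
	.name[KMALLOC_RECLAIM] = "kmalloc-rcl-1k",
	.name[KMALLOC_DMA]     = "dma-kmalloc-1k",
	.size = 1024,
},

new_kmalloc_cache() and the DMA loop then pick the right string with kmalloc_info[idx].name[type] instead of building it at boot time with kasprintf().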
mm/slub.c
@@ -93,9 +93,7 @@
  * minimal so we rely on the page allocators per cpu caches for
  * fast frees and allocs.
  *
- * Overloading of page flags that are otherwise used for LRU management.
- *
- * PageActive		The slab is frozen and exempt from list processing.
+ * page->frozen		The slab is frozen and exempt from list processing.
  * 			This means that the slab is dedicated to a purpose
  * 			such as satisfying allocations for a specific
  * 			processor. Objects may be freed in the slab while

@@ -111,7 +109,7 @@
  * 			free objects in addition to the regular freelist
  * 			that requires the slab lock.
  *
- * PageError		Slab requires special handling due to debug
+ * SLAB_DEBUG_FLAGS	Slab requires special handling due to debug
  * 			options set. This moves	slab handling out of
  * 			the fast path and disables lockless freelists.
  */

@@ -736,6 +734,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
 {
 	u8 *fault;
 	u8 *end;
+	u8 *addr = page_address(page);
 
 	metadata_access_enable();
 	fault = memchr_inv(start, value, bytes);

@@ -748,8 +747,9 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
 		end--;
 
 	slab_bug(s, "%s overwritten", what);
-	pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
-					fault, end - 1, fault[0], value);
+	pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
+					fault, end - 1, fault - addr,
+					fault[0], value);
 	print_trailer(s, page, object);
 
 	restore_bytes(s, what, value, fault, end);

@@ -844,7 +844,8 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
 	while (end > fault && end[-1] == POISON_INUSE)
 		end--;
 
-	slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
+	slab_err(s, page, "Padding overwritten. 0x%p-0x%p @offset=%tu",
+			fault, end - 1, fault - start);
 	print_section(KERN_ERR, "Padding ", pad, remainder);
 
 	restore_bytes(s, "slab padding", POISON_INUSE, fault, end);

@@ -4383,31 +4384,26 @@ static int count_total(struct page *page)
 #endif
 
 #ifdef CONFIG_SLUB_DEBUG
-static int validate_slab(struct kmem_cache *s, struct page *page,
+static void validate_slab(struct kmem_cache *s, struct page *page,
 						unsigned long *map)
 {
 	void *p;
 	void *addr = page_address(page);
 
-	if (!check_slab(s, page) ||
-			!on_freelist(s, page, NULL))
-		return 0;
+	if (!check_slab(s, page) || !on_freelist(s, page, NULL))
+		return;
 
 	/* Now we know that a valid freelist exists */
 	bitmap_zero(map, page->objects);
 
 	get_map(s, page, map);
 	for_each_object(p, s, addr, page->objects) {
-		if (test_bit(slab_index(p, s, addr), map))
-			if (!check_object(s, page, p, SLUB_RED_INACTIVE))
-				return 0;
-	}
-
-	for_each_object(p, s, addr, page->objects)
-		if (!test_bit(slab_index(p, s, addr), map))
-			if (!check_object(s, page, p, SLUB_RED_ACTIVE))
-				return 0;
-	return 1;
+		u8 val = test_bit(slab_index(p, s, addr), map) ?
+			 SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
+
+		if (!check_object(s, page, p, val))
+			break;
+	}
 }
 
 static void validate_slab_slab(struct kmem_cache *s, struct page *page,
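The two report hunks add an @offset= field to the corruption message; it is plain pointer arithmetic between the faulting byte and the base of the page (or padding start). A trivial restatement with a hypothetical helper name:

/* Sketch: how the printed @offset= value is derived in the hunks above. */
static ptrdiff_t report_offset(const u8 *fault, const u8 *base)
{
	return fault - base;	/* printed via the %tu format specifier */
}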
mm/sparse.c
@@ -458,8 +458,7 @@ struct page __init *__populate_section_memmap(unsigned long pfn,
 	if (map)
 		return map;
 
-	map = memblock_alloc_try_nid(size,
-					  PAGE_SIZE, addr,
+	map = memblock_alloc_try_nid_raw(size, size, addr,
 					  MEMBLOCK_ALLOC_ACCESSIBLE, nid);
 	if (!map)
 		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",

@@ -482,10 +481,13 @@ static void __init sparse_buffer_init(unsigned long size, int nid)
 {
 	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
 	WARN_ON(sparsemap_buf);	/* forgot to call sparse_buffer_fini()? */
-	sparsemap_buf =
-		memblock_alloc_try_nid_raw(size, PAGE_SIZE,
-						addr,
-						MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+	/*
+	 * Pre-allocated buffer is mainly used by __populate_section_memmap
+	 * and we want it to be properly aligned to the section size - this is
+	 * especially the case for VMEMMAP which maps memmap to PMDs
+	 */
+	sparsemap_buf = memblock_alloc_exact_nid_raw(size, section_map_size(),
+						addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
 	sparsemap_buf_end = sparsemap_buf + size;
 }
 
@@ -647,7 +649,7 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 #endif
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
-static struct page *populate_section_memmap(unsigned long pfn,
+static struct page * __meminit populate_section_memmap(unsigned long pfn,
 		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
 {
 	return __populate_section_memmap(pfn, nr_pages, nid, altmap);

@@ -669,7 +671,7 @@ static void free_map_bootmem(struct page *memmap)
 	vmemmap_free(start, end, NULL);
 }
 #else
-struct page *populate_section_memmap(unsigned long pfn,
+struct page * __meminit populate_section_memmap(unsigned long pfn,
 		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
 {
 	struct page *page, *ret;
mm/swap.c
@@ -373,9 +373,16 @@ static void __lru_cache_activate_page(struct page *page)
 void mark_page_accessed(struct page *page)
 {
 	page = compound_head(page);
-	if (!PageActive(page) && !PageUnevictable(page) &&
-			PageReferenced(page)) {
 
+	if (!PageReferenced(page)) {
+		SetPageReferenced(page);
+	} else if (PageUnevictable(page)) {
+		/*
+		 * Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
+		 * this list is never rotated or maintained, so marking an
+		 * evictable page accessed has no effect.
+		 */
+	} else if (!PageActive(page)) {
 		/*
 		 * If the page is on the LRU, queue it for activation via
 		 * activate_page_pvecs. Otherwise, assume the page is on a

@@ -389,8 +396,6 @@ void mark_page_accessed(struct page *page)
 		ClearPageReferenced(page);
 		if (page_is_file_cache(page))
 			workingset_activation(page);
-	} else if (!PageReferenced(page)) {
-		SetPageReferenced(page);
 	}
 	if (page_is_idle(page))
 		clear_page_idle(page);

@@ -708,9 +713,10 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
  */
 void lru_add_drain_all(void)
 {
+	static seqcount_t seqcount = SEQCNT_ZERO(seqcount);
 	static DEFINE_MUTEX(lock);
 	static struct cpumask has_work;
-	int cpu;
+	int cpu, seq;
 
 	/*
 	 * Make sure nobody triggers this path before mm_percpu_wq is fully

@@ -719,7 +725,19 @@ void lru_add_drain_all(void)
 	if (WARN_ON(!mm_percpu_wq))
 		return;
 
+	seq = raw_read_seqcount_latch(&seqcount);
+
 	mutex_lock(&lock);
+
+	/*
+	 * Piggyback on drain started and finished while we waited for lock:
+	 * all pages pended at the time of our enter were drained from vectors.
+	 */
+	if (__read_seqcount_retry(&seqcount, seq))
+		goto done;
+
+	raw_write_seqcount_latch(&seqcount);
+
 	cpumask_clear(&has_work);
 
 	for_each_online_cpu(cpu) {

@@ -740,6 +758,7 @@ void lru_add_drain_all(void)
 	for_each_cpu(cpu, &has_work)
 		flush_work(&per_cpu(lru_add_drain_work, cpu));
 
+done:
 	mutex_unlock(&lock);
 }
 #else
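The shape of the "piggyback" optimisation in lru_add_drain_all(): read the latch sequence before taking the mutex, and skip the drain entirely if another caller finished a full drain while we waited. A condensed sketch using the same primitives as the hunk above; it is not a drop-in copy of the function, and the per-cpu queueing is elided:

/* Condensed sketch of the piggyback pattern; drain work itself elided. */
void drain_all_sketch(void)
{
	static seqcount_t seqcount = SEQCNT_ZERO(seqcount);
	static DEFINE_MUTEX(lock);
	int seq;

	seq = raw_read_seqcount_latch(&seqcount);

	mutex_lock(&lock);
	if (__read_seqcount_retry(&seqcount, seq))
		goto done;		/* someone else drained for us */

	raw_write_seqcount_latch(&seqcount);
	/* ... queue and flush the per-cpu drain work here ... */
done:
	mutex_unlock(&lock);
}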
mm/swapfile.c
@@ -2887,6 +2887,13 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
 		error = set_blocksize(p->bdev, PAGE_SIZE);
 		if (error < 0)
 			return error;
+		/*
+		 * Zoned block devices contain zones that have a sequential
+		 * write only restriction.  Hence zoned block devices are not
+		 * suitable for swapping.  Disallow them here.
+		 */
+		if (blk_queue_is_zoned(p->bdev->bd_queue))
+			return -EINVAL;
 		p->flags |= SWP_BLKDEV;
 	} else if (S_ISREG(inode->i_mode)) {
 		p->bdev = inode->i_sb->s_bdev;
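The same check written as a stand-alone predicate, purely as a reading aid (the helper name is hypothetical):

/* Sketch: a block device is usable as swap only if it is not zoned. */
static bool swap_bdev_supported(struct block_device *bdev)
{
	return !blk_queue_is_zoned(bdev->bd_queue);
}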
|
@ -18,6 +18,36 @@
|
||||||
#include <asm/tlbflush.h>
|
#include <asm/tlbflush.h>
|
||||||
#include "internal.h"
|
#include "internal.h"
|
||||||
|
|
||||||
|
static __always_inline
|
||||||
|
struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
|
||||||
|
unsigned long dst_start,
|
||||||
|
unsigned long len)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Make sure that the dst range is both valid and fully within a
|
||||||
|
* single existing vma.
|
||||||
|
*/
|
||||||
|
struct vm_area_struct *dst_vma;
|
||||||
|
|
||||||
|
dst_vma = find_vma(dst_mm, dst_start);
|
||||||
|
if (!dst_vma)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
if (dst_start < dst_vma->vm_start ||
|
||||||
|
dst_start + len > dst_vma->vm_end)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check the vma is registered in uffd, this is required to
|
||||||
|
* enforce the VM_MAYWRITE check done at uffd registration
|
||||||
|
* time.
|
||||||
|
*/
|
||||||
|
if (!dst_vma->vm_userfaultfd_ctx.ctx)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
return dst_vma;
|
||||||
|
}
|
||||||
|
|
||||||
static int mcopy_atomic_pte(struct mm_struct *dst_mm,
|
static int mcopy_atomic_pte(struct mm_struct *dst_mm,
|
||||||
pmd_t *dst_pmd,
|
pmd_t *dst_pmd,
|
||||||
struct vm_area_struct *dst_vma,
|
struct vm_area_struct *dst_vma,
|
||||||
|
@ -60,7 +90,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The memory barrier inside __SetPageUptodate makes sure that
|
* The memory barrier inside __SetPageUptodate makes sure that
|
||||||
* preceeding stores to the page contents become visible before
|
* preceding stores to the page contents become visible before
|
||||||
* the set_pte_at() write.
|
* the set_pte_at() write.
|
||||||
*/
|
*/
|
||||||
__SetPageUptodate(page);
|
__SetPageUptodate(page);
|
||||||
|
@ -184,7 +214,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
|
||||||
unsigned long src_addr, dst_addr;
|
unsigned long src_addr, dst_addr;
|
||||||
long copied;
|
long copied;
|
||||||
struct page *page;
|
struct page *page;
|
||||||
struct hstate *h;
|
|
||||||
unsigned long vma_hpagesize;
|
unsigned long vma_hpagesize;
|
||||||
pgoff_t idx;
|
pgoff_t idx;
|
||||||
u32 hash;
|
u32 hash;
|
||||||
|
@ -221,20 +250,9 @@ retry:
|
||||||
*/
|
*/
|
||||||
if (!dst_vma) {
|
if (!dst_vma) {
|
||||||
err = -ENOENT;
|
err = -ENOENT;
|
||||||
dst_vma = find_vma(dst_mm, dst_start);
|
dst_vma = find_dst_vma(dst_mm, dst_start, len);
|
||||||
if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
|
if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
|
||||||
goto out_unlock;
|
goto out_unlock;
|
||||||
/*
|
|
||||||
* Check the vma is registered in uffd, this is
|
|
||||||
* required to enforce the VM_MAYWRITE check done at
|
|
||||||
* uffd registration time.
|
|
||||||
*/
|
|
||||||
if (!dst_vma->vm_userfaultfd_ctx.ctx)
|
|
||||||
goto out_unlock;
|
|
||||||
|
|
||||||
if (dst_start < dst_vma->vm_start ||
|
|
||||||
dst_start + len > dst_vma->vm_end)
|
|
||||||
goto out_unlock;
|
|
||||||
|
|
||||||
err = -EINVAL;
|
err = -EINVAL;
|
||||||
if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
|
if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
|
||||||
|
@ -243,10 +261,6 @@ retry:
|
||||||
vm_shared = dst_vma->vm_flags & VM_SHARED;
|
vm_shared = dst_vma->vm_flags & VM_SHARED;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
|
|
||||||
(len - copied) & (vma_hpagesize - 1)))
|
|
||||||
goto out_unlock;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If not shared, ensure the dst_vma has a anon_vma.
|
* If not shared, ensure the dst_vma has a anon_vma.
|
||||||
*/
|
*/
|
||||||
|
@ -256,24 +270,21 @@ retry:
|
||||||
goto out_unlock;
|
goto out_unlock;
|
||||||
}
|
}
|
||||||
|
|
||||||
h = hstate_vma(dst_vma);
|
|
||||||
|
|
||||||
while (src_addr < src_start + len) {
|
while (src_addr < src_start + len) {
|
||||||
pte_t dst_pteval;
|
pte_t dst_pteval;
|
||||||
|
|
||||||
BUG_ON(dst_addr >= dst_start + len);
|
BUG_ON(dst_addr >= dst_start + len);
|
||||||
VM_BUG_ON(dst_addr & ~huge_page_mask(h));
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Serialize via hugetlb_fault_mutex
|
* Serialize via hugetlb_fault_mutex
|
||||||
*/
|
*/
|
||||||
idx = linear_page_index(dst_vma, dst_addr);
|
idx = linear_page_index(dst_vma, dst_addr);
|
||||||
mapping = dst_vma->vm_file->f_mapping;
|
mapping = dst_vma->vm_file->f_mapping;
|
||||||
hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr);
|
hash = hugetlb_fault_mutex_hash(mapping, idx);
|
||||||
mutex_lock(&hugetlb_fault_mutex_table[hash]);
|
mutex_lock(&hugetlb_fault_mutex_table[hash]);
|
||||||
|
|
||||||
err = -ENOMEM;
|
err = -ENOMEM;
|
||||||
dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
|
dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize);
|
||||||
if (!dst_pte) {
|
if (!dst_pte) {
|
||||||
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
|
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
|
||||||
goto out_unlock;
|
goto out_unlock;
|
||||||
|
@ -300,7 +311,8 @@ retry:
|
||||||
|
|
||||||
err = copy_huge_page_from_user(page,
|
err = copy_huge_page_from_user(page,
|
||||||
(const void __user *)src_addr,
|
(const void __user *)src_addr,
|
||||||
pages_per_huge_page(h), true);
|
vma_hpagesize / PAGE_SIZE,
|
||||||
|
true);
|
||||||
if (unlikely(err)) {
|
if (unlikely(err)) {
|
||||||
err = -EFAULT;
|
err = -EFAULT;
|
||||||
goto out;
|
goto out;
|
||||||
|
@ -475,20 +487,9 @@ retry:
|
||||||
* both valid and fully within a single existing vma.
|
* both valid and fully within a single existing vma.
|
||||||
*/
|
*/
|
||||||
err = -ENOENT;
|
err = -ENOENT;
|
||||||
dst_vma = find_vma(dst_mm, dst_start);
|
dst_vma = find_dst_vma(dst_mm, dst_start, len);
|
||||||
if (!dst_vma)
|
if (!dst_vma)
|
||||||
goto out_unlock;
|
goto out_unlock;
|
||||||
/*
|
|
||||||
* Check the vma is registered in uffd, this is required to
|
|
||||||
* enforce the VM_MAYWRITE check done at uffd registration
|
|
||||||
* time.
|
|
||||||
*/
|
|
||||||
if (!dst_vma->vm_userfaultfd_ctx.ctx)
|
|
||||||
goto out_unlock;
|
|
||||||
|
|
||||||
if (dst_start < dst_vma->vm_start ||
|
|
||||||
dst_start + len > dst_vma->vm_end)
|
|
||||||
goto out_unlock;
|
|
||||||
|
|
||||||
err = -EINVAL;
|
err = -EINVAL;
|
||||||
/*
|
/*
|
||||||
|
|
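Both copy paths now resolve and validate the destination VMA through the new helper, so the open-coded range and uffd-registration checks collapse into a single call. A condensed caller pattern, with the surrounding locking and error paths elided and a hypothetical wrapper name:

/* Sketch of the caller pattern after this change; not a verbatim excerpt. */
static int resolve_dst(struct mm_struct *dst_mm, unsigned long dst_start,
		       unsigned long len, struct vm_area_struct **out)
{
	struct vm_area_struct *dst_vma;

	dst_vma = find_dst_vma(dst_mm, dst_start, len);
	if (!dst_vma)
		return -ENOENT;	/* bad range, or vma not uffd-registered */

	*out = dst_vma;
	return 0;
}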
mm/util.c
@@ -271,7 +271,7 @@ void *memdup_user_nul(const void __user *src, size_t len)
 EXPORT_SYMBOL(memdup_user_nul);
 
 void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
-		struct vm_area_struct *prev, struct rb_node *rb_parent)
+		struct vm_area_struct *prev)
 {
 	struct vm_area_struct *next;
 
@@ -280,18 +280,28 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
 		next = prev->vm_next;
 		prev->vm_next = vma;
 	} else {
+		next = mm->mmap;
 		mm->mmap = vma;
-		if (rb_parent)
-			next = rb_entry(rb_parent,
-					struct vm_area_struct, vm_rb);
-		else
-			next = NULL;
 	}
 	vma->vm_next = next;
 	if (next)
 		next->vm_prev = vma;
 }
 
+void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+	struct vm_area_struct *prev, *next;
+
+	next = vma->vm_next;
+	prev = vma->vm_prev;
+	if (prev)
+		prev->vm_next = next;
+	else
+		mm->mmap = next;
+	if (next)
+		next->vm_prev = prev;
+}
+
 /* Check if the vma is being used as a stack by this task */
 int vma_is_stack_for_current(struct vm_area_struct *vma)
 {
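The new __vma_unlink_list() is the inverse of __vma_link_list(): splicing a VMA in after prev and then unlinking it leaves mm->mmap and the vm_prev/vm_next pointers as they were. A hypothetical test-style snippet:

/* Illustrative round-trip; the wrapper name is made up for this note. */
static void vma_list_roundtrip(struct mm_struct *mm,
			       struct vm_area_struct *vma,
			       struct vm_area_struct *prev)
{
	__vma_link_list(mm, vma, prev);	/* insert after prev (or at head) */
	__vma_unlink_list(mm, vma);	/* remove it again */
}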
mm/vmalloc.c
@@ -331,6 +331,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
 
 
 static DEFINE_SPINLOCK(vmap_area_lock);
+static DEFINE_SPINLOCK(free_vmap_area_lock);
 /* Export for kexec only */
 LIST_HEAD(vmap_area_list);
 static LLIST_HEAD(vmap_purge_list);

@@ -682,7 +683,7 @@ insert_vmap_area_augment(struct vmap_area *va,
  * free area is inserted. If VA has been merged, it is
  * freed.
  */
-static __always_inline void
+static __always_inline struct vmap_area *
 merge_or_add_vmap_area(struct vmap_area *va,
 	struct rb_root *root, struct list_head *head)
 {

@@ -749,7 +750,10 @@ merge_or_add_vmap_area(struct vmap_area *va,
 
 			/* Free vmap_area object. */
 			kmem_cache_free(vmap_area_cachep, va);
-			return;
+
+			/* Point to the new merged area. */
+			va = sibling;
+			merged = true;
 		}
 	}
 
@@ -758,6 +762,8 @@ insert:
 		link_va(va, root, parent, link, head);
 		augment_tree_propagate_from(va);
 	}
+
+	return va;
 }
 
 static __always_inline bool

@@ -968,6 +974,19 @@ adjust_va_to_fit_type(struct vmap_area *va,
 		 * There are a few exceptions though, as an example it is
 		 * a first allocation (early boot up) when we have "one"
 		 * big free space that has to be split.
+		 *
+		 * Also we can hit this path in case of regular "vmap"
+		 * allocations, if "this" current CPU was not preloaded.
+		 * See the comment in alloc_vmap_area() why. If so, then
+		 * GFP_NOWAIT is used instead to get an extra object for
+		 * split purpose. That is rare and most time does not
+		 * occur.
+		 *
+		 * What happens if an allocation gets failed. Basically,
+		 * an "overflow" path is triggered to purge lazily freed
+		 * areas to free some memory, then, the "retry" path is
+		 * triggered to repeat one more time. See more details
+		 * in alloc_vmap_area() function.
 		 */
 		lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
 		if (!lva)

@@ -1063,9 +1082,9 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 		return ERR_PTR(-EBUSY);
 
 	might_sleep();
+	gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
 
-	va = kmem_cache_alloc_node(vmap_area_cachep,
-			gfp_mask & GFP_RECLAIM_MASK, node);
+	va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
 	if (unlikely(!va))
 		return ERR_PTR(-ENOMEM);
 
@@ -1073,49 +1092,55 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	 * Only scan the relevant parts containing pointers to other objects
 	 * to avoid false negatives.
 	 */
-	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
+	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
 
 retry:
 	/*
-	 * Preload this CPU with one extra vmap_area object to ensure
-	 * that we have it available when fit type of free area is
-	 * NE_FIT_TYPE.
+	 * Preload this CPU with one extra vmap_area object. It is used
+	 * when fit type of free area is NE_FIT_TYPE. Please note, it
+	 * does not guarantee that an allocation occurs on a CPU that
+	 * is preloaded, instead we minimize the case when it is not.
+	 * It can happen because of cpu migration, because there is a
+	 * race until the below spinlock is taken.
 	 *
 	 * The preload is done in non-atomic context, thus it allows us
 	 * to use more permissive allocation masks to be more stable under
-	 * low memory condition and high memory pressure.
+	 * low memory condition and high memory pressure. In rare case,
+	 * if not preloaded, GFP_NOWAIT is used.
 	 *
-	 * Even if it fails we do not really care about that. Just proceed
-	 * as it is. "overflow" path will refill the cache we allocate from.
+	 * Set "pva" to NULL here, because of "retry" path.
 	 */
-	preempt_disable();
-	if (!__this_cpu_read(ne_fit_preload_node)) {
-		preempt_enable();
-		pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node);
-		preempt_disable();
+	pva = NULL;
 
-		if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) {
-			if (pva)
-				kmem_cache_free(vmap_area_cachep, pva);
-		}
-	}
+	if (!this_cpu_read(ne_fit_preload_node))
+		/*
+		 * Even if it fails we do not really care about that.
+		 * Just proceed as it is. If needed "overflow" path
+		 * will refill the cache we allocate from.
+		 */
+		pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
 
-	spin_lock(&vmap_area_lock);
-	preempt_enable();
+	spin_lock(&free_vmap_area_lock);
+
+	if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
+		kmem_cache_free(vmap_area_cachep, pva);
 
 	/*
 	 * If an allocation fails, the "vend" address is
 	 * returned. Therefore trigger the overflow path.
 	 */
 	addr = __alloc_vmap_area(size, align, vstart, vend);
+	spin_unlock(&free_vmap_area_lock);
+
 	if (unlikely(addr == vend))
 		goto overflow;
 
 	va->va_start = addr;
 	va->va_end = addr + size;
 	va->vm = NULL;
-	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+
+	spin_lock(&vmap_area_lock);
+	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
 	spin_unlock(&vmap_area_lock);
 
 	BUG_ON(!IS_ALIGNED(va->va_start, align));

@@ -1125,7 +1150,6 @@ retry:
 	return va;
 
 overflow:
-	spin_unlock(&vmap_area_lock);
 	if (!purged) {
 		purge_vmap_area_lazy();
 		purged = 1;

@@ -1161,28 +1185,24 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
 
-static void __free_vmap_area(struct vmap_area *va)
-{
-	/*
-	 * Remove from the busy tree/list.
-	 */
-	unlink_va(va, &vmap_area_root);
-
-	/*
-	 * Merge VA with its neighbors, otherwise just add it.
-	 */
-	merge_or_add_vmap_area(va,
-		&free_vmap_area_root, &free_vmap_area_list);
-}
-
 /*
  * Free a region of KVA allocated by alloc_vmap_area
  */
 static void free_vmap_area(struct vmap_area *va)
 {
+	/*
+	 * Remove from the busy tree/list.
+	 */
 	spin_lock(&vmap_area_lock);
-	__free_vmap_area(va);
+	unlink_va(va, &vmap_area_root);
 	spin_unlock(&vmap_area_lock);
+
+	/*
+	 * Insert/Merge it back to the free tree/list.
+	 */
+	spin_lock(&free_vmap_area_lock);
+	merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
+	spin_unlock(&free_vmap_area_lock);
 }
 
 /*

@@ -1275,24 +1295,30 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 		flush_tlb_kernel_range(start, end);
 	resched_threshold = lazy_max_pages() << 1;
 
-	spin_lock(&vmap_area_lock);
+	spin_lock(&free_vmap_area_lock);
 	llist_for_each_entry_safe(va, n_va, valist, purge_list) {
 		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
+		unsigned long orig_start = va->va_start;
+		unsigned long orig_end = va->va_end;
 
 		/*
 		 * Finally insert or merge lazily-freed area. It is
 		 * detached and there is no need to "unlink" it from
 		 * anything.
 		 */
-		merge_or_add_vmap_area(va,
-			&free_vmap_area_root, &free_vmap_area_list);
+		va = merge_or_add_vmap_area(va, &free_vmap_area_root,
+					    &free_vmap_area_list);
+
+		if (is_vmalloc_or_module_addr((void *)orig_start))
+			kasan_release_vmalloc(orig_start, orig_end,
+					      va->va_start, va->va_end);
 
 		atomic_long_sub(nr, &vmap_lazy_nr);
 
 		if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
-			cond_resched_lock(&vmap_area_lock);
+			cond_resched_lock(&free_vmap_area_lock);
 	}
-	spin_unlock(&vmap_area_lock);
+	spin_unlock(&free_vmap_area_lock);
 	return true;
 }

@@ -2014,15 +2040,21 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
 }
 EXPORT_SYMBOL_GPL(map_vm_area);
 
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
-			      unsigned long flags, const void *caller)
+static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+	struct vmap_area *va, unsigned long flags, const void *caller)
 {
-	spin_lock(&vmap_area_lock);
 	vm->flags = flags;
 	vm->addr = (void *)va->va_start;
 	vm->size = va->va_end - va->va_start;
 	vm->caller = caller;
 	va->vm = vm;
+}
+
+static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
+			      unsigned long flags, const void *caller)
+{
+	spin_lock(&vmap_area_lock);
+	setup_vmalloc_vm_locked(vm, va, flags, caller);
 	spin_unlock(&vmap_area_lock);
 }
 
@@ -2068,6 +2100,22 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
 
 	setup_vmalloc_vm(area, va, flags, caller);
 
+	/*
+	 * For KASAN, if we are in vmalloc space, we need to cover the shadow
+	 * area with real memory. If we come here through VM_ALLOC, this is
+	 * done by a higher level function that has access to the true size,
+	 * which might not be a full page.
+	 *
+	 * We assume module space comes via VM_ALLOC path.
+	 */
+	if (is_vmalloc_addr(area->addr) && !(area->flags & VM_ALLOC)) {
+		if (kasan_populate_vmalloc(area->size, area)) {
+			unmap_vmap_area(va);
+			kfree(area);
+			return NULL;
+		}
+	}
+
 	return area;
 }
 
@@ -2245,6 +2293,9 @@ static void __vunmap(const void *addr, int deallocate_pages)
 	debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
 	debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
 
+	if (area->flags & VM_KASAN)
+		kasan_poison_vmalloc(area->addr, area->size);
+
 	vm_remove_mappings(area, deallocate_pages);
 
 	if (deallocate_pages) {

@@ -2440,7 +2491,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 			goto fail;
 		}
 		area->pages[i] = page;
-		if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
+		if (gfpflags_allow_blocking(gfp_mask))
 			cond_resched();
 	}
 	atomic_long_add(area->nr_pages, &nr_vmalloc_pages);

@@ -2497,6 +2548,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 	if (!addr)
 		return NULL;
 
+	if (is_vmalloc_or_module_addr(area->addr)) {
+		if (kasan_populate_vmalloc(real_size, area))
+			return NULL;
+	}
+
 	/*
 	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
 	 * flag. It means that vm_struct is not fully initialized.

@@ -3282,7 +3338,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 			goto err_free;
 	}
 retry:
-	spin_lock(&vmap_area_lock);
+	spin_lock(&free_vmap_area_lock);
 
 	/* start scanning - we scan from the top, begin with the last area */
 	area = term_area = last_area;

@@ -3364,29 +3420,44 @@ retry:
 		va = vas[area];
 		va->va_start = start;
 		va->va_end = start + size;
-
-		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
 	}
 
-	spin_unlock(&vmap_area_lock);
+	spin_unlock(&free_vmap_area_lock);
 
 	/* insert all vm's */
-	for (area = 0; area < nr_vms; area++)
-		setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
+	spin_lock(&vmap_area_lock);
+	for (area = 0; area < nr_vms; area++) {
+		insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
+
+		setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
 				 pcpu_get_vm_areas);
+	}
+	spin_unlock(&vmap_area_lock);
+
+	/* populate the shadow space outside of the lock */
+	for (area = 0; area < nr_vms; area++) {
+		/* assume success here */
+		kasan_populate_vmalloc(sizes[area], vms[area]);
+	}
 
 	kfree(vas);
 	return vms;
 
 recovery:
-	/* Remove previously inserted areas. */
+	/*
+	 * Remove previously allocated areas. There is no
+	 * need in removing these areas from the busy tree,
+	 * because they are inserted only on the final step
+	 * and when pcpu_get_vm_areas() is success.
+	 */
 	while (area--) {
-		__free_vmap_area(vas[area]);
+		merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
+				       &free_vmap_area_list);
 		vas[area] = NULL;
 	}
 
 overflow:
-	spin_unlock(&vmap_area_lock);
+	spin_unlock(&free_vmap_area_lock);
 	if (!purged) {
 		purge_vmap_area_lazy();
 		purged = true;

@@ -3437,9 +3508,12 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
 
 #ifdef CONFIG_PROC_FS
 static void *s_start(struct seq_file *m, loff_t *pos)
+	__acquires(&vmap_purge_lock)
 	__acquires(&vmap_area_lock)
 {
+	mutex_lock(&vmap_purge_lock);
 	spin_lock(&vmap_area_lock);
+
 	return seq_list_start(&vmap_area_list, *pos);
 }
 
@@ -3449,8 +3523,10 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
 }
 
 static void s_stop(struct seq_file *m, void *p)
+	__releases(&vmap_purge_lock)
 	__releases(&vmap_area_lock)
 {
+	mutex_unlock(&vmap_purge_lock);
 	spin_unlock(&vmap_area_lock);
 }
 
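The thread running through these hunks is a lock split: the busy tree stays under vmap_area_lock while the free tree gets its own free_vmap_area_lock, and merge_or_add_vmap_area() now returns the merged area so __purge_vmap_area_lazy() can hand the final extent to kasan_release_vmalloc(). As a compact reminder of the resulting lock order in the free path (condensed from the free_vmap_area() hunk above, not a new interface):

/* Condensed from the free_vmap_area() hunk: busy lock first, then free lock. */
static void free_vmap_area_outline(struct vmap_area *va)
{
	spin_lock(&vmap_area_lock);		/* busy tree/list */
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	spin_lock(&free_vmap_area_lock);	/* free tree/list */
	merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
	spin_unlock(&free_vmap_area_lock);
}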
mm/vmscan.c
File diff suppressed because it is too large
mm/workingset.c
@@ -213,28 +213,53 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
 	*workingsetp = workingset;
 }
 
+static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat)
+{
+	/*
+	 * Reclaiming a cgroup means reclaiming all its children in a
+	 * round-robin fashion. That means that each cgroup has an LRU
+	 * order that is composed of the LRU orders of its child
+	 * cgroups; and every page has an LRU position not just in the
+	 * cgroup that owns it, but in all of that group's ancestors.
+	 *
+	 * So when the physical inactive list of a leaf cgroup ages,
+	 * the virtual inactive lists of all its parents, including
+	 * the root cgroup's, age as well.
+	 */
+	do {
+		struct lruvec *lruvec;
+
+		lruvec = mem_cgroup_lruvec(memcg, pgdat);
+		atomic_long_inc(&lruvec->inactive_age);
+	} while (memcg && (memcg = parent_mem_cgroup(memcg)));
+}
+
 /**
  * workingset_eviction - note the eviction of a page from memory
+ * @target_memcg: the cgroup that is causing the reclaim
  * @page: the page being evicted
  *
  * Returns a shadow entry to be stored in @page->mapping->i_pages in place
  * of the evicted @page so that a later refault can be detected.
  */
-void *workingset_eviction(struct page *page)
+void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
 {
 	struct pglist_data *pgdat = page_pgdat(page);
-	struct mem_cgroup *memcg = page_memcg(page);
-	int memcgid = mem_cgroup_id(memcg);
 	unsigned long eviction;
 	struct lruvec *lruvec;
+	int memcgid;
 
 	/* Page is fully exclusive and pins page->mem_cgroup */
 	VM_BUG_ON_PAGE(PageLRU(page), page);
 	VM_BUG_ON_PAGE(page_count(page), page);
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 
-	lruvec = mem_cgroup_lruvec(pgdat, memcg);
-	eviction = atomic_long_inc_return(&lruvec->inactive_age);
+	advance_inactive_age(page_memcg(page), pgdat);
+
+	lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
+	/* XXX: target_memcg can be NULL, go through lruvec */
+	memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
+	eviction = atomic_long_read(&lruvec->inactive_age);
 	return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
 }
 
@@ -244,10 +269,13 @@ void *workingset_eviction(struct page *page)
  * @shadow: shadow entry of the evicted page
  *
  * Calculates and evaluates the refault distance of the previously
- * evicted page in the context of the node it was allocated in.
+ * evicted page in the context of the node and the memcg whose memory
+ * pressure caused the eviction.
  */
 void workingset_refault(struct page *page, void *shadow)
 {
+	struct mem_cgroup *eviction_memcg;
+	struct lruvec *eviction_lruvec;
 	unsigned long refault_distance;
 	struct pglist_data *pgdat;
 	unsigned long active_file;

@@ -277,12 +305,12 @@ void workingset_refault(struct page *page, void *shadow)
 	 * would be better if the root_mem_cgroup existed in all
 	 * configurations instead.
 	 */
-	memcg = mem_cgroup_from_id(memcgid);
-	if (!mem_cgroup_disabled() && !memcg)
+	eviction_memcg = mem_cgroup_from_id(memcgid);
+	if (!mem_cgroup_disabled() && !eviction_memcg)
 		goto out;
-	lruvec = mem_cgroup_lruvec(pgdat, memcg);
-	refault = atomic_long_read(&lruvec->inactive_age);
-	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
+	eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
+	refault = atomic_long_read(&eviction_lruvec->inactive_age);
+	active_file = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
 
 	/*
 	 * Calculate the refault distance

@@ -302,6 +330,17 @@ void workingset_refault(struct page *page, void *shadow)
 	 */
 	refault_distance = (refault - eviction) & EVICTION_MASK;
 
+	/*
+	 * The activation decision for this page is made at the level
+	 * where the eviction occurred, as that is where the LRU order
+	 * during page reclaim is being determined.
+	 *
+	 * However, the cgroup that will own the page is the one that
+	 * is actually experiencing the refault event.
+	 */
+	memcg = page_memcg(page);
+	lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
 	inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
 
 	/*

@@ -313,7 +352,7 @@ void workingset_refault(struct page *page, void *shadow)
 		goto out;
 
 	SetPageActive(page);
-	atomic_long_inc(&lruvec->inactive_age);
+	advance_inactive_age(memcg, pgdat);
 	inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
 
 	/* Page was active prior to eviction */

@@ -332,7 +371,6 @@ out:
 void workingset_activation(struct page *page)
 {
 	struct mem_cgroup *memcg;
-	struct lruvec *lruvec;
 
 	rcu_read_lock();
 	/*
@@ -345,8 +383,7 @@ void workingset_activation(struct page *page)
 	memcg = page_memcg_rcu(page);
 	if (!mem_cgroup_disabled() && !memcg)
 		goto out;
-	lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
||||||
lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
|
advance_inactive_age(memcg, page_pgdat(page));
|
||||||
atomic_long_inc(&lruvec->inactive_age);
|
|
||||||
out:
|
out:
|
||||||
rcu_read_unlock();
|
rcu_read_unlock();
|
||||||
}
|
}
|
||||||
|
@ -426,7 +463,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
|
||||||
struct lruvec *lruvec;
|
struct lruvec *lruvec;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg);
|
lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
|
||||||
for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
|
for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
|
||||||
pages += lruvec_page_state_local(lruvec,
|
pages += lruvec_page_state_local(lruvec,
|
||||||
NR_LRU_BASE + i);
|
NR_LRU_BASE + i);
|
||||||
|
|
377 mm/z3fold.c
@@ -41,6 +41,7 @@
 #include <linux/workqueue.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/rwlock.h>
 #include <linux/zpool.h>
 #include <linux/magic.h>
 
@@ -90,6 +91,7 @@ struct z3fold_buddy_slots {
 	 */
 	unsigned long slot[BUDDY_MASK + 1];
 	unsigned long pool; /* back link + flags */
+	rwlock_t lock;
 };
 #define HANDLE_FLAG_MASK	(0x03)
 
@@ -124,6 +126,7 @@ struct z3fold_header {
 	unsigned short start_middle;
 	unsigned short first_num:2;
 	unsigned short mapped_count:2;
+	unsigned short foreign_handles:2;
 };
 
 /**
@@ -178,6 +181,19 @@ enum z3fold_page_flags {
 	PAGE_CLAIMED, /* by either reclaim or free */
 };
 
+/*
+ * handle flags, go under HANDLE_FLAG_MASK
+ */
+enum z3fold_handle_flags {
+	HANDLES_ORPHANED = 0,
+};
+
+/*
+ * Forward declarations
+ */
+static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool);
+static void compact_page_work(struct work_struct *w);
+
 /*****************
  * Helpers
 *****************/
@@ -191,8 +207,6 @@ static int size_to_chunks(size_t size)
 #define for_each_unbuddied_list(_iter, _begin) \
 	for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
 
-static void compact_page_work(struct work_struct *w);
-
 static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
 							gfp_t gfp)
 {
@@ -204,6 +218,7 @@ static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
 	if (slots) {
 		memset(slots->slot, 0, sizeof(slots->slot));
 		slots->pool = (unsigned long)pool;
+		rwlock_init(&slots->lock);
 	}
 
 	return slots;
@@ -219,25 +234,110 @@ static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
 	return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
 }
 
+/* Lock a z3fold page */
+static inline void z3fold_page_lock(struct z3fold_header *zhdr)
+{
+	spin_lock(&zhdr->page_lock);
+}
+
+/* Try to lock a z3fold page */
+static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
+{
+	return spin_trylock(&zhdr->page_lock);
+}
+
+/* Unlock a z3fold page */
+static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
+{
+	spin_unlock(&zhdr->page_lock);
+}
+
+
+static inline struct z3fold_header *__get_z3fold_header(unsigned long handle,
+							bool lock)
+{
+	struct z3fold_buddy_slots *slots;
+	struct z3fold_header *zhdr;
+	int locked = 0;
+
+	if (!(handle & (1 << PAGE_HEADLESS))) {
+		slots = handle_to_slots(handle);
+		do {
+			unsigned long addr;
+
+			read_lock(&slots->lock);
+			addr = *(unsigned long *)handle;
+			zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
+			if (lock)
+				locked = z3fold_page_trylock(zhdr);
+			read_unlock(&slots->lock);
+			if (locked)
+				break;
+			cpu_relax();
+		} while (lock);
+	} else {
+		zhdr = (struct z3fold_header *)(handle & PAGE_MASK);
+	}
+
+	return zhdr;
+}
+
+/* Returns the z3fold page where a given handle is stored */
+static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
+{
+	return __get_z3fold_header(h, false);
+}
+
+/* return locked z3fold page if it's not headless */
+static inline struct z3fold_header *get_z3fold_header(unsigned long h)
+{
+	return __get_z3fold_header(h, true);
+}
+
+static inline void put_z3fold_header(struct z3fold_header *zhdr)
+{
+	struct page *page = virt_to_page(zhdr);
+
+	if (!test_bit(PAGE_HEADLESS, &page->private))
+		z3fold_page_unlock(zhdr);
+}
+
 static inline void free_handle(unsigned long handle)
 {
 	struct z3fold_buddy_slots *slots;
+	struct z3fold_header *zhdr;
 	int i;
 	bool is_free;
 
 	if (handle & (1 << PAGE_HEADLESS))
 		return;
 
-	WARN_ON(*(unsigned long *)handle == 0);
-	*(unsigned long *)handle = 0;
+	if (WARN_ON(*(unsigned long *)handle == 0))
+		return;
+
+	zhdr = handle_to_z3fold_header(handle);
 	slots = handle_to_slots(handle);
+	write_lock(&slots->lock);
+	*(unsigned long *)handle = 0;
+	write_unlock(&slots->lock);
+	if (zhdr->slots == slots)
+		return; /* simple case, nothing else to do */
+
+	/* we are freeing a foreign handle if we are here */
+	zhdr->foreign_handles--;
 	is_free = true;
+	read_lock(&slots->lock);
+	if (!test_bit(HANDLES_ORPHANED, &slots->pool)) {
+		read_unlock(&slots->lock);
+		return;
+	}
 	for (i = 0; i <= BUDDY_MASK; i++) {
 		if (slots->slot[i]) {
 			is_free = false;
 			break;
 		}
 	}
+	read_unlock(&slots->lock);
 
 	if (is_free) {
 		struct z3fold_pool *pool = slots_to_pool(slots);
@@ -322,6 +422,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
 	zhdr->first_num = 0;
 	zhdr->start_middle = 0;
 	zhdr->cpu = -1;
+	zhdr->foreign_handles = 0;
 	zhdr->slots = slots;
 	zhdr->pool = pool;
 	INIT_LIST_HEAD(&zhdr->buddy);
@@ -341,24 +442,6 @@ static void free_z3fold_page(struct page *page, bool headless)
 	__free_page(page);
 }
 
-/* Lock a z3fold page */
-static inline void z3fold_page_lock(struct z3fold_header *zhdr)
-{
-	spin_lock(&zhdr->page_lock);
-}
-
-/* Try to lock a z3fold page */
-static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
-{
-	return spin_trylock(&zhdr->page_lock);
-}
-
-/* Unlock a z3fold page */
-static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
-{
-	spin_unlock(&zhdr->page_lock);
-}
-
 /* Helper function to build the index */
 static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
 {
@@ -389,7 +472,9 @@ static unsigned long __encode_handle(struct z3fold_header *zhdr,
 	if (bud == LAST)
 		h |= (zhdr->last_chunks << BUDDY_SHIFT);
 
+	write_lock(&slots->lock);
 	slots->slot[idx] = h;
+	write_unlock(&slots->lock);
 	return (unsigned long)&slots->slot[idx];
 }
 
@@ -398,22 +483,15 @@ static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
 	return __encode_handle(zhdr, zhdr->slots, bud);
 }
 
-/* Returns the z3fold page where a given handle is stored */
-static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
-{
-	unsigned long addr = h;
-
-	if (!(addr & (1 << PAGE_HEADLESS)))
-		addr = *(unsigned long *)h;
-
-	return (struct z3fold_header *)(addr & PAGE_MASK);
-}
-
 /* only for LAST bud, returns zero otherwise */
 static unsigned short handle_to_chunks(unsigned long handle)
 {
-	unsigned long addr = *(unsigned long *)handle;
+	struct z3fold_buddy_slots *slots = handle_to_slots(handle);
+	unsigned long addr;
 
+	read_lock(&slots->lock);
+	addr = *(unsigned long *)handle;
+	read_unlock(&slots->lock);
 	return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
 }
 
@@ -425,10 +503,13 @@ static unsigned short handle_to_chunks(unsigned long handle)
 static enum buddy handle_to_buddy(unsigned long handle)
 {
 	struct z3fold_header *zhdr;
+	struct z3fold_buddy_slots *slots = handle_to_slots(handle);
 	unsigned long addr;
 
+	read_lock(&slots->lock);
 	WARN_ON(handle & (1 << PAGE_HEADLESS));
 	addr = *(unsigned long *)handle;
+	read_unlock(&slots->lock);
 	zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
 	return (addr - zhdr->first_num) & BUDDY_MASK;
 }
@@ -442,6 +523,8 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
 {
 	struct page *page = virt_to_page(zhdr);
 	struct z3fold_pool *pool = zhdr_to_pool(zhdr);
+	bool is_free = true;
+	int i;
 
 	WARN_ON(!list_empty(&zhdr->buddy));
 	set_bit(PAGE_STALE, &page->private);
@@ -450,8 +533,25 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
 	if (!list_empty(&page->lru))
 		list_del_init(&page->lru);
 	spin_unlock(&pool->lock);
+
+	/* If there are no foreign handles, free the handles array */
+	read_lock(&zhdr->slots->lock);
+	for (i = 0; i <= BUDDY_MASK; i++) {
+		if (zhdr->slots->slot[i]) {
+			is_free = false;
+			break;
+		}
+	}
+	if (!is_free)
+		set_bit(HANDLES_ORPHANED, &zhdr->slots->pool);
+	read_unlock(&zhdr->slots->lock);
+
+	if (is_free)
+		kmem_cache_free(pool->c_handle, zhdr->slots);
+
 	if (locked)
 		z3fold_page_unlock(zhdr);
+
 	spin_lock(&pool->stale_lock);
 	list_add(&zhdr->buddy, &pool->stale);
 	queue_work(pool->release_wq, &pool->work);
@@ -479,6 +579,7 @@ static void release_z3fold_page_locked_list(struct kref *ref)
 	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
 					       refcount);
 	struct z3fold_pool *pool = zhdr_to_pool(zhdr);
+
 	spin_lock(&pool->lock);
 	list_del_init(&zhdr->buddy);
 	spin_unlock(&pool->lock);
@@ -559,6 +660,119 @@ static inline void *mchunk_memmove(struct z3fold_header *zhdr,
 		       zhdr->middle_chunks << CHUNK_SHIFT);
 }
 
+static inline bool buddy_single(struct z3fold_header *zhdr)
+{
+	return !((zhdr->first_chunks && zhdr->middle_chunks) ||
+			(zhdr->first_chunks && zhdr->last_chunks) ||
+			(zhdr->middle_chunks && zhdr->last_chunks));
+}
+
+static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
+{
+	struct z3fold_pool *pool = zhdr_to_pool(zhdr);
+	void *p = zhdr;
+	unsigned long old_handle = 0;
+	size_t sz = 0;
+	struct z3fold_header *new_zhdr = NULL;
+	int first_idx = __idx(zhdr, FIRST);
+	int middle_idx = __idx(zhdr, MIDDLE);
+	int last_idx = __idx(zhdr, LAST);
+	unsigned short *moved_chunks = NULL;
+
+	/*
+	 * No need to protect slots here -- all the slots are "local" and
+	 * the page lock is already taken
+	 */
+	if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) {
+		p += ZHDR_SIZE_ALIGNED;
+		sz = zhdr->first_chunks << CHUNK_SHIFT;
+		old_handle = (unsigned long)&zhdr->slots->slot[first_idx];
+		moved_chunks = &zhdr->first_chunks;
+	} else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) {
+		p += zhdr->start_middle << CHUNK_SHIFT;
+		sz = zhdr->middle_chunks << CHUNK_SHIFT;
+		old_handle = (unsigned long)&zhdr->slots->slot[middle_idx];
+		moved_chunks = &zhdr->middle_chunks;
+	} else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) {
+		p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
+		sz = zhdr->last_chunks << CHUNK_SHIFT;
+		old_handle = (unsigned long)&zhdr->slots->slot[last_idx];
+		moved_chunks = &zhdr->last_chunks;
+	}
+
+	if (sz > 0) {
+		enum buddy new_bud = HEADLESS;
+		short chunks = size_to_chunks(sz);
+		void *q;
+
+		new_zhdr = __z3fold_alloc(pool, sz, false);
+		if (!new_zhdr)
+			return NULL;
+
+		if (WARN_ON(new_zhdr == zhdr))
+			goto out_fail;
+
+		if (new_zhdr->first_chunks == 0) {
+			if (new_zhdr->middle_chunks != 0 &&
+					chunks >= new_zhdr->start_middle) {
+				new_bud = LAST;
+			} else {
+				new_bud = FIRST;
+			}
+		} else if (new_zhdr->last_chunks == 0) {
+			new_bud = LAST;
+		} else if (new_zhdr->middle_chunks == 0) {
+			new_bud = MIDDLE;
+		}
+		q = new_zhdr;
+		switch (new_bud) {
+		case FIRST:
+			new_zhdr->first_chunks = chunks;
+			q += ZHDR_SIZE_ALIGNED;
+			break;
+		case MIDDLE:
+			new_zhdr->middle_chunks = chunks;
+			new_zhdr->start_middle =
+				new_zhdr->first_chunks + ZHDR_CHUNKS;
+			q += new_zhdr->start_middle << CHUNK_SHIFT;
+			break;
+		case LAST:
+			new_zhdr->last_chunks = chunks;
+			q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT);
+			break;
+		default:
+			goto out_fail;
+		}
+		new_zhdr->foreign_handles++;
+		memcpy(q, p, sz);
+		write_lock(&zhdr->slots->lock);
+		*(unsigned long *)old_handle = (unsigned long)new_zhdr +
+			__idx(new_zhdr, new_bud);
+		if (new_bud == LAST)
+			*(unsigned long *)old_handle |=
+				(new_zhdr->last_chunks << BUDDY_SHIFT);
+		write_unlock(&zhdr->slots->lock);
+		add_to_unbuddied(pool, new_zhdr);
+		z3fold_page_unlock(new_zhdr);
+
+		*moved_chunks = 0;
+	}
+
+	return new_zhdr;
+
+out_fail:
+	if (new_zhdr) {
+		if (kref_put(&new_zhdr->refcount, release_z3fold_page_locked))
+			atomic64_dec(&pool->pages_nr);
+		else {
+			add_to_unbuddied(pool, new_zhdr);
+			z3fold_page_unlock(new_zhdr);
+		}
+	}
+	return NULL;
+
+}
+
 #define BIG_CHUNK_GAP	3
 /* Has to be called with lock held */
 static int z3fold_compact_page(struct z3fold_header *zhdr)
@@ -638,6 +852,15 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
 		return;
 	}
 
+	if (!zhdr->foreign_handles && buddy_single(zhdr) &&
+	    zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
+		if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
+			atomic64_dec(&pool->pages_nr);
+		else
+			z3fold_page_unlock(zhdr);
+		return;
+	}
+
 	z3fold_compact_page(zhdr);
 	add_to_unbuddied(pool, zhdr);
 	z3fold_page_unlock(zhdr);
@@ -690,7 +913,8 @@ lookup:
 		spin_unlock(&pool->lock);
 
 		page = virt_to_page(zhdr);
-		if (test_bit(NEEDS_COMPACTING, &page->private)) {
+		if (test_bit(NEEDS_COMPACTING, &page->private) ||
+		    test_bit(PAGE_CLAIMED, &page->private)) {
 			z3fold_page_unlock(zhdr);
 			zhdr = NULL;
 			put_cpu_ptr(pool->unbuddied);
@@ -734,7 +958,8 @@ lookup:
 		spin_unlock(&pool->lock);
 
 		page = virt_to_page(zhdr);
-		if (test_bit(NEEDS_COMPACTING, &page->private)) {
+		if (test_bit(NEEDS_COMPACTING, &page->private) ||
+		    test_bit(PAGE_CLAIMED, &page->private)) {
 			z3fold_page_unlock(zhdr);
 			zhdr = NULL;
 			if (can_sleep)
@@ -1000,7 +1225,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
 	enum buddy bud;
 	bool page_claimed;
 
-	zhdr = handle_to_z3fold_header(handle);
+	zhdr = get_z3fold_header(handle);
 	page = virt_to_page(zhdr);
 	page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
 
@@ -1014,6 +1239,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
 			spin_lock(&pool->lock);
 			list_del(&page->lru);
 			spin_unlock(&pool->lock);
+			put_z3fold_header(zhdr);
 			free_z3fold_page(page, true);
 			atomic64_dec(&pool->pages_nr);
 		}
@@ -1021,7 +1247,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
 	}
 
 	/* Non-headless case */
-	z3fold_page_lock(zhdr);
 	bud = handle_to_buddy(handle);
 
 	switch (bud) {
@@ -1037,11 +1262,13 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
 	default:
 		pr_err("%s: unknown bud %d\n", __func__, bud);
 		WARN_ON(1);
-		z3fold_page_unlock(zhdr);
+		put_z3fold_header(zhdr);
+		clear_bit(PAGE_CLAIMED, &page->private);
 		return;
 	}
 
-	free_handle(handle);
+	if (!page_claimed)
+		free_handle(handle);
 	if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
 		atomic64_dec(&pool->pages_nr);
 		return;
@@ -1053,7 +1280,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
 	}
 	if (unlikely(PageIsolated(page)) ||
 	    test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
-		z3fold_page_unlock(zhdr);
+		put_z3fold_header(zhdr);
 		clear_bit(PAGE_CLAIMED, &page->private);
 		return;
 	}
@@ -1063,14 +1290,14 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
 		spin_unlock(&pool->lock);
 		zhdr->cpu = -1;
 		kref_get(&zhdr->refcount);
-		do_compact_page(zhdr, true);
 		clear_bit(PAGE_CLAIMED, &page->private);
+		do_compact_page(zhdr, true);
 		return;
 	}
 	kref_get(&zhdr->refcount);
-	queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
 	clear_bit(PAGE_CLAIMED, &page->private);
-	z3fold_page_unlock(zhdr);
+	queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
+	put_z3fold_header(zhdr);
 }
 
 /**
@@ -1111,11 +1338,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
  */
 static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
 {
-	int i, ret = 0;
+	int i, ret = -1;
 	struct z3fold_header *zhdr = NULL;
 	struct page *page = NULL;
 	struct list_head *pos;
-	struct z3fold_buddy_slots slots;
 	unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
 
 	spin_lock(&pool->lock);
@@ -1153,6 +1379,12 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
 				zhdr = NULL;
 				continue; /* can't evict at this point */
 			}
+			if (zhdr->foreign_handles) {
+				clear_bit(PAGE_CLAIMED, &page->private);
+				z3fold_page_unlock(zhdr);
+				zhdr = NULL;
+				continue; /* can't evict such page */
+			}
 			kref_get(&zhdr->refcount);
 			list_del_init(&zhdr->buddy);
 			zhdr->cpu = -1;
@@ -1176,39 +1408,38 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
 			last_handle = 0;
 			middle_handle = 0;
 			if (zhdr->first_chunks)
-				first_handle = __encode_handle(zhdr, &slots,
-								FIRST);
+				first_handle = encode_handle(zhdr, FIRST);
 			if (zhdr->middle_chunks)
-				middle_handle = __encode_handle(zhdr, &slots,
-								MIDDLE);
+				middle_handle = encode_handle(zhdr, MIDDLE);
 			if (zhdr->last_chunks)
-				last_handle = __encode_handle(zhdr, &slots,
-								LAST);
+				last_handle = encode_handle(zhdr, LAST);
 			/*
 			 * it's safe to unlock here because we hold a
 			 * reference to this page
 			 */
 			z3fold_page_unlock(zhdr);
 		} else {
-			first_handle = __encode_handle(zhdr, &slots, HEADLESS);
+			first_handle = encode_handle(zhdr, HEADLESS);
 			last_handle = middle_handle = 0;
 		}
 
 		/* Issue the eviction callback(s) */
 		if (middle_handle) {
 			ret = pool->ops->evict(pool, middle_handle);
 			if (ret)
 				goto next;
+			free_handle(middle_handle);
 		}
 		if (first_handle) {
 			ret = pool->ops->evict(pool, first_handle);
 			if (ret)
 				goto next;
+			free_handle(first_handle);
 		}
 		if (last_handle) {
 			ret = pool->ops->evict(pool, last_handle);
 			if (ret)
 				goto next;
+			free_handle(last_handle);
 		}
 next:
 		if (test_bit(PAGE_HEADLESS, &page->private)) {
@@ -1264,14 +1495,13 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
 	void *addr;
 	enum buddy buddy;
 
-	zhdr = handle_to_z3fold_header(handle);
+	zhdr = get_z3fold_header(handle);
 	addr = zhdr;
 	page = virt_to_page(zhdr);
 
 	if (test_bit(PAGE_HEADLESS, &page->private))
 		goto out;
 
-	z3fold_page_lock(zhdr);
 	buddy = handle_to_buddy(handle);
 	switch (buddy) {
 	case FIRST:
@@ -1293,8 +1523,8 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
 
 	if (addr)
 		zhdr->mapped_count++;
-	z3fold_page_unlock(zhdr);
 out:
+	put_z3fold_header(zhdr);
 	return addr;
 }
 
@@ -1309,18 +1539,17 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
 	struct page *page;
 	enum buddy buddy;
 
-	zhdr = handle_to_z3fold_header(handle);
+	zhdr = get_z3fold_header(handle);
 	page = virt_to_page(zhdr);
 
 	if (test_bit(PAGE_HEADLESS, &page->private))
 		return;
 
-	z3fold_page_lock(zhdr);
 	buddy = handle_to_buddy(handle);
 	if (buddy == MIDDLE)
 		clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
 	zhdr->mapped_count--;
-	z3fold_page_unlock(zhdr);
+	put_z3fold_header(zhdr);
 }
 
 /**
@@ -1352,19 +1581,21 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
 	    test_bit(PAGE_STALE, &page->private))
 		goto out;
 
+	if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0)
+		goto out;
+
 	pool = zhdr_to_pool(zhdr);
+	spin_lock(&pool->lock);
+	if (!list_empty(&zhdr->buddy))
+		list_del_init(&zhdr->buddy);
+	if (!list_empty(&page->lru))
+		list_del_init(&page->lru);
+	spin_unlock(&pool->lock);
 
-	if (zhdr->mapped_count == 0) {
-		kref_get(&zhdr->refcount);
-		if (!list_empty(&zhdr->buddy))
-			list_del_init(&zhdr->buddy);
-		spin_lock(&pool->lock);
-		if (!list_empty(&page->lru))
-			list_del(&page->lru);
-		spin_unlock(&pool->lock);
-		z3fold_page_unlock(zhdr);
-		return true;
-	}
+	kref_get(&zhdr->refcount);
+	z3fold_page_unlock(zhdr);
+	return true;
+
 out:
 	z3fold_page_unlock(zhdr);
 	return false;
@@ -1387,7 +1618,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
 	if (!z3fold_page_trylock(zhdr)) {
 		return -EAGAIN;
 	}
-	if (zhdr->mapped_count != 0) {
+	if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
 		z3fold_page_unlock(zhdr);
 		return -EBUSY;
 	}
@@ -87,6 +87,7 @@ algorith||algorithm
 algorithmical||algorithmically
 algoritm||algorithm
 algoritms||algorithms
+algorithmn||algorithm
 algorrithm||algorithm
 algorritm||algorithm
 aligment||alignment
@@ -109,6 +110,7 @@ alredy||already
 altough||although
 alue||value
 ambigious||ambiguous
+ambigous||ambiguous
 amoung||among
 amout||amount
 amplifer||amplifier
@@ -179,6 +181,7 @@ attepmpt||attempt
 attnetion||attention
 attruibutes||attributes
 authentification||authentication
+authenicated||authenticated
 automaticaly||automatically
 automaticly||automatically
 automatize||automate
@@ -286,6 +289,7 @@ claread||cleared
 clared||cleared
 closeing||closing
 clustred||clustered
+cnfiguration||configuration
 coexistance||coexistence
 colescing||coalescing
 collapsable||collapsible
@@ -325,9 +329,11 @@ comression||compression
 comunication||communication
 conbination||combination
 conditionaly||conditionally
+conditon||condition
 conected||connected
 conector||connector
 connecetd||connected
+configration||configuration
 configuartion||configuration
 configuation||configuration
 configued||configured
@@ -347,6 +353,7 @@ containts||contains
 contaisn||contains
 contant||contact
 contence||contents
+contiguos||contiguous
 continious||continuous
 continous||continuous
 continously||continuously
@@ -380,6 +387,7 @@ cylic||cyclic
 dafault||default
 deafult||default
 deamon||daemon
+debouce||debounce
 decompres||decompress
 decsribed||described
 decription||description
@@ -448,6 +456,7 @@ diffrent||different
 differenciate||differentiate
 diffrentiate||differentiate
 difinition||definition
+digial||digital
 dimention||dimension
 dimesions||dimensions
 dispalying||displaying
@@ -489,6 +498,7 @@ droput||dropout
 druing||during
 dynmaic||dynamic
 eanable||enable
+eanble||enable
 easilly||easily
 ecspecially||especially
 edditable||editable
@@ -502,6 +512,7 @@ elementry||elementary
 eletronic||electronic
 embeded||embedded
 enabledi||enabled
+enbale||enable
 enble||enable
 enchanced||enhanced
 encorporating||incorporating
@@ -536,6 +547,7 @@ excellant||excellent
 execeeded||exceeded
 execeeds||exceeds
 exeed||exceed
+exeuction||execution
 existance||existence
 existant||existent
 exixt||exist
@@ -601,10 +613,12 @@ frambuffer||framebuffer
 framming||framing
 framwork||framework
 frequncy||frequency
+frequancy||frequency
 frome||from
 fucntion||function
 fuction||function
 fuctions||functions
+fullill||fulfill
 funcation||function
 funcion||function
 functionallity||functionality
@@ -642,6 +656,7 @@ happend||happened
 harware||hardware
 heirarchically||hierarchically
 helpfull||helpful
+hexdecimal||hexadecimal
 hybernate||hibernate
 hierachy||hierarchy
 hierarchie||hierarchy
@@ -709,12 +724,14 @@ initalize||initialize
 initation||initiation
 initators||initiators
 initialiazation||initialization
+initializationg||initialization
 initializiation||initialization
 initialze||initialize
 initialzed||initialized
 initialzing||initializing
 initilization||initialization
 initilize||initialize
+initliaze||initialize
 inofficial||unofficial
 inrerface||interface
 insititute||institute
@@ -779,6 +796,7 @@ itertation||iteration
 itslef||itself
 jave||java
 jeffies||jiffies
+jumpimng||jumping
 juse||just
 jus||just
 kown||known
@@ -839,6 +857,7 @@ messags||messages
 messgaes||messages
 messsage||message
 messsages||messages
+metdata||metadata
 micropone||microphone
 microprocesspr||microprocessor
 migrateable||migratable
@@ -857,6 +876,7 @@ mismactch||mismatch
 missign||missing
 missmanaged||mismanaged
 missmatch||mismatch
+misssing||missing
 miximum||maximum
 mmnemonic||mnemonic
 mnay||many
@@ -912,6 +932,7 @@ occured||occurred
 occuring||occurring
 offser||offset
 offet||offset
+offlaod||offload
 offloded||offloaded
 offseting||offsetting
 omited||omitted
@@ -993,6 +1014,7 @@ poiter||pointer
 posible||possible
 positon||position
 possibilites||possibilities
+potocol||protocol
 powerfull||powerful
 pramater||parameter
 preamle||preamble
@@ -1061,11 +1083,13 @@ psychadelic||psychedelic
 pwoer||power
 queing||queuing
 quering||querying
+queus||queues
 randomally||randomly
 raoming||roaming
 reasearcher||researcher
 reasearchers||researchers
 reasearch||research
+receieve||receive
 recepient||recipient
 recevied||received
 receving||receiving
@@ -1166,6 +1190,7 @@ scaleing||scaling
 scaned||scanned
 scaning||scanning
 scarch||search
+schdule||schedule
 seach||search
 searchs||searches
 secquence||sequence
@@ -1308,6 +1333,7 @@ taskelt||tasklet
 teh||the
 temorary||temporary
 temproarily||temporarily
+temperture||temperature
 thead||thread
 therfore||therefore
 thier||their
@@ -1354,6 +1380,7 @@ uknown||unknown
 usupported||unsupported
 uncommited||uncommitted
 unconditionaly||unconditionally
+undeflow||underflow
 underun||underrun
 unecessary||unnecessary
 unexecpted||unexpected
@@ -1414,6 +1441,7 @@ varible||variable
 varient||variant
 vaule||value
 verbse||verbose
+veify||verify
 verisons||versions
 verison||version
 verson||version
@@ -290,6 +290,40 @@ static void mfd_assert_read_shared(int fd)
 	munmap(p, mfd_def_size);
 }
 
+static void mfd_assert_fork_private_write(int fd)
+{
+	int *p;
+	pid_t pid;
+
+	p = mmap(NULL,
+		 mfd_def_size,
+		 PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+
+	p[0] = 22;
+
+	pid = fork();
+	if (pid == 0) {
+		p[0] = 33;
+		exit(0);
+	} else {
+		waitpid(pid, NULL, 0);
+
+		if (p[0] != 22) {
+			printf("MAP_PRIVATE copy-on-write failed: %m\n");
+			abort();
+		}
+	}
+
+	munmap(p, mfd_def_size);
+}
+
 static void mfd_assert_write(int fd)
 {
 	ssize_t l;
@@ -760,6 +794,8 @@ static void test_seal_future_write(void)
 	mfd_assert_read_shared(fd2);
 	mfd_fail_write(fd2);
 
+	mfd_assert_fork_private_write(fd);
+
 	munmap(p, mfd_def_size);
 	close(fd2);
 	close(fd);
@@ -1,2 +1,3 @@
 CONFIG_SYSVIPC=y
 CONFIG_USERFAULTFD=y
+CONFIG_TEST_VMALLOC=m