Mirror of https://github.com/Fishwaldo/Star64_linux.git (synced 2025-06-24 07:31:41 +00:00)

Merge branch 'akpm' (patches from Andrew)

Merge misc updates from Andrew Morton:

 - a few misc things

 - most of MM

 - KASAN updates

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (102 commits)
  kasan: separate report parts by empty lines
  kasan: improve double-free report format
  kasan: print page description after stacks
  kasan: improve slab object description
  kasan: change report header
  kasan: simplify address description logic
  kasan: change allocation and freeing stack traces headers
  kasan: unify report headers
  kasan: introduce helper functions for determining bug type
  mm: hwpoison: call shake_page() after try_to_unmap() for mlocked page
  mm: hwpoison: call shake_page() unconditionally
  mm/swapfile.c: fix swap space leak in error path of swap_free_entries()
  mm/gup.c: fix access_ok() argument type
  mm/truncate: avoid pointless cleancache_invalidate_inode() calls.
  mm/truncate: bail out early from invalidate_inode_pages2_range() if mapping is empty
  fs/block_dev: always invalidate cleancache in invalidate_bdev()
  fs: fix data invalidation in the cleancache during direct IO
  zram: reduce load operation in page_same_filled
  zram: use zram_free_page instead of open-coded
  zram: introduce zram data accessor
  ...

This commit is contained in commit dd23f273d9.

79 changed files with 2156 additions and 1499 deletions
@@ -871,6 +871,11 @@ PAGE_SIZE multiple when read back.
 
 	  Amount of memory used in network transmission buffers
 
+	shmem
+
+	  Amount of cached filesystem data that is swap-backed,
+	  such as tmpfs, shm segments, shared anonymous mmap()s
+
 	file_mapped
 
 	  Amount of cached filesystem data mapped with mmap()
@@ -413,6 +413,7 @@ Private_Clean:         0 kB
 Private_Dirty:         0 kB
 Referenced:          892 kB
 Anonymous:             0 kB
+LazyFree:              0 kB
 AnonHugePages:         0 kB
 ShmemPmdMapped:        0 kB
 Shared_Hugetlb:        0 kB
@@ -442,6 +443,11 @@ accessed.
 "Anonymous" shows the amount of memory that does not belong to any file.  Even
 a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
 and a page is modified, the file page is replaced by a private anonymous copy.
+"LazyFree" shows the amount of memory which is marked by madvise(MADV_FREE).
+The memory isn't freed immediately with madvise(). It's freed in memory
+pressure if the memory is clean. Please note that the printed value might
+be lower than the real value due to optimizations used in the current
+implementation. If this is not desirable please file a bug report.
 "AnonHugePages" shows the amount of memory backed by transparent hugepage.
 "ShmemPmdMapped" shows the amount of shared (shmem/tmpfs) memory backed by
 huge pages.
@@ -12,6 +12,8 @@ highmem.txt
 	- Outline of highmem and common issues.
 hugetlbpage.txt
 	- a brief summary of hugetlbpage support in the Linux kernel.
+hugetlbfs_reserv.txt
+	- A brief overview of hugetlbfs reservation design/implementation.
 hwpoison.txt
 	- explains what hwpoison is
 idle_page_tracking.txt
Documentation/vm/hugetlbfs_reserv.txt (new file, 529 lines)

@@ -0,0 +1,529 @@
Hugetlbfs Reservation Overview
------------------------------
Huge pages as described at 'Documentation/vm/hugetlbpage.txt' are typically
preallocated for application use.  These huge pages are instantiated in a
task's address space at page fault time if the VMA indicates huge pages are
to be used.  If no huge page exists at page fault time, the task is sent
a SIGBUS and often dies an unhappy death.  Shortly after huge page support
was added, it was determined that it would be better to detect a shortage
of huge pages at mmap() time.  The idea is that if there were not enough
huge pages to cover the mapping, the mmap() would fail.  This was first
done with a simple check in the code at mmap() time to determine if there
were enough free huge pages to cover the mapping.  Like most things in the
kernel, the code has evolved over time.  However, the basic idea was to
'reserve' huge pages at mmap() time to ensure that huge pages would be
available for page faults in that mapping.  The description below attempts to
describe how huge page reserve processing is done in the v4.10 kernel.

Audience
--------
This description is primarily targeted at kernel developers who are modifying
hugetlbfs code.

The Data Structures
-------------------
resv_huge_pages
	This is a global (per-hstate) count of reserved huge pages.  Reserved
	huge pages are only available to the task which reserved them.
	Therefore, the number of huge pages generally available is computed
	as (free_huge_pages - resv_huge_pages).
Reserve Map
	A reserve map is described by the structure:

	struct resv_map {
		struct kref refs;
		spinlock_t lock;
		struct list_head regions;
		long adds_in_progress;
		struct list_head region_cache;
		long region_cache_count;
	};

	There is one reserve map for each huge page mapping in the system.
	The regions list within the resv_map describes the regions within
	the mapping.  A region is described as:

	struct file_region {
		struct list_head link;
		long from;
		long to;
	};

	The 'from' and 'to' fields of the file region structure are huge page
	indices into the mapping.  Depending on the type of mapping, a
	region in the resv_map may indicate reservations exist for the
	range, or reservations do not exist.
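	To make the region bookkeeping concrete, the short user-space sketch
	below (not kernel code; the struct and function names are invented for
	illustration) counts how many huge page indices in a range are already
	covered by a list of [from, to) regions:

	/* Illustrative model only -- not the kernel implementation. */
	#include <stddef.h>

	struct file_region_model {    /* models 'from'/'to' of struct file_region */
		long from;
		long to;
	};

	/*
	 * Count how many huge page indices in [f, t) are covered by the
	 * sorted, non-overlapping regions of a modelled reserve map.
	 */
	static long count_covered(const struct file_region_model *rg, size_t nr,
				  long f, long t)
	{
		long covered = 0;
		size_t i;

		for (i = 0; i < nr; i++) {
			long start = rg[i].from > f ? rg[i].from : f;
			long end   = rg[i].to   < t ? rg[i].to   : t;

			if (end > start)
				covered += end - start;
		}
		return covered;
	}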
Flags for MAP_PRIVATE Reservations
	These are stored in the bottom bits of the reservation map pointer.
	#define HPAGE_RESV_OWNER    (1UL << 0) Indicates this task is the
		owner of the reservations associated with the mapping.
	#define HPAGE_RESV_UNMAPPED (1UL << 1) Indicates task originally
		mapping this range (and creating reserves) has unmapped a
		page from this task (the child) due to a failed COW.
Page Flags
	The PagePrivate page flag is used to indicate that a huge page
	reservation must be restored when the huge page is freed.  More
	details will be discussed in the "Freeing huge pages" section.
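	Because the low bits of an aligned pointer are always zero, flags can
	share storage with the pointer itself.  A minimal user-space sketch of
	this pointer-tagging idea follows (illustrative only; these helpers are
	not the kernel's, and they assume the map pointer is at least 4-byte
	aligned):

	#include <stdint.h>

	#define HPAGE_RESV_OWNER    (1UL << 0)
	#define HPAGE_RESV_UNMAPPED (1UL << 1)
	#define HPAGE_RESV_MASK     (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

	static inline void *resv_map_pointer(unsigned long priv)
	{
		return (void *)(priv & ~HPAGE_RESV_MASK);   /* strip flag bits */
	}

	static inline unsigned long resv_map_flags(unsigned long priv)
	{
		return priv & HPAGE_RESV_MASK;              /* keep only flags */
	}

	static inline unsigned long resv_map_tag(void *map, unsigned long flags)
	{
		return (unsigned long)((uintptr_t)map | (flags & HPAGE_RESV_MASK));
	}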

Reservation Map Location (Private or Shared)
--------------------------------------------
A huge page mapping or segment is either private or shared.  If private,
it is typically only available to a single address space (task).  If shared,
it can be mapped into multiple address spaces (tasks).  The location and
semantics of the reservation map are significantly different for the two
types of mappings.  Location differences are:
- For private mappings, the reservation map hangs off the VMA structure.
  Specifically, vma->vm_private_data.  This reserve map is created at the
  time the mapping (mmap(MAP_PRIVATE)) is created.
- For shared mappings, the reservation map hangs off the inode.  Specifically,
  inode->i_mapping->private_data.  Since shared mappings are always backed
  by files in the hugetlbfs filesystem, the hugetlbfs code ensures each inode
  contains a reservation map.  As a result, the reservation map is allocated
  when the inode is created.

Creating Reservations
---------------------
Reservations are created when a huge page backed shared memory segment is
created (shmget(SHM_HUGETLB)) or a mapping is created via mmap(MAP_HUGETLB).
These operations result in a call to the routine hugetlb_reserve_pages()

	int hugetlb_reserve_pages(struct inode *inode,
				  long from, long to,
				  struct vm_area_struct *vma,
				  vm_flags_t vm_flags)

The first thing hugetlb_reserve_pages() does is check whether the NORESERVE
flag was specified in either the shmget() or mmap() call.  If NORESERVE
was specified, then this routine returns immediately as no reservations
are desired.

The arguments 'from' and 'to' are huge page indices into the mapping or
underlying file.  For shmget(), 'from' is always 0 and 'to' corresponds to
the length of the segment/mapping.  For mmap(), the offset argument could
be used to specify the offset into the underlying file.  In such a case
the 'from' and 'to' arguments have been adjusted by this offset.

One of the big differences between PRIVATE and SHARED mappings is the way
in which reservations are represented in the reservation map (a small
illustrative sketch follows this list).
- For shared mappings, an entry in the reservation map indicates a reservation
  exists or did exist for the corresponding page.  As reservations are
  consumed, the reservation map is not modified.
- For private mappings, the lack of an entry in the reservation map indicates
  a reservation exists for the corresponding page.  As reservations are
  consumed, entries are added to the reservation map.  Therefore, the
  reservation map can also be used to determine which reservations have
  been consumed.
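The sketch below makes the shared/private difference concrete.  It is
illustrative user-space code, not a kernel routine, and it reuses the
count_covered() model from the earlier sketch:

	enum mapping_type { MAPPING_SHARED, MAPPING_PRIVATE };

	/* How many new reservations does the range [from, to) need? */
	static long new_reservations_needed(enum mapping_type type,
					    const struct file_region_model *rg,
					    size_t nr, long from, long to)
	{
		if (type == MAPPING_PRIVATE)
			return to - from;   /* map starts empty; every page needs one */

		/* shared: pages already represented in the map are already reserved */
		return (to - from) - count_covered(rg, nr, from, to);
	}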

For private mappings, hugetlb_reserve_pages() creates the reservation map and
hangs it off the VMA structure.  In addition, the HPAGE_RESV_OWNER flag is set
to indicate this VMA owns the reservations.

The reservation map is consulted to determine how many huge page reservations
are needed for the current mapping/segment.  For private mappings, this is
always the value (to - from).  However, for shared mappings it is possible
that some reservations may already exist within the range (to - from).  See
the section "Reservation Map Modifications" for details on how this is
accomplished.

The mapping may be associated with a subpool.  If so, the subpool is consulted
to ensure there is sufficient space for the mapping.  It is possible that the
subpool has set aside reservations that can be used for the mapping.  See the
section "Subpool Reservations" for more details.

After consulting the reservation map and subpool, the number of needed new
reservations is known.  The routine hugetlb_acct_memory() is called to check
for and take the requested number of reservations.  hugetlb_acct_memory()
calls into routines that potentially allocate and adjust surplus page counts.
However, within those routines the code is simply checking to ensure there
are enough free huge pages to accommodate the reservation.  If there are,
the global reservation count resv_huge_pages is adjusted something like the
following.

	if (resv_needed <= (free_huge_pages - resv_huge_pages))
		resv_huge_pages += resv_needed;

Note that the global lock hugetlb_lock is held when checking and adjusting
these counters.

If there were enough free huge pages and the global count resv_huge_pages
was adjusted, then the reservation map associated with the mapping is
modified to reflect the reservations.  In the case of a shared mapping, a
file_region will exist that includes the range 'from' - 'to'.  For private
mappings, no modifications are made to the reservation map as lack of an
entry indicates a reservation exists.

If hugetlb_reserve_pages() was successful, the global reservation count and
reservation map associated with the mapping will be modified as required to
ensure reservations exist for the range 'from' - 'to'.

Consuming Reservations/Allocating a Huge Page
---------------------------------------------
Reservations are consumed when huge pages associated with the reservations
are allocated and instantiated in the corresponding mapping.  The allocation
is performed within the routine alloc_huge_page()

	struct page *alloc_huge_page(struct vm_area_struct *vma,
				     unsigned long addr, int avoid_reserve)

alloc_huge_page is passed a VMA pointer and a virtual address, so it can
consult the reservation map to determine if a reservation exists.  In
addition, alloc_huge_page takes the argument avoid_reserve which indicates
reserves should not be used even if it appears they have been set aside for
the specified address.  The avoid_reserve argument is most often used in the
case of Copy on Write and Page Migration where additional copies of an
existing page are being allocated.

The helper routine vma_needs_reservation() is called to determine if a
reservation exists for the address within the mapping (vma).  See the section
"Reservation Map Helper Routines" for detailed information on what this
routine does.  The value returned from vma_needs_reservation() is generally
0 or 1: 0 if a reservation exists for the address, 1 if no reservation exists.
If a reservation does not exist, and there is a subpool associated with the
mapping, the subpool is consulted to determine if it contains reservations.
If the subpool contains reservations, one can be used for this allocation.
However, in every case the avoid_reserve argument overrides the use of
a reservation for the allocation.  After determining whether a reservation
exists and can be used for the allocation, the routine dequeue_huge_page_vma()
is called.  This routine takes two arguments related to reservations:
- avoid_reserve, this is the same value/argument passed to alloc_huge_page()
- chg, even though this argument is of type long only the values 0 or 1 are
  passed to dequeue_huge_page_vma.  If the value is 0, it indicates a
  reservation exists (see the section "Memory Policy and Reservations" for
  possible issues).  If the value is 1, it indicates a reservation does not
  exist and the page must be taken from the global free pool if possible.
The free lists associated with the memory policy of the VMA are searched for
a free page.  If a page is found, the value free_huge_pages is decremented
when the page is removed from the free list.  If there was a reservation
associated with the page, the following adjustments are made:

	SetPagePrivate(page);	/* Indicates allocating this page consumed
				 * a reservation, and if an error is
				 * encountered such that the page must be
				 * freed, the reservation will be restored. */
	resv_huge_pages--;	/* Decrement the global reservation count */

Note, if no huge page can be found that satisfies the VMA's memory policy
an attempt will be made to allocate one using the buddy allocator.  This
brings up the issue of surplus huge pages and overcommit which is beyond
the scope of reservations.  Even if a surplus page is allocated, the same
reservation based adjustments as above will be made: SetPagePrivate(page) and
resv_huge_pages--.
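A minimal model of the bookkeeping described above is sketched below.  It is
not kernel code: the counters mirror the names used in this document, and
consume_huge_page() is an invented helper standing in for the dequeue path:

	static long free_huge_pages;   /* pages on the hstate free lists  */
	static long resv_huge_pages;   /* pages promised to existing maps */

	/* chg: 0 if a reservation exists for this address, 1 if it does not */
	static int consume_huge_page(long chg, int avoid_reserve, int *page_private)
	{
		int use_reserve = (!avoid_reserve && chg == 0);

		/* without a reserve, only truly unreserved pages may be taken */
		if (!use_reserve && free_huge_pages - resv_huge_pages <= 0)
			return -1;
		if (free_huge_pages == 0)
			return -1;

		free_huge_pages--;              /* page leaves the free list   */
		*page_private = 0;
		if (use_reserve) {
			*page_private = 1;      /* models SetPagePrivate(page) */
			resv_huge_pages--;      /* the reservation is consumed */
		}
		return 0;
	}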

After obtaining a new huge page, (page)->private is set to the value of
the subpool associated with the page if it exists.  This will be used for
subpool accounting when the page is freed.

The routine vma_commit_reservation() is then called to adjust the reserve
map based on the consumption of the reservation.  In general, this involves
ensuring the page is represented within a file_region structure of the region
map.  For shared mappings where the reservation was present, an entry
in the reserve map already existed so no change is made.  However, if there
was no reservation in a shared mapping or this was a private mapping a new
entry must be created.

It is possible that the reserve map could have been changed between the call
to vma_needs_reservation() at the beginning of alloc_huge_page() and the
call to vma_commit_reservation() after the page was allocated.  This would
be possible if hugetlb_reserve_pages was called for the same page in a shared
mapping.  In such cases, the reservation count and subpool free page count
will be off by one.  This rare condition can be identified by comparing the
return value from vma_needs_reservation and vma_commit_reservation.  If such
a race is detected, the subpool and global reserve counts are adjusted to
compensate.  See the section "Reservation Map Helper Routines" for more
information on these routines.

Instantiate Huge Pages
----------------------
After huge page allocation, the page is typically added to the page tables
of the allocating task.  Before this, pages in a shared mapping are added
to the page cache and pages in private mappings are added to an anonymous
reverse mapping.  In both cases, the PagePrivate flag is cleared.  Therefore,
when a huge page that has been instantiated is freed no adjustment is made
to the global reservation count (resv_huge_pages).

Freeing Huge Pages
------------------
Huge page freeing is performed by the routine free_huge_page().  This routine
is the destructor for hugetlbfs compound pages.  As a result, it is only
passed a pointer to the page struct.  When a huge page is freed, reservation
accounting may need to be performed.  This would be the case if the page was
associated with a subpool that contained reserves, or the page is being freed
on an error path where a global reserve count must be restored.

The page->private field points to any subpool associated with the page.
If the PagePrivate flag is set, it indicates the global reserve count should
be adjusted (see the section "Consuming Reservations/Allocating a Huge Page"
for information on how these are set).

The routine first calls hugepage_subpool_put_pages() for the page.  If this
routine returns a value of 0 (which does not equal the value of 1 that was
passed) it indicates reserves are associated with the subpool, and this newly
freed page must be used to keep the number of subpool reserves above the
minimum size.  Therefore, the global resv_huge_pages counter is incremented
in this case.

If the PagePrivate flag was set in the page, the global resv_huge_pages
counter will always be incremented.
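One way to model how these two conditions can feed the adjustment is sketched
below (illustrative only; in the kernel the decision is made inside
free_huge_page(), the helper name here is invented, and resv_huge_pages reuses
the counter from the earlier sketch):

	/* restore the global reserve count when a freed page still owed one */
	static void account_freed_huge_page(long subpool_put_ret, int page_private)
	{
		int restore_reserve = page_private;   /* PagePrivate was set */

		/*
		 * A 0 return from the subpool 'put' means the freed page is
		 * kept to hold the subpool at its minimum size, so the global
		 * pool must keep treating one page as reserved.
		 */
		if (subpool_put_ret == 0)
			restore_reserve = 1;

		if (restore_reserve)
			resv_huge_pages++;
	}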

Subpool Reservations
--------------------
There is a struct hstate associated with each huge page size.  The hstate
tracks all huge pages of the specified size.  A subpool represents a subset
of pages within a hstate that is associated with a mounted hugetlbfs
filesystem.

When a hugetlbfs filesystem is mounted a min_size option can be specified
which indicates the minimum number of huge pages required by the filesystem.
If this option is specified, the number of huge pages corresponding to
min_size are reserved for use by the filesystem.  This number is tracked in
the min_hpages field of a struct hugepage_subpool.  At mount time,
hugetlb_acct_memory(min_hpages) is called to reserve the specified number of
huge pages.  If they can not be reserved, the mount fails.

The routines hugepage_subpool_get/put_pages() are called when pages are
obtained from or released back to a subpool.  They perform all subpool
accounting, and track any reservations associated with the subpool.
hugepage_subpool_get/put_pages are passed the number of huge pages by which
to adjust the subpool 'used page' count (down for get, up for put).  Normally,
they return the same value that was passed or an error if not enough pages
exist in the subpool.

However, if reserves are associated with the subpool a return value less
than the passed value may be returned.  This return value indicates the
number of additional global pool adjustments which must be made.  For example,
suppose a subpool contains 3 reserved huge pages and someone asks for 5.
The 3 reserved pages associated with the subpool can be used to satisfy part
of the request.  But, 2 pages must be obtained from the global pools.  To
relay this information to the caller, the value 2 is returned.  The caller
is then responsible for attempting to obtain the additional two pages from
the global pools.
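The worked example above can be written out as a small model.  This is
illustrative user-space code, not the kernel routine, and the structure and
field names are invented for the sketch:

	struct subpool_model {
		long max_hpages;    /* limit on pages, or -1 for no limit */
		long used_hpages;   /* pages handed out so far            */
		long rsv_hpages;    /* reserves still held by the subpool */
	};

	/*
	 * Returns how many pages the caller must still take from the global
	 * pools, or -1 if the request exceeds the subpool limit.
	 */
	static long subpool_get_pages_model(struct subpool_model *sp, long delta)
	{
		long from_global;

		if (sp->max_hpages != -1 &&
		    sp->used_hpages + delta > sp->max_hpages)
			return -1;
		sp->used_hpages += delta;

		if (sp->rsv_hpages >= delta) {
			sp->rsv_hpages -= delta;
			from_global = 0;   /* fully covered by subpool reserves */
		} else {
			from_global = delta - sp->rsv_hpages;
			sp->rsv_hpages = 0;
		}
		return from_global;
	}

With rsv_hpages = 3 and delta = 5, the model returns 2, matching the example.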

COW and Reservations
--------------------
Since shared mappings all point to and use the same underlying pages, the
biggest reservation concern for COW is private mappings.  In this case,
two tasks can be pointing at the same previously allocated page.  One task
attempts to write to the page, so a new page must be allocated so that each
task points to its own page.

When the page was originally allocated, the reservation for that page was
consumed.  When an attempt to allocate a new page is made as a result of
COW, it is possible that no free huge pages are available and the allocation
will fail.

When the private mapping was originally created, the owner of the mapping
was noted by setting the HPAGE_RESV_OWNER bit in the pointer to the reservation
map of the owner.  Since the owner created the mapping, the owner owns all
the reservations associated with the mapping.  Therefore, when a write fault
occurs and there is no page available, different action is taken for the owner
and non-owner of the reservation.

In the case where the faulting task is not the owner, the fault will fail and
the task will typically receive a SIGBUS.

If the owner is the faulting task, we want it to succeed since it owned the
original reservation.  To accomplish this, the page is unmapped from the
non-owning task.  In this way, the only reference is from the owning task.
In addition, the HPAGE_RESV_UNMAPPED bit is set in the reservation map pointer
of the non-owning task.  The non-owning task may receive a SIGBUS if it later
faults on a non-present page.  But, the original owner of the
mapping/reservation will behave as expected.

Reservation Map Modifications
-----------------------------
The following low level routines are used to make modifications to a
reservation map.  Typically, these routines are not called directly.  Rather,
a reservation map helper routine is called which calls one of these low level
routines.  These low level routines are fairly well documented in the source
code (mm/hugetlb.c).  These routines are:

	long region_chg(struct resv_map *resv, long f, long t);
	long region_add(struct resv_map *resv, long f, long t);
	void region_abort(struct resv_map *resv, long f, long t);
	long region_count(struct resv_map *resv, long f, long t);

Operations on the reservation map typically involve two operations:
1) region_chg() is called to examine the reserve map and determine how
   many pages in the specified range [f, t) are NOT currently represented.

   The calling code performs global checks and allocations to determine if
   there are enough huge pages for the operation to succeed.

2a) If the operation can succeed, region_add() is called to actually modify
    the reservation map for the same range [f, t) previously passed to
    region_chg().
2b) If the operation can not succeed, region_abort is called for the same
    range [f, t) to abort the operation.

Note that this is a two step process where region_add() and region_abort()
are guaranteed to succeed after a prior call to region_chg() for the same
range.  region_chg() is responsible for pre-allocating any data structures
necessary to ensure the subsequent operations (specifically region_add())
will succeed.
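The calling pattern can be sketched as follows.  This is only an outline of
the two-step sequence described above; global_reserve_pages() is a made-up
stand-in for whatever global checks and allocations the caller performs:

	static int reserve_range(struct resv_map *resv, long f, long t)
	{
		long chg = region_chg(resv, f, t);   /* pages not yet in the map */

		if (chg < 0)
			return chg;

		if (global_reserve_pages(chg) < 0) { /* hypothetical global check */
			region_abort(resv, f, t);    /* undo the prepared add    */
			return -1;
		}

		/* guaranteed to succeed after the earlier region_chg() */
		if (region_add(resv, f, t) != chg)
			return 1;   /* map changed in between; caller must adjust */
		return 0;
	}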

As mentioned above, region_chg() determines the number of pages in the range
which are NOT currently represented in the map.  This number is returned to
the caller.  region_add() returns the number of pages in the range added to
the map.  In most cases, the return value of region_add() is the same as the
return value of region_chg().  However, in the case of shared mappings it is
possible for changes to the reservation map to be made between the calls to
region_chg() and region_add().  In this case, the return value of region_add()
will not match the return value of region_chg().  It is likely that in such
cases global counts and subpool accounting will be incorrect and in need of
adjustment.  It is the responsibility of the caller to check for this
condition and make the appropriate adjustments.

The routine region_del() is called to remove regions from a reservation map.
It is typically called in the following situations:
- When a file in the hugetlbfs filesystem is being removed, the inode will
  be released and the reservation map freed.  Before freeing the reservation
  map, all the individual file_region structures must be freed.  In this case
  region_del is passed the range [0, LONG_MAX).
- When a hugetlbfs file is being truncated.  In this case, all allocated pages
  after the new file size must be freed.  In addition, any file_region entries
  in the reservation map past the new end of file must be deleted.  In this
  case, region_del is passed the range [new_end_of_file, LONG_MAX).
- When a hole is being punched in a hugetlbfs file.  In this case, huge pages
  are removed from the middle of the file one at a time.  As the pages are
  removed, region_del() is called to remove the corresponding entry from the
  reservation map.  In this case, region_del is passed the range
  [page_idx, page_idx + 1).
In every case, region_del() will return the number of pages removed from the
reservation map.  In VERY rare cases, region_del() can fail.  This can only
happen in the hole punch case where it has to split an existing file_region
entry and can not allocate a new structure.  In this error case, region_del()
will return -ENOMEM.  The problem here is that the reservation map will
indicate that there is a reservation for the page.  However, the subpool and
global reservation counts will not reflect the reservation.  To handle this
situation, the routine hugetlb_fix_reserve_counts() is called to adjust the
counters so that they correspond with the reservation map entry that could
not be deleted.

region_count() is called when unmapping a private huge page mapping.  In
private mappings, the lack of an entry in the reservation map indicates that
a reservation exists.  Therefore, by counting the number of entries in the
reservation map we know how many reservations were consumed and how many are
outstanding (outstanding = (end - start) - region_count(resv, start, end)).
Since the mapping is going away, the subpool and global reservation counts
are decremented by the number of outstanding reservations.
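As a sketch (illustrative only; the two counters are plain variables standing
in for the subpool and global reserve counts):

	static void unmap_private_accounting(struct resv_map *resv,
					     long start, long end,
					     long *subpool_rsv, long *global_rsv)
	{
		long consumed = region_count(resv, start, end);
		long outstanding = (end - start) - consumed;

		/* reservations that were never faulted in are given back */
		*subpool_rsv -= outstanding;
		*global_rsv  -= outstanding;
	}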

Reservation Map Helper Routines
-------------------------------
Several helper routines exist to query and modify the reservation maps.
These routines are only concerned with reservations for a specific huge
page, so they just pass in an address instead of a range.  In addition,
they pass in the associated VMA.  From the VMA, the type of mapping (private
or shared) and the location of the reservation map (inode or VMA) can be
determined.  These routines simply call the underlying routines described
in the section "Reservation Map Modifications".  However, they do take into
account the 'opposite' meaning of reservation map entries for private and
shared mappings and hide this detail from the caller.

	long vma_needs_reservation(struct hstate *h,
				   struct vm_area_struct *vma, unsigned long addr)

This routine calls region_chg() for the specified page.  If no reservation
exists, 1 is returned.  If a reservation exists, 0 is returned.

	long vma_commit_reservation(struct hstate *h,
				    struct vm_area_struct *vma, unsigned long addr)

This calls region_add() for the specified page.  As in the case of region_chg
and region_add, this routine is to be called after a previous call to
vma_needs_reservation.  It will add a reservation entry for the page.  It
returns 1 if the reservation was added and 0 if not.  The return value should
be compared with the return value of the previous call to
vma_needs_reservation.  An unexpected difference indicates the reservation
map was modified between calls.

	void vma_end_reservation(struct hstate *h,
				 struct vm_area_struct *vma, unsigned long addr)

This calls region_abort() for the specified page.  As in the case of region_chg
and region_abort, this routine is to be called after a previous call to
vma_needs_reservation.  It will abort/end the in progress reservation add
operation.

	long vma_add_reservation(struct hstate *h,
				 struct vm_area_struct *vma, unsigned long addr)

This is a special wrapper routine to help facilitate reservation cleanup
on error paths.  It is only called from the routine restore_reserve_on_error().
This routine is used in conjunction with vma_needs_reservation in an attempt
to add a reservation to the reservation map.  It takes into account the
different reservation map semantics for private and shared mappings.  Hence,
region_add is called for shared mappings (as an entry present in the map
indicates a reservation), and region_del is called for private mappings (as
the absence of an entry in the map indicates a reservation).  See the section
"Reservation cleanup in error paths" for more information on what needs to
be done on error paths.
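Schematically, the sequence these helpers implement looks like the outline
below.  In the kernel this sequencing is driven from within alloc_huge_page()
and the fault paths; the wrapper and allocate_one_huge_page() here are purely
illustrative:

	static struct page *reserve_then_allocate(struct hstate *h,
						  struct vm_area_struct *vma,
						  unsigned long addr)
	{
		long needed = vma_needs_reservation(h, vma, addr);
		struct page *page = allocate_one_huge_page(h, vma, addr);

		if (!page) {
			vma_end_reservation(h, vma, addr);  /* abort pending add */
			return NULL;
		}
		if (vma_commit_reservation(h, vma, addr) != needed) {
			/*
			 * The map changed between the two calls: the global and
			 * subpool counts are now off by one and must be
			 * compensated, as described above.
			 */
		}
		return page;
	}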

Reservation Cleanup in Error Paths
----------------------------------
As mentioned in the section "Reservation Map Helper Routines", reservation
map modifications are performed in two steps.  First vma_needs_reservation
is called before a page is allocated.  If the allocation is successful,
then vma_commit_reservation is called.  If not, vma_end_reservation is called.
Global and subpool reservation counts are adjusted based on success or failure
of the operation and all is well.

Additionally, after a huge page is instantiated the PagePrivate flag is
cleared so that accounting when the page is ultimately freed is correct.

However, there are several instances where errors are encountered after a huge
page is allocated but before it is instantiated.  In this case, the page
allocation has consumed the reservation and made the appropriate subpool,
reservation map and global count adjustments.  If the page is freed at this
time (before instantiation and clearing of PagePrivate), then free_huge_page
will increment the global reservation count.  However, the reservation map
indicates the reservation was consumed.  This resulting inconsistent state
will cause the 'leak' of a reserved huge page.  The global reserve count will
be higher than it should and prevent allocation of a pre-allocated page.

The routine restore_reserve_on_error() attempts to handle this situation.  It
is fairly well documented.  The intention of this routine is to restore
the reservation map to the way it was before the page allocation.  In this
way, the state of the reservation map will correspond to the global
reservation count after the page is freed.

The routine restore_reserve_on_error itself may encounter errors while
attempting to restore the reservation map entry.  In this case, it will
simply clear the PagePrivate flag of the page.  In this way, the global
reserve count will not be incremented when the page is freed.  However, the
reservation map will continue to look as though the reservation was consumed.
A page can still be allocated for the address, but it will not use a reserved
page as originally intended.

There is some code (most notably userfaultfd) which can not call
restore_reserve_on_error.  In this case, it simply modifies the PagePrivate
flag so that a reservation will not be leaked when the huge page is freed.

Reservations and Memory Policy
------------------------------
Per-node huge page lists existed in struct hstate when git was first used
to manage Linux code.  The concept of reservations was added some time later.
When reservations were added, no attempt was made to take memory policy
into account.  While cpusets are not exactly the same as memory policy, this
comment in hugetlb_acct_memory sums up the interaction between reservations
and cpusets/memory policy.

	/*
	 * When cpuset is configured, it breaks the strict hugetlb page
	 * reservation as the accounting is done on a global variable. Such
	 * reservation is completely rubbish in the presence of cpuset because
	 * the reservation is not checked against page availability for the
	 * current cpuset. Application can still potentially OOM'ed by kernel
	 * with lack of free htlb page in cpuset that the task is in.
	 * Attempt to enforce strict accounting with cpuset is almost
	 * impossible (or too ugly) because cpuset is too fluid that
	 * task or memory node can be dynamically moved between cpusets.
	 *
	 * The change of semantics for shared hugetlb mapping with cpuset is
	 * undesirable. However, in order to preserve some of the semantics,
	 * we fall back to check against current free page availability as
	 * a best attempt and hopefully to minimize the impact of changing
	 * semantics that cpuset has.
	 */

Huge page reservations were added to prevent unexpected page allocation
failures (OOM) at page fault time.  However, if an application makes use
of cpusets or memory policy there is no guarantee that huge pages will be
available on the required nodes.  This is true even if there are a sufficient
number of global reservations.

Mike Kravetz, 7 April 2017
@@ -97,6 +97,9 @@ EXPORT_SYMBOL(clk_enable);
 
 void clk_disable(struct clk *clk)
 {
+	if (!clk)
+		return;
+
 	if (clk->ops && clk->ops->disable)
 		clk->ops->disable(clk);
 }
@@ -45,6 +45,8 @@ static const char *default_compressor = "lzo";
 /* Module params (documentation at end) */
 static unsigned int num_devices = 1;
 
+static void zram_free_page(struct zram *zram, size_t index);
+
 static inline bool init_done(struct zram *zram)
 {
 	return zram->disksize;
@@ -55,53 +57,70 @@ static inline struct zram *dev_to_zram(struct device *dev)
 	return (struct zram *)dev_to_disk(dev)->private_data;
 }
 
+static unsigned long zram_get_handle(struct zram *zram, u32 index)
+{
+	return zram->table[index].handle;
+}
+
+static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle)
+{
+	zram->table[index].handle = handle;
+}
+
 /* flag operations require table entry bit_spin_lock() being held */
-static int zram_test_flag(struct zram_meta *meta, u32 index,
+static int zram_test_flag(struct zram *zram, u32 index,
 			enum zram_pageflags flag)
 {
-	return meta->table[index].value & BIT(flag);
+	return zram->table[index].value & BIT(flag);
 }
 
-static void zram_set_flag(struct zram_meta *meta, u32 index,
+static void zram_set_flag(struct zram *zram, u32 index,
 			enum zram_pageflags flag)
 {
-	meta->table[index].value |= BIT(flag);
+	zram->table[index].value |= BIT(flag);
 }
 
-static void zram_clear_flag(struct zram_meta *meta, u32 index,
+static void zram_clear_flag(struct zram *zram, u32 index,
 			enum zram_pageflags flag)
 {
-	meta->table[index].value &= ~BIT(flag);
+	zram->table[index].value &= ~BIT(flag);
 }
 
-static inline void zram_set_element(struct zram_meta *meta, u32 index,
+static inline void zram_set_element(struct zram *zram, u32 index,
 			unsigned long element)
 {
-	meta->table[index].element = element;
+	zram->table[index].element = element;
 }
 
-static inline void zram_clear_element(struct zram_meta *meta, u32 index)
+static unsigned long zram_get_element(struct zram *zram, u32 index)
 {
-	meta->table[index].element = 0;
+	return zram->table[index].element;
 }
 
-static size_t zram_get_obj_size(struct zram_meta *meta, u32 index)
+static size_t zram_get_obj_size(struct zram *zram, u32 index)
 {
-	return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
+	return zram->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
 }
 
-static void zram_set_obj_size(struct zram_meta *meta,
+static void zram_set_obj_size(struct zram *zram,
 					u32 index, size_t size)
 {
-	unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT;
+	unsigned long flags = zram->table[index].value >> ZRAM_FLAG_SHIFT;
 
-	meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
+	zram->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
 }
 
+#if PAGE_SIZE != 4096
 static inline bool is_partial_io(struct bio_vec *bvec)
 {
 	return bvec->bv_len != PAGE_SIZE;
 }
+#else
+static inline bool is_partial_io(struct bio_vec *bvec)
+{
+	return false;
+}
+#endif
 
 static void zram_revalidate_disk(struct zram *zram)
 {
@@ -137,8 +156,7 @@ static inline bool valid_io_request(struct zram *zram,
 
 static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
 {
-	if (*offset + bvec->bv_len >= PAGE_SIZE)
-		(*index)++;
+	*index += (*offset + bvec->bv_len) / PAGE_SIZE;
 	*offset = (*offset + bvec->bv_len) % PAGE_SIZE;
 }
 
@@ -177,31 +195,21 @@ static bool page_same_filled(void *ptr, unsigned long *element)
 {
 	unsigned int pos;
 	unsigned long *page;
+	unsigned long val;
 
 	page = (unsigned long *)ptr;
+	val = page[0];
 
-	for (pos = 0; pos < PAGE_SIZE / sizeof(*page) - 1; pos++) {
-		if (page[pos] != page[pos + 1])
+	for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) {
+		if (val != page[pos])
 			return false;
 	}
 
-	*element = page[pos];
+	*element = val;
 
 	return true;
 }
 
-static void handle_same_page(struct bio_vec *bvec, unsigned long element)
-{
-	struct page *page = bvec->bv_page;
-	void *user_mem;
-
-	user_mem = kmap_atomic(page);
-	zram_fill_page(user_mem + bvec->bv_offset, bvec->bv_len, element);
-	kunmap_atomic(user_mem);
-
-	flush_dcache_page(page);
-}
-
 static ssize_t initstate_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
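The hunk above ("zram: reduce load operation in page_same_filled") switches the
same-filled check from comparing adjacent words to comparing every word against
the first one, halving the loads per iteration.  A small user-space model of
the new logic (not the driver code; PAGE_SIZE is fixed here for illustration):

	#include <stdbool.h>
	#include <stddef.h>

	#define MODEL_PAGE_SIZE 4096UL

	static bool page_same_filled_model(const void *ptr, unsigned long *element)
	{
		const unsigned long *page = ptr;
		unsigned long val = page[0];
		size_t pos;

		/* one load of page[0], then a single pass over the rest */
		for (pos = 1; pos < MODEL_PAGE_SIZE / sizeof(*page); pos++) {
			if (page[pos] != val)
				return false;
		}
		*element = val;
		return true;
	}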
@@ -254,9 +262,8 @@ static ssize_t mem_used_max_store(struct device *dev,
 
 	down_read(&zram->init_lock);
 	if (init_done(zram)) {
-		struct zram_meta *meta = zram->meta;
 		atomic_long_set(&zram->stats.max_used_pages,
-				zs_get_total_pages(meta->mem_pool));
+				zs_get_total_pages(zram->mem_pool));
 	}
 	up_read(&zram->init_lock);
 
@@ -329,7 +336,6 @@ static ssize_t compact_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t len)
 {
 	struct zram *zram = dev_to_zram(dev);
-	struct zram_meta *meta;
 
 	down_read(&zram->init_lock);
 	if (!init_done(zram)) {
@@ -337,8 +343,7 @@ static ssize_t compact_store(struct device *dev,
 		return -EINVAL;
 	}
 
-	meta = zram->meta;
-	zs_compact(meta->mem_pool);
+	zs_compact(zram->mem_pool);
 	up_read(&zram->init_lock);
 
 	return len;
@@ -375,8 +380,8 @@ static ssize_t mm_stat_show(struct device *dev,
 
 	down_read(&zram->init_lock);
 	if (init_done(zram)) {
-		mem_used = zs_get_total_pages(zram->meta->mem_pool);
-		zs_pool_stats(zram->meta->mem_pool, &pool_stats);
+		mem_used = zs_get_total_pages(zram->mem_pool);
+		zs_pool_stats(zram->mem_pool, &pool_stats);
 	}
 
 	orig_size = atomic64_read(&zram->stats.pages_stored);
@ -417,56 +422,89 @@ static DEVICE_ATTR_RO(io_stat);
|
||||||
static DEVICE_ATTR_RO(mm_stat);
|
static DEVICE_ATTR_RO(mm_stat);
|
||||||
static DEVICE_ATTR_RO(debug_stat);
|
static DEVICE_ATTR_RO(debug_stat);
|
||||||
|
|
||||||
static void zram_meta_free(struct zram_meta *meta, u64 disksize)
|
static void zram_slot_lock(struct zram *zram, u32 index)
|
||||||
|
{
|
||||||
|
bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void zram_slot_unlock(struct zram *zram, u32 index)
|
||||||
|
{
|
||||||
|
bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool zram_same_page_read(struct zram *zram, u32 index,
|
||||||
|
struct page *page,
|
||||||
|
unsigned int offset, unsigned int len)
|
||||||
|
{
|
||||||
|
zram_slot_lock(zram, index);
|
||||||
|
if (unlikely(!zram_get_handle(zram, index) ||
|
||||||
|
zram_test_flag(zram, index, ZRAM_SAME))) {
|
||||||
|
void *mem;
|
||||||
|
|
||||||
|
zram_slot_unlock(zram, index);
|
||||||
|
mem = kmap_atomic(page);
|
||||||
|
zram_fill_page(mem + offset, len,
|
||||||
|
zram_get_element(zram, index));
|
||||||
|
kunmap_atomic(mem);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
zram_slot_unlock(zram, index);
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool zram_same_page_write(struct zram *zram, u32 index,
|
||||||
|
struct page *page)
|
||||||
|
{
|
||||||
|
unsigned long element;
|
||||||
|
void *mem = kmap_atomic(page);
|
||||||
|
|
||||||
|
if (page_same_filled(mem, &element)) {
|
||||||
|
kunmap_atomic(mem);
|
||||||
|
/* Free memory associated with this sector now. */
|
||||||
|
zram_slot_lock(zram, index);
|
||||||
|
zram_free_page(zram, index);
|
||||||
|
zram_set_flag(zram, index, ZRAM_SAME);
|
||||||
|
zram_set_element(zram, index, element);
|
||||||
|
zram_slot_unlock(zram, index);
|
||||||
|
|
||||||
|
atomic64_inc(&zram->stats.same_pages);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
kunmap_atomic(mem);
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void zram_meta_free(struct zram *zram, u64 disksize)
|
||||||
{
|
{
|
||||||
size_t num_pages = disksize >> PAGE_SHIFT;
|
size_t num_pages = disksize >> PAGE_SHIFT;
|
||||||
size_t index;
|
size_t index;
|
||||||
|
|
||||||
/* Free all pages that are still in this zram device */
|
/* Free all pages that are still in this zram device */
|
||||||
for (index = 0; index < num_pages; index++) {
|
for (index = 0; index < num_pages; index++)
|
||||||
unsigned long handle = meta->table[index].handle;
|
zram_free_page(zram, index);
|
||||||
/*
|
|
||||||
* No memory is allocated for same element filled pages.
|
|
||||||
* Simply clear same page flag.
|
|
||||||
*/
|
|
||||||
if (!handle || zram_test_flag(meta, index, ZRAM_SAME))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
zs_free(meta->mem_pool, handle);
|
zs_destroy_pool(zram->mem_pool);
|
||||||
}
|
vfree(zram->table);
|
||||||
|
|
||||||
zs_destroy_pool(meta->mem_pool);
|
|
||||||
vfree(meta->table);
|
|
||||||
kfree(meta);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
|
static bool zram_meta_alloc(struct zram *zram, u64 disksize)
|
||||||
{
|
{
|
||||||
size_t num_pages;
|
size_t num_pages;
|
||||||
struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);
|
|
||||||
|
|
||||||
if (!meta)
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
num_pages = disksize >> PAGE_SHIFT;
|
num_pages = disksize >> PAGE_SHIFT;
|
||||||
meta->table = vzalloc(num_pages * sizeof(*meta->table));
|
zram->table = vzalloc(num_pages * sizeof(*zram->table));
|
||||||
if (!meta->table) {
|
if (!zram->table)
|
||||||
pr_err("Error allocating zram address table\n");
|
return false;
|
||||||
goto out_error;
|
|
||||||
|
zram->mem_pool = zs_create_pool(zram->disk->disk_name);
|
||||||
|
if (!zram->mem_pool) {
|
||||||
|
vfree(zram->table);
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
meta->mem_pool = zs_create_pool(pool_name);
|
return true;
|
||||||
if (!meta->mem_pool) {
|
|
||||||
pr_err("Error creating memory pool\n");
|
|
||||||
goto out_error;
|
|
||||||
}
|
|
||||||
|
|
||||||
return meta;
|
|
||||||
|
|
||||||
out_error:
|
|
||||||
vfree(meta->table);
|
|
||||||
kfree(meta);
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -476,16 +514,15 @@ out_error:
|
||||||
*/
|
*/
|
||||||
static void zram_free_page(struct zram *zram, size_t index)
|
static void zram_free_page(struct zram *zram, size_t index)
|
||||||
{
|
{
|
||||||
struct zram_meta *meta = zram->meta;
|
unsigned long handle = zram_get_handle(zram, index);
|
||||||
unsigned long handle = meta->table[index].handle;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* No memory is allocated for same element filled pages.
|
* No memory is allocated for same element filled pages.
|
||||||
* Simply clear same page flag.
|
* Simply clear same page flag.
|
||||||
*/
|
*/
|
||||||
if (zram_test_flag(meta, index, ZRAM_SAME)) {
|
if (zram_test_flag(zram, index, ZRAM_SAME)) {
|
||||||
zram_clear_flag(meta, index, ZRAM_SAME);
|
zram_clear_flag(zram, index, ZRAM_SAME);
|
||||||
zram_clear_element(meta, index);
|
zram_set_element(zram, index, 0);
|
||||||
atomic64_dec(&zram->stats.same_pages);
|
atomic64_dec(&zram->stats.same_pages);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -493,179 +530,111 @@ static void zram_free_page(struct zram *zram, size_t index)
|
||||||
if (!handle)
|
if (!handle)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
zs_free(meta->mem_pool, handle);
|
zs_free(zram->mem_pool, handle);
|
||||||
|
|
||||||
atomic64_sub(zram_get_obj_size(meta, index),
|
atomic64_sub(zram_get_obj_size(zram, index),
|
||||||
&zram->stats.compr_data_size);
|
&zram->stats.compr_data_size);
|
||||||
atomic64_dec(&zram->stats.pages_stored);
|
atomic64_dec(&zram->stats.pages_stored);
|
||||||
|
|
||||||
meta->table[index].handle = 0;
|
zram_set_handle(zram, index, 0);
|
||||||
zram_set_obj_size(meta, index, 0);
|
zram_set_obj_size(zram, index, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
|
static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
|
||||||
{
|
{
|
||||||
int ret = 0;
|
int ret;
|
||||||
unsigned char *cmem;
|
|
||||||
drivers/block/zram/zram_drv.c

-struct zram_meta *meta = zram->meta;
 unsigned long handle;
 unsigned int size;
+void *src, *dst;

-bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
-handle = meta->table[index].handle;
-size = zram_get_obj_size(meta, index);
-
-if (!handle || zram_test_flag(meta, index, ZRAM_SAME)) {
-bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
-zram_fill_page(mem, PAGE_SIZE, meta->table[index].element);
+if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE))
 return 0;
-}

-cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
+zram_slot_lock(zram, index);
+handle = zram_get_handle(zram, index);
+size = zram_get_obj_size(zram, index);
+
+src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
 if (size == PAGE_SIZE) {
-memcpy(mem, cmem, PAGE_SIZE);
+dst = kmap_atomic(page);
+memcpy(dst, src, PAGE_SIZE);
+kunmap_atomic(dst);
+ret = 0;
 } else {
 struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);

-ret = zcomp_decompress(zstrm, cmem, size, mem);
+dst = kmap_atomic(page);
+ret = zcomp_decompress(zstrm, src, size, dst);
+kunmap_atomic(dst);
 zcomp_stream_put(zram->comp);
 }
-zs_unmap_object(meta->mem_pool, handle);
-bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+zs_unmap_object(zram->mem_pool, handle);
+zram_slot_unlock(zram, index);

-/* Should NEVER happen. Return bio error if it does. */
-if (unlikely(ret)) {
-pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
-return ret;
-}
-
-return 0;
-}
-
-static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
-u32 index, int offset)
-{
-int ret;
-struct page *page;
-unsigned char *user_mem, *uncmem = NULL;
-struct zram_meta *meta = zram->meta;
-page = bvec->bv_page;
-
-bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
-if (unlikely(!meta->table[index].handle) ||
-zram_test_flag(meta, index, ZRAM_SAME)) {
-bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
-handle_same_page(bvec, meta->table[index].element);
-return 0;
-}
-bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
-
-if (is_partial_io(bvec))
-/* Use a temporary buffer to decompress the page */
-uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
-
-user_mem = kmap_atomic(page);
-if (!is_partial_io(bvec))
-uncmem = user_mem;
-
-if (!uncmem) {
-pr_err("Unable to allocate temp memory\n");
-ret = -ENOMEM;
-goto out_cleanup;
-}
-
-ret = zram_decompress_page(zram, uncmem, index);
 /* Should NEVER happen. Return bio error if it does. */
 if (unlikely(ret))
-goto out_cleanup;
-
-if (is_partial_io(bvec))
-memcpy(user_mem + bvec->bv_offset, uncmem + offset,
-bvec->bv_len);
-
-flush_dcache_page(page);
-ret = 0;
-out_cleanup:
-kunmap_atomic(user_mem);
-if (is_partial_io(bvec))
-kfree(uncmem);
+pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
+
 return ret;
 }
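Note on the read path above: the per-entry bit_spin_lock()/bit_spin_unlock() calls are replaced by zram_slot_lock()/zram_slot_unlock(), and the handle lookup by zram_get_handle(). Those helpers are introduced elsewhere in the series; a minimal sketch of what they plausibly look like, assuming they are thin wrappers over the same bit spinlock and the table entry now embedded in struct zram (the names come from the hunk, the bodies are assumed):

    /* Sketch only: assumed implementation, not copied from this patch. */
    static void zram_slot_lock(struct zram *zram, u32 index)
    {
        bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value);
    }

    static void zram_slot_unlock(struct zram *zram, u32 index)
    {
        bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value);
    }

    static unsigned long zram_get_handle(struct zram *zram, u32 index)
    {
        return zram->table[index].handle;
    }

    static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle)
    {
        zram->table[index].handle = handle;
    }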
-static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
-int offset)
+static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
+u32 index, int offset)
 {
-int ret = 0;
-unsigned int clen;
-unsigned long handle = 0;
+int ret;
 struct page *page;
-unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
-struct zram_meta *meta = zram->meta;
-struct zcomp_strm *zstrm = NULL;
-unsigned long alloced_pages;
-unsigned long element;

 page = bvec->bv_page;
 if (is_partial_io(bvec)) {
-/*
-* This is a partial IO. We need to read the full page
-* before to write the changes.
-*/
-uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
-if (!uncmem) {
-ret = -ENOMEM;
-goto out;
-}
-ret = zram_decompress_page(zram, uncmem, index);
-if (ret)
-goto out;
+/* Use a temporary buffer to decompress the page */
+page = alloc_page(GFP_NOIO|__GFP_HIGHMEM);
+if (!page)
+return -ENOMEM;
 }

+ret = zram_decompress_page(zram, page, index);
+if (unlikely(ret))
+goto out;
+
+if (is_partial_io(bvec)) {
+void *dst = kmap_atomic(bvec->bv_page);
+void *src = kmap_atomic(page);
+
+memcpy(dst + bvec->bv_offset, src + offset, bvec->bv_len);
+kunmap_atomic(src);
+kunmap_atomic(dst);
+}
+out:
+if (is_partial_io(bvec))
+__free_page(page);
+
+return ret;
+}
+
+static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
+struct page *page,
+unsigned long *out_handle, unsigned int *out_comp_len)
+{
+int ret;
+unsigned int comp_len;
+void *src;
+unsigned long alloced_pages;
+unsigned long handle = 0;
+
 compress_again:
-user_mem = kmap_atomic(page);
-if (is_partial_io(bvec)) {
-memcpy(uncmem + offset, user_mem + bvec->bv_offset,
-bvec->bv_len);
-kunmap_atomic(user_mem);
-user_mem = NULL;
-} else {
-uncmem = user_mem;
-}
-
-if (page_same_filled(uncmem, &element)) {
-if (user_mem)
-kunmap_atomic(user_mem);
-/* Free memory associated with this sector now. */
-bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
-zram_free_page(zram, index);
-zram_set_flag(meta, index, ZRAM_SAME);
-zram_set_element(meta, index, element);
-bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
-
-atomic64_inc(&zram->stats.same_pages);
-ret = 0;
-goto out;
-}
-
-zstrm = zcomp_stream_get(zram->comp);
-ret = zcomp_compress(zstrm, uncmem, &clen);
-if (!is_partial_io(bvec)) {
-kunmap_atomic(user_mem);
-user_mem = NULL;
-uncmem = NULL;
-}
+src = kmap_atomic(page);
+ret = zcomp_compress(*zstrm, src, &comp_len);
+kunmap_atomic(src);

 if (unlikely(ret)) {
 pr_err("Compression failed! err=%d\n", ret);
-goto out;
+if (handle)
+zs_free(zram->mem_pool, handle);
+return ret;
 }

-src = zstrm->buffer;
-if (unlikely(clen > max_zpage_size)) {
-clen = PAGE_SIZE;
-if (is_partial_io(bvec))
-src = uncmem;
-}
+if (unlikely(comp_len > max_zpage_size))
+comp_len = PAGE_SIZE;

 /*
 * handle allocation has 2 paths:
@@ -681,71 +650,121 @@ compress_again:
 * from the slow path and handle has already been allocated.
 */
 if (!handle)
-handle = zs_malloc(meta->mem_pool, clen,
+handle = zs_malloc(zram->mem_pool, comp_len,
 __GFP_KSWAPD_RECLAIM |
 __GFP_NOWARN |
 __GFP_HIGHMEM |
 __GFP_MOVABLE);
 if (!handle) {
 zcomp_stream_put(zram->comp);
-zstrm = NULL;
-
 atomic64_inc(&zram->stats.writestall);
-
-handle = zs_malloc(meta->mem_pool, clen,
+handle = zs_malloc(zram->mem_pool, comp_len,
 GFP_NOIO | __GFP_HIGHMEM |
 __GFP_MOVABLE);
+*zstrm = zcomp_stream_get(zram->comp);
 if (handle)
 goto compress_again;
-
-pr_err("Error allocating memory for compressed page: %u, size=%u\n",
-index, clen);
-ret = -ENOMEM;
-goto out;
+return -ENOMEM;
 }

-alloced_pages = zs_get_total_pages(meta->mem_pool);
+alloced_pages = zs_get_total_pages(zram->mem_pool);
 update_used_max(zram, alloced_pages);

 if (zram->limit_pages && alloced_pages > zram->limit_pages) {
-zs_free(meta->mem_pool, handle);
-ret = -ENOMEM;
-goto out;
+zs_free(zram->mem_pool, handle);
+return -ENOMEM;
 }

-cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
+*out_handle = handle;
+*out_comp_len = comp_len;
+return 0;
+}

-if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
+static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
+{
+int ret;
+unsigned long handle;
+unsigned int comp_len;
+void *src, *dst;
+struct zcomp_strm *zstrm;
+struct page *page = bvec->bv_page;
+
+if (zram_same_page_write(zram, index, page))
+return 0;
+
+zstrm = zcomp_stream_get(zram->comp);
+ret = zram_compress(zram, &zstrm, page, &handle, &comp_len);
+if (ret) {
+zcomp_stream_put(zram->comp);
+return ret;
+}
+
+dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
+
+src = zstrm->buffer;
+if (comp_len == PAGE_SIZE)
 src = kmap_atomic(page);
-memcpy(cmem, src, PAGE_SIZE);
+memcpy(dst, src, comp_len);
+if (comp_len == PAGE_SIZE)
 kunmap_atomic(src);
-} else {
-memcpy(cmem, src, clen);
-}

 zcomp_stream_put(zram->comp);
-zstrm = NULL;
-zs_unmap_object(meta->mem_pool, handle);
+zs_unmap_object(zram->mem_pool, handle);

 /*
 * Free memory associated with this sector
 * before overwriting unused sectors.
 */
-bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+zram_slot_lock(zram, index);
 zram_free_page(zram, index);
-
-meta->table[index].handle = handle;
-zram_set_obj_size(meta, index, clen);
-bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+zram_set_handle(zram, index, handle);
+zram_set_obj_size(zram, index, comp_len);
+zram_slot_unlock(zram, index);

 /* Update stats */
-atomic64_add(clen, &zram->stats.compr_data_size);
+atomic64_add(comp_len, &zram->stats.compr_data_size);
 atomic64_inc(&zram->stats.pages_stored);
+return 0;
+}
+
+static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
+u32 index, int offset)
+{
+int ret;
+struct page *page = NULL;
+void *src;
+struct bio_vec vec;
+
+vec = *bvec;
+if (is_partial_io(bvec)) {
+void *dst;
+/*
+* This is a partial IO. We need to read the full page
+* before to write the changes.
+*/
+page = alloc_page(GFP_NOIO|__GFP_HIGHMEM);
+if (!page)
+return -ENOMEM;
+
+ret = zram_decompress_page(zram, page, index);
+if (ret)
+goto out;
+
+src = kmap_atomic(bvec->bv_page);
+dst = kmap_atomic(page);
+memcpy(dst + offset, src + bvec->bv_offset, bvec->bv_len);
+kunmap_atomic(dst);
+kunmap_atomic(src);
+
+vec.bv_page = page;
+vec.bv_len = PAGE_SIZE;
+vec.bv_offset = 0;
+}
+
+ret = __zram_bvec_write(zram, &vec, index);
 out:
-if (zstrm)
-zcomp_stream_put(zram->comp);
 if (is_partial_io(bvec))
-kfree(uncmem);
+__free_page(page);
 return ret;
 }
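The compression step now lives in zram_compress(), which keeps the old two-pass handle allocation: an opportunistic, non-sleeping zs_malloc() first, then on failure a sleeping GFP_NOIO retry counted in stats.writestall, with the compression redone because the per-CPU stream was dropped and re-acquired in between. A standalone toy model of that control flow (userspace sketch with hypothetical helpers, not kernel code):

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Models zs_malloc() with __GFP_KSWAPD_RECLAIM | __GFP_NOWARN: opportunistic,
     * may fail under pressure. Here it always fails to force the slow path. */
    static void *fast_alloc(size_t len) { (void)len; return NULL; }

    /* Models the sleeping GFP_NOIO zs_malloc() retry. */
    static void *slow_alloc(size_t len) { return malloc(len); }

    int main(void)
    {
        size_t comp_len;
        void *handle = NULL;
        int writestall = 0;

    compress_again:
        comp_len = 1024;        /* ...run the compressor into the stream buffer... */
        if (!handle) {
            handle = fast_alloc(comp_len);
            if (!handle) {
                writestall++;                  /* zram->stats.writestall */
                handle = slow_alloc(comp_len); /* may sleep; stream dropped and re-acquired */
                if (handle)
                    goto compress_again;       /* stream buffer may be stale: recompress */
                return 1;                      /* -ENOMEM */
            }
        }
        printf("page stored, writestalls=%d\n", writestall);
        free(handle);
        return 0;
    }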
@@ -758,7 +777,6 @@ static void zram_bio_discard(struct zram *zram, u32 index,
 int offset, struct bio *bio)
 {
 size_t n = bio->bi_iter.bi_size;
-struct zram_meta *meta = zram->meta;

 /*
 * zram manages data in physical block size units. Because logical block
@@ -779,9 +797,9 @@ static void zram_bio_discard(struct zram *zram, u32 index,
 }

 while (n >= PAGE_SIZE) {
-bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+zram_slot_lock(zram, index);
 zram_free_page(zram, index);
-bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+zram_slot_unlock(zram, index);
 atomic64_inc(&zram->stats.notify_free);
 index++;
 n -= PAGE_SIZE;
@@ -801,6 +819,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
 if (!is_write) {
 atomic64_inc(&zram->stats.num_reads);
 ret = zram_bvec_read(zram, bvec, index, offset);
+flush_dcache_page(bvec->bv_page);
 } else {
 atomic64_inc(&zram->stats.num_writes);
 ret = zram_bvec_write(zram, bvec, index, offset);
@@ -840,34 +859,21 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
 }

 bio_for_each_segment(bvec, bio, iter) {
-int max_transfer_size = PAGE_SIZE - offset;
-
-if (bvec.bv_len > max_transfer_size) {
-/*
-* zram_bvec_rw() can only make operation on a single
-* zram page. Split the bio vector.
-*/
-struct bio_vec bv;
-
-bv.bv_page = bvec.bv_page;
-bv.bv_len = max_transfer_size;
-bv.bv_offset = bvec.bv_offset;
+struct bio_vec bv = bvec;
+unsigned int unwritten = bvec.bv_len;

+do {
+bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
+unwritten);
 if (zram_bvec_rw(zram, &bv, index, offset,
 op_is_write(bio_op(bio))) < 0)
 goto out;

-bv.bv_len = bvec.bv_len - max_transfer_size;
-bv.bv_offset += max_transfer_size;
-if (zram_bvec_rw(zram, &bv, index + 1, 0,
-op_is_write(bio_op(bio))) < 0)
-goto out;
-} else
-if (zram_bvec_rw(zram, &bvec, index, offset,
-op_is_write(bio_op(bio))) < 0)
-goto out;
+bv.bv_offset += bv.bv_len;
+unwritten -= bv.bv_len;

-update_position(&index, &offset, &bvec);
+update_position(&index, &offset, &bv);
+} while (unwritten);
 }

 bio_endio(bio);
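__zram_make_request() now walks each bio segment with a do/while loop and min_t() instead of the old one-off split, so a segment of any length is handled the same way. The chunking arithmetic in isolation (standalone sketch with hypothetical values; update_position() is modelled inline):

    #include <stdio.h>

    #define PAGE_SIZE 4096u
    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    /* Split one bio segment (len bytes starting at byte `offset` inside zram
     * page `index`) into chunks that never cross a PAGE_SIZE boundary, the way
     * the new __zram_make_request() loop does with min_t(). */
    static void split_segment(unsigned int index, unsigned int offset, unsigned int len)
    {
        unsigned int unwritten = len;

        do {
            unsigned int chunk = MIN(PAGE_SIZE - offset, unwritten);

            printf("rw page %u, offset %u, len %u\n", index, offset, chunk);
            unwritten -= chunk;
            offset += chunk;
            if (offset == PAGE_SIZE) {  /* update_position(): advance to the next page */
                offset = 0;
                index++;
            }
        } while (unwritten);
    }

    int main(void)
    {
        split_segment(10, 3000, 10000); /* spans four zram pages */
        return 0;
    }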
@@ -884,8 +890,6 @@ static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio)
 {
 struct zram *zram = queue->queuedata;

-blk_queue_split(queue, &bio, queue->bio_split);
-
 if (!valid_io_request(zram, bio->bi_iter.bi_sector,
 bio->bi_iter.bi_size)) {
 atomic64_inc(&zram->stats.invalid_io);
@@ -904,14 +908,12 @@ static void zram_slot_free_notify(struct block_device *bdev,
 unsigned long index)
 {
 struct zram *zram;
-struct zram_meta *meta;

 zram = bdev->bd_disk->private_data;
-meta = zram->meta;

-bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+zram_slot_lock(zram, index);
 zram_free_page(zram, index);
-bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+zram_slot_unlock(zram, index);
 atomic64_inc(&zram->stats.notify_free);
 }
@@ -955,7 +957,6 @@ out:

 static void zram_reset_device(struct zram *zram)
 {
-struct zram_meta *meta;
 struct zcomp *comp;
 u64 disksize;

@@ -968,12 +969,8 @@ static void zram_reset_device(struct zram *zram)
 return;
 }

-meta = zram->meta;
 comp = zram->comp;
 disksize = zram->disksize;

-/* Reset stats */
-memset(&zram->stats, 0, sizeof(zram->stats));
 zram->disksize = 0;

 set_capacity(zram->disk, 0);
@@ -981,7 +978,8 @@ static void zram_reset_device(struct zram *zram)

 up_write(&zram->init_lock);
 /* I/O operation under all of CPU are done so let's free */
-zram_meta_free(meta, disksize);
+zram_meta_free(zram, disksize);
+memset(&zram->stats, 0, sizeof(zram->stats));
 zcomp_destroy(comp);
 }
@@ -990,7 +988,6 @@ static ssize_t disksize_store(struct device *dev,
 {
 u64 disksize;
 struct zcomp *comp;
-struct zram_meta *meta;
 struct zram *zram = dev_to_zram(dev);
 int err;

@@ -998,10 +995,18 @@ static ssize_t disksize_store(struct device *dev,
 if (!disksize)
 return -EINVAL;

+down_write(&zram->init_lock);
+if (init_done(zram)) {
+pr_info("Cannot change disksize for initialized device\n");
+err = -EBUSY;
+goto out_unlock;
+}
+
 disksize = PAGE_ALIGN(disksize);
-meta = zram_meta_alloc(zram->disk->disk_name, disksize);
-if (!meta)
-return -ENOMEM;
+if (!zram_meta_alloc(zram, disksize)) {
+err = -ENOMEM;
+goto out_unlock;
+}

 comp = zcomp_create(zram->compressor);
 if (IS_ERR(comp)) {
@@ -1011,14 +1016,6 @@ static ssize_t disksize_store(struct device *dev,
 goto out_free_meta;
 }

-down_write(&zram->init_lock);
-if (init_done(zram)) {
-pr_info("Cannot change disksize for initialized device\n");
-err = -EBUSY;
-goto out_destroy_comp;
-}
-
-zram->meta = meta;
 zram->comp = comp;
 zram->disksize = disksize;
 set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
@@ -1027,11 +1024,10 @@ static ssize_t disksize_store(struct device *dev,

 return len;

-out_destroy_comp:
-up_write(&zram->init_lock);
-zcomp_destroy(comp);
 out_free_meta:
-zram_meta_free(meta, disksize);
+zram_meta_free(zram, disksize);
+out_unlock:
+up_write(&zram->init_lock);
 return err;
 }
@@ -1193,8 +1189,6 @@ static int zram_add(void)
 blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
 blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
 zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
-zram->disk->queue->limits.max_sectors = SECTORS_PER_PAGE;
-zram->disk->queue->limits.chunk_sectors = 0;
 blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);

@@ -1219,7 +1213,6 @@ static int zram_add(void)
 goto out_free_disk;
 }
 strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
-zram->meta = NULL;

 pr_info("Added device: %s\n", zram->disk->disk_name);
 return device_id;
drivers/block/zram/zram_drv.h

@@ -92,13 +92,9 @@ struct zram_stats {
 atomic64_t writestall; /* no. of write slow paths */
 };

-struct zram_meta {
+struct zram {
 struct zram_table_entry *table;
 struct zs_pool *mem_pool;
-};
-
-struct zram {
-struct zram_meta *meta;
 struct zcomp *comp;
 struct gendisk *disk;
 /* Prevent concurrent execution of device init */
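With struct zram_meta folded into struct zram above, the small table accessors take the device directly. For reference, a sketch of the object-size accessors as they plausibly look after the change, assuming the size stays packed below ZRAM_FLAG_SHIFT in table[index].value (an assumption, not part of this hunk):

    /* Sketch only: assumed implementation, not copied from this patch. */
    static size_t zram_get_obj_size(struct zram *zram, u32 index)
    {
        return zram->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
    }

    static void zram_set_obj_size(struct zram *zram, u32 index, size_t size)
    {
        unsigned long flags = zram->table[index].value >> ZRAM_FLAG_SHIFT;

        zram->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
    }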
drivers/tty/sysrq.c

@@ -372,7 +372,7 @@ static void moom_callback(struct work_struct *ignored)

 mutex_lock(&oom_lock);
 if (!out_of_memory(&oc))
-pr_info("OOM request ignored because killer is disabled\n");
+pr_info("OOM request ignored. No task eligible\n");
 mutex_unlock(&oom_lock);
 }
fs/block_dev.c

@@ -103,12 +103,11 @@ void invalidate_bdev(struct block_device *bdev)
 {
 struct address_space *mapping = bdev->bd_inode->i_mapping;

-if (mapping->nrpages == 0)
-return;
-
-invalidate_bh_lrus();
-lru_add_drain_all();    /* make sure all lru add caches are flushed */
-invalidate_mapping_pages(mapping, 0, -1);
+if (mapping->nrpages) {
+invalidate_bh_lrus();
+lru_add_drain_all();    /* make sure all lru add caches are flushed */
+invalidate_mapping_pages(mapping, 0, -1);
+}

 /* 99% of the time, we don't need to flush the cleancache on the bdev.
 * But, for the strange corners, lets be cautious
 */
fs/iomap.c

@@ -887,16 +887,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 flags |= IOMAP_WRITE;
 }

-if (mapping->nrpages) {
 ret = filemap_write_and_wait_range(mapping, start, end);
 if (ret)
 goto out_free_dio;

 ret = invalidate_inode_pages2_range(mapping,
 start >> PAGE_SHIFT, end >> PAGE_SHIFT);
 WARN_ON_ONCE(ret);
 ret = 0;
-}

 inode_dio_begin(inode);

@@ -951,7 +949,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 * one is a pretty crazy thing to do, so we don't support it 100%. If
 * this invalidation fails, tough, the write still worked...
 */
-if (iov_iter_rw(iter) == WRITE && mapping->nrpages) {
+if (iov_iter_rw(iter) == WRITE) {
 int err = invalidate_inode_pages2_range(mapping,
 start >> PAGE_SHIFT, end >> PAGE_SHIFT);
 WARN_ON_ONCE(err);
fs/jbd2/journal.c

@@ -43,6 +43,7 @@
 #include <linux/backing-dev.h>
 #include <linux/bitops.h>
 #include <linux/ratelimit.h>
+#include <linux/sched/mm.h>

 #define CREATE_TRACE_POINTS
 #include <trace/events/jbd2.h>
@@ -205,6 +206,14 @@ static int kjournald2(void *arg)
 journal->j_task = current;
 wake_up(&journal->j_wait_done_commit);

+/*
+* Make sure that no allocations from this kernel thread will ever
+* recurse to the fs layer because we are responsible for the
+* transaction commit and any fs involvement might get stuck waiting for
+* the trasn. commit.
+*/
+memalloc_nofs_save();
+
 /*
 * And now, wait forever for commit wakeup events.
 */
fs/jbd2/transaction.c

@@ -29,6 +29,7 @@
 #include <linux/backing-dev.h>
 #include <linux/bug.h>
 #include <linux/module.h>
+#include <linux/sched/mm.h>

 #include <trace/events/jbd2.h>

@@ -388,6 +389,11 @@ repeat:

 rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_);
 jbd2_journal_free_transaction(new_transaction);
+/*
+* Ensure that no allocations done while the transaction is open are
+* going to recurse back to the fs layer.
+*/
+handle->saved_alloc_context = memalloc_nofs_save();
 return 0;
 }

@@ -466,6 +472,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
 trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
 handle->h_transaction->t_tid, type,
 line_no, nblocks);

 return handle;
 }
 EXPORT_SYMBOL(jbd2__journal_start);
@@ -1760,6 +1767,11 @@ int jbd2_journal_stop(handle_t *handle)
 if (handle->h_rsv_handle)
 jbd2_journal_free_reserved(handle->h_rsv_handle);
 free_and_exit:
+/*
+* Scope of the GFP_NOFS context is over here and so we can restore the
+* original alloc context.
+*/
+memalloc_nofs_restore(handle->saved_alloc_context);
 jbd2_free_handle(handle);
 return err;
 }
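Both jbd2 hunks use the new scoped-NOFS API: everything allocated between memalloc_nofs_save() and memalloc_nofs_restore() implicitly behaves as GFP_NOFS, so the commit thread and open transactions cannot recurse into the filesystem through reclaim. The pattern in its smallest form (kernel-style sketch, illustrative only, builds only in-tree):

    #include <linux/sched/mm.h>
    #include <linux/slab.h>

    static void *alloc_while_fs_is_locked(size_t len)
    {
        unsigned int nofs_flags;
        void *p;

        nofs_flags = memalloc_nofs_save();
        p = kmalloc(len, GFP_KERNEL);   /* effectively GFP_NOFS inside the scope */
        memalloc_nofs_restore(nofs_flags);
        return p;
    }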
fs/ocfs2/cluster/heartbeat.c

@@ -2242,13 +2242,13 @@ unlock:
 spin_unlock(&o2hb_live_lock);
 }

-static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item,
+static ssize_t o2hb_heartbeat_group_dead_threshold_show(struct config_item *item,
 char *page)
 {
 return sprintf(page, "%u\n", o2hb_dead_threshold);
 }

-static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item,
+static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *item,
 const char *page, size_t count)
 {
 unsigned long tmp;
@@ -2297,11 +2297,11 @@ static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item,

 }

-CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold);
+CONFIGFS_ATTR(o2hb_heartbeat_group_, dead_threshold);
 CONFIGFS_ATTR(o2hb_heartbeat_group_, mode);

 static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
-&o2hb_heartbeat_group_attr_threshold,
+&o2hb_heartbeat_group_attr_dead_threshold,
 &o2hb_heartbeat_group_attr_mode,
 NULL,
 };
fs/ocfs2/cluster/tcp.c

@@ -450,9 +450,8 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
 INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
 INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req);

-init_timer(&sc->sc_idle_timeout);
-sc->sc_idle_timeout.function = o2net_idle_timer;
-sc->sc_idle_timeout.data = (unsigned long)sc;
+setup_timer(&sc->sc_idle_timeout, o2net_idle_timer,
+(unsigned long)sc);

 sclog(sc, "alloced\n");

@@ -956,7 +955,7 @@ static void o2net_sendpage(struct o2net_sock_container *sc,
 mutex_lock(&sc->sc_send_lock);
 ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
 virt_to_page(kmalloced_virt),
-(long)kmalloced_virt & ~PAGE_MASK,
+offset_in_page(kmalloced_virt),
 size, MSG_DONTWAIT);
 mutex_unlock(&sc->sc_send_lock);
 if (ret == size)
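Two mechanical cleanups in the o2net code: setup_timer() replaces the open-coded init_timer()/function/data triple, and offset_in_page() replaces the manual mask. The mask identity is easy to verify in isolation (standalone sketch with a hypothetical address):

    #include <stdio.h>

    #define PAGE_SIZE 4096ul
    #define PAGE_MASK (~(PAGE_SIZE - 1))
    /* offset_in_page() is exactly the old open-coded expression */
    #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)

    int main(void)
    {
        unsigned long addr = 0x12345abcul;

        printf("%lu %lu\n", offset_in_page(addr), addr & ~PAGE_MASK);  /* both 2748 */
        return 0;
    }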
fs/proc/task_mmu.c

@@ -441,6 +441,7 @@ struct mem_size_stats {
 unsigned long private_dirty;
 unsigned long referenced;
 unsigned long anonymous;
+unsigned long lazyfree;
 unsigned long anonymous_thp;
 unsigned long shmem_thp;
 unsigned long swap;
@@ -457,8 +458,11 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
 int i, nr = compound ? 1 << compound_order(page) : 1;
 unsigned long size = nr * PAGE_SIZE;

-if (PageAnon(page))
+if (PageAnon(page)) {
 mss->anonymous += size;
+if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
+mss->lazyfree += size;
+}

 mss->resident += size;
 /* Accumulate the size in pages that have been accessed. */
@@ -771,6 +775,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 "Private_Dirty: %8lu kB\n"
 "Referenced: %8lu kB\n"
 "Anonymous: %8lu kB\n"
+"LazyFree: %8lu kB\n"
 "AnonHugePages: %8lu kB\n"
 "ShmemPmdMapped: %8lu kB\n"
 "Shared_Hugetlb: %8lu kB\n"
@@ -789,6 +794,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 mss.private_dirty >> 10,
 mss.referenced >> 10,
 mss.anonymous >> 10,
+mss.lazyfree >> 10,
 mss.anonymous_thp >> 10,
 mss.shmem_thp >> 10,
 mss.shared_hugetlb >> 10,
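The new LazyFree line counts anonymous pages handed back with madvise(MADV_FREE) that have not been redirtied, which is exactly the PageAnon && !PageSwapBacked && !dirty test added above. A small userspace check, assuming a kernel that carries this series (the awk pipeline is just one way to sum the field):

    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t len = 64 << 20;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
            return 1;

        memset(p, 0xaa, len);           /* fault the pages in and dirty them */
        madvise(p, len, MADV_FREE);     /* mark them lazily freeable */

        /* should report roughly 65536 kB until reclaim or a new write */
        system("awk '/^LazyFree/ {s += $2} END {print s \" kB LazyFree\"}' /proc/self/smaps");

        munmap(p, len);
        return 0;
    }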
fs/xfs/kmem.c

@@ -48,7 +48,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
 void *
 kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
 {
-unsigned noio_flag = 0;
+unsigned nofs_flag = 0;
 void *ptr;
 gfp_t lflags;

@@ -60,17 +60,17 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
 * __vmalloc() will allocate data pages and auxillary structures (e.g.
 * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
 * here. Hence we need to tell memory reclaim that we are in such a
-* context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
+* context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering
 * the filesystem here and potentially deadlocking.
 */
-if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
-noio_flag = memalloc_noio_save();
+if (flags & KM_NOFS)
+nofs_flag = memalloc_nofs_save();

 lflags = kmem_flags_convert(flags);
 ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);

-if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
-memalloc_noio_restore(noio_flag);
+if (flags & KM_NOFS)
+memalloc_nofs_restore(nofs_flag);

 return ptr;
 }
fs/xfs/kmem.h

@@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
 lflags = GFP_ATOMIC | __GFP_NOWARN;
 } else {
 lflags = GFP_KERNEL | __GFP_NOWARN;
-if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+if (flags & KM_NOFS)
 lflags &= ~__GFP_FS;
 }
fs/xfs/libxfs/xfs_btree.c

@@ -2886,7 +2886,7 @@ xfs_btree_split_worker(
 struct xfs_btree_split_args *args = container_of(work,
 struct xfs_btree_split_args, work);
 unsigned long pflags;
-unsigned long new_pflags = PF_FSTRANS;
+unsigned long new_pflags = PF_MEMALLOC_NOFS;

 /*
 * we are in a transaction context here, but may also be doing work
fs/xfs/xfs_aops.c

@@ -189,7 +189,7 @@ xfs_setfilesize_trans_alloc(
 * We hand off the transaction to the completion thread now, so
 * clear the flag here.
 */
-current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 return 0;
 }

@@ -252,7 +252,7 @@ xfs_setfilesize_ioend(
 * thus we need to mark ourselves as being in a transaction manually.
 * Similarly for freeze protection.
 */
-current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);

 /* we abort the update if there was an IO error */
@@ -1016,7 +1016,7 @@ xfs_do_writepage(
 * Given that we do not allow direct reclaim to call us, we should
 * never be called while in a filesystem transaction.
 */
-if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
+if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
 goto redirty;

 /*
fs/xfs/xfs_buf.c

@@ -443,17 +443,17 @@ _xfs_buf_map_pages(
 bp->b_addr = NULL;
 } else {
 int retried = 0;
-unsigned noio_flag;
+unsigned nofs_flag;

 /*
 * vm_map_ram() will allocate auxillary structures (e.g.
 * pagetables) with GFP_KERNEL, yet we are likely to be under
 * GFP_NOFS context here. Hence we need to tell memory reclaim
-* that we are in such a context via PF_MEMALLOC_NOIO to prevent
+* that we are in such a context via PF_MEMALLOC_NOFS to prevent
 * memory reclaim re-entering the filesystem here and
 * potentially deadlocking.
 */
-noio_flag = memalloc_noio_save();
+nofs_flag = memalloc_nofs_save();
 do {
 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
 -1, PAGE_KERNEL);
@@ -461,7 +461,7 @@ _xfs_buf_map_pages(
 break;
 vm_unmap_aliases();
 } while (retried++ <= 1);
-memalloc_noio_restore(noio_flag);
+memalloc_nofs_restore(nofs_flag);

 if (!bp->b_addr)
 return -ENOMEM;
fs/xfs/xfs_trans.c

@@ -134,7 +134,7 @@ xfs_trans_reserve(
 bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;

 /* Mark this thread as being in a transaction */
-current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);

 /*
 * Attempt to reserve the needed disk blocks by decrementing
@@ -144,7 +144,7 @@ xfs_trans_reserve(
 if (blocks > 0) {
 error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
 if (error != 0) {
-current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 return -ENOSPC;
 }
 tp->t_blk_res += blocks;
@@ -221,7 +221,7 @@ undo_blocks:
 tp->t_blk_res = 0;
 }

-current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);

 return error;
 }
@@ -914,7 +914,7 @@ __xfs_trans_commit(

 xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);

-current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 xfs_trans_free(tp);

 /*
@@ -944,7 +944,7 @@ out_unreserve:
 if (commit_lsn == -1 && !error)
 error = -EIO;
 }
-current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
 xfs_trans_free(tp);

@@ -998,7 +998,7 @@ xfs_trans_cancel(
 xfs_log_done(mp, tp->t_ticket, NULL, false);

 /* mark this thread as no longer being in a transaction */
-current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);

 xfs_trans_free_items(tp, NULLCOMMITLSN, dirty);
 xfs_trans_free(tp);
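All the XFS hunks swap the private PF_FSTRANS task flag for PF_MEMALLOC_NOFS, which is the flag the page allocator itself now honours. That is why kmem_flags_convert() and kmem_zalloc_large() no longer need to test current->flags: the generic allocation path strips __GFP_FS for them. The allocator-side helper works approximately like this (sketch of the scoped-gfp masking, not part of these hunks):

    /* approximate shape of current_gfp_context() from linux/sched/mm.h */
    static inline gfp_t current_gfp_context(gfp_t flags)
    {
        if (unlikely(current->flags & PF_MEMALLOC_NOIO))
            flags &= ~(__GFP_IO | __GFP_FS);
        else if (unlikely(current->flags & PF_MEMALLOC_NOFS))
            flags &= ~__GFP_FS;
        return flags;
    }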
include/linux/gfp.h

@@ -40,6 +40,11 @@ struct vm_area_struct;
 #define ___GFP_DIRECT_RECLAIM 0x400000u
 #define ___GFP_WRITE 0x800000u
 #define ___GFP_KSWAPD_RECLAIM 0x1000000u
+#ifdef CONFIG_LOCKDEP
+#define ___GFP_NOLOCKDEP 0x4000000u
+#else
+#define ___GFP_NOLOCKDEP 0
+#endif
 /* If the above are modified, __GFP_BITS_SHIFT may need updating */

 /*
@@ -179,8 +184,11 @@ struct vm_area_struct;
 #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK)
 #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)

+/* Disable lockdep for GFP context tracking */
+#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
+
 /* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT 25
+#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP))
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))

 /*
@@ -202,8 +210,16 @@ struct vm_area_struct;
 *
 * GFP_NOIO will use direct reclaim to discard clean pages or slab pages
 *   that do not require the starting of any physical IO.
+*   Please try to avoid using this flag directly and instead use
+*   memalloc_noio_{save,restore} to mark the whole scope which cannot
+*   perform any IO with a short explanation why. All allocation requests
+*   will inherit GFP_NOIO implicitly.
 *
 * GFP_NOFS will use direct reclaim but will not use any filesystem interfaces.
+*   Please try to avoid using this flag directly and instead use
+*   memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't
+*   recurse into the FS layer with a short explanation why. All allocation
+*   requests will inherit GFP_NOFS implicitly.
 *
 * GFP_USER is for userspace allocations that also need to be directly
 *   accessibly by the kernel or hardware. It is typically used by hardware
@@ -297,8 +313,8 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)

 /*
 * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the
-* zone to use given the lowest 4 bits of gfp_t. Entries are ZONE_SHIFT long
-* and there are 16 of them to cover all possible combinations of
+* zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT
+* bits long and there are 16 of them to cover all possible combinations of
 * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM.
 *
 * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA.
include/linux/jbd2.h

@@ -491,6 +491,8 @@ struct jbd2_journal_handle

 unsigned long h_start_jiffies;
 unsigned int h_requested_credits;
+
+unsigned int saved_alloc_context;
 };
include/linux/ksm.h

@@ -61,7 +61,7 @@ static inline void set_page_stable_node(struct page *page,
 struct page *ksm_might_need_to_copy(struct page *page,
 struct vm_area_struct *vma, unsigned long address);

-int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
+void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
 void ksm_migrate_page(struct page *newpage, struct page *oldpage);

 #else  /* !CONFIG_KSM */
@@ -94,10 +94,9 @@ static inline int page_referenced_ksm(struct page *page,
 return 0;
 }

-static inline int rmap_walk_ksm(struct page *page,
+static inline void rmap_walk_ksm(struct page *page,
 struct rmap_walk_control *rwc)
 {
-return 0;
 }

 static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
include/linux/memcontrol.h

@@ -35,48 +35,43 @@ struct page;
 struct mm_struct;
 struct kmem_cache;

-/*
-* The corresponding mem_cgroup_stat_names is defined in mm/memcontrol.c,
-* These two lists should keep in accord with each other.
-*/
-enum mem_cgroup_stat_index {
-/*
-* For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
-*/
-MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
-MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
-MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */
-MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
-MEM_CGROUP_STAT_DIRTY, /* # of dirty pages in page cache */
-MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */
-MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
-MEM_CGROUP_STAT_NSTATS,
-/* default hierarchy stats */
-MEMCG_KERNEL_STACK_KB = MEM_CGROUP_STAT_NSTATS,
+/* Cgroup-specific page state, on top of universal node page state */
+enum memcg_stat_item {
+MEMCG_CACHE = NR_VM_NODE_STAT_ITEMS,
+MEMCG_RSS,
+MEMCG_RSS_HUGE,
+MEMCG_SWAP,
+MEMCG_SOCK,
+/* XXX: why are these zone and not node counters? */
+MEMCG_KERNEL_STACK_KB,
 MEMCG_SLAB_RECLAIMABLE,
 MEMCG_SLAB_UNRECLAIMABLE,
-MEMCG_SOCK,
 MEMCG_NR_STAT,
 };

+/* Cgroup-specific events, on top of universal VM events */
+enum memcg_event_item {
+MEMCG_LOW = NR_VM_EVENT_ITEMS,
+MEMCG_HIGH,
+MEMCG_MAX,
+MEMCG_OOM,
+MEMCG_NR_EVENTS,
+};
+
 struct mem_cgroup_reclaim_cookie {
 pg_data_t *pgdat;
 int priority;
 unsigned int generation;
 };

-enum mem_cgroup_events_index {
-MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
-MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
-MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
-MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
-MEM_CGROUP_EVENTS_NSTATS,
-/* default hierarchy events */
-MEMCG_LOW = MEM_CGROUP_EVENTS_NSTATS,
-MEMCG_HIGH,
-MEMCG_MAX,
-MEMCG_OOM,
-MEMCG_NR_EVENTS,
-};
+#ifdef CONFIG_MEMCG
+
+#define MEM_CGROUP_ID_SHIFT 16
+#define MEM_CGROUP_ID_MAX USHRT_MAX
+
+struct mem_cgroup_id {
+int id;
+atomic_t ref;
+};

 /*
@@ -92,16 +87,6 @@ enum mem_cgroup_events_target {
 MEM_CGROUP_NTARGETS,
 };

-#ifdef CONFIG_MEMCG
-
-#define MEM_CGROUP_ID_SHIFT 16
-#define MEM_CGROUP_ID_MAX USHRT_MAX
-
-struct mem_cgroup_id {
-int id;
-atomic_t ref;
-};
-
 struct mem_cgroup_stat_cpu {
 long count[MEMCG_NR_STAT];
 unsigned long events[MEMCG_NR_EVENTS];
@@ -283,17 +268,10 @@ static inline bool mem_cgroup_disabled(void)
 return !cgroup_subsys_enabled(memory_cgrp_subsys);
 }

-/**
-* mem_cgroup_events - count memory events against a cgroup
-* @memcg: the memory cgroup
-* @idx: the event index
-* @nr: the number of events to account for
-*/
-static inline void mem_cgroup_events(struct mem_cgroup *memcg,
-enum mem_cgroup_events_index idx,
-unsigned int nr)
+static inline void mem_cgroup_event(struct mem_cgroup *memcg,
+enum memcg_event_item event)
 {
-this_cpu_add(memcg->stat->events[idx], nr);
+this_cpu_inc(memcg->stat->events[event]);
 cgroup_file_notify(&memcg->events_file);
 }

@@ -494,8 +472,42 @@ extern int do_swap_account;
 void lock_page_memcg(struct page *page);
 void unlock_page_memcg(struct page *page);

+static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
+enum memcg_stat_item idx)
+{
+long val = 0;
+int cpu;
+
+for_each_possible_cpu(cpu)
+val += per_cpu(memcg->stat->count[idx], cpu);
+
+if (val < 0)
+val = 0;
+
+return val;
+}
+
+static inline void mod_memcg_state(struct mem_cgroup *memcg,
+enum memcg_stat_item idx, int val)
+{
+if (!mem_cgroup_disabled())
+this_cpu_add(memcg->stat->count[idx], val);
+}
+
+static inline void inc_memcg_state(struct mem_cgroup *memcg,
+enum memcg_stat_item idx)
+{
+mod_memcg_state(memcg, idx, 1);
+}
+
+static inline void dec_memcg_state(struct mem_cgroup *memcg,
+enum memcg_stat_item idx)
+{
+mod_memcg_state(memcg, idx, -1);
+}
+
 /**
-* mem_cgroup_update_page_stat - update page state statistics
+* mod_memcg_page_state - update page state statistics
 * @page: the page
 * @idx: page state item to account
 * @val: number of pages (positive or negative)
@@ -506,28 +518,28 @@ void unlock_page_memcg(struct page *page);
 *
 *   lock_page(page) or lock_page_memcg(page)
 *   if (TestClearPageState(page))
-*     mem_cgroup_update_page_stat(page, state, -1);
+*     mod_memcg_page_state(page, state, -1);
 *   unlock_page(page) or unlock_page_memcg(page)
+*
+* Kernel pages are an exception to this, since they'll never move.
 */
-static inline void mem_cgroup_update_page_stat(struct page *page,
-enum mem_cgroup_stat_index idx, int val)
+static inline void mod_memcg_page_state(struct page *page,
+enum memcg_stat_item idx, int val)
 {
-VM_BUG_ON(!(rcu_read_lock_held() || PageLocked(page)));
-
 if (page->mem_cgroup)
-this_cpu_add(page->mem_cgroup->stat->count[idx], val);
+mod_memcg_state(page->mem_cgroup, idx, val);
 }

-static inline void mem_cgroup_inc_page_stat(struct page *page,
-enum mem_cgroup_stat_index idx)
+static inline void inc_memcg_page_state(struct page *page,
+enum memcg_stat_item idx)
 {
-mem_cgroup_update_page_stat(page, idx, 1);
+mod_memcg_page_state(page, idx, 1);
 }

-static inline void mem_cgroup_dec_page_stat(struct page *page,
-enum mem_cgroup_stat_index idx)
+static inline void dec_memcg_page_state(struct page *page,
+enum memcg_stat_item idx)
 {
-mem_cgroup_update_page_stat(page, idx, -1);
+mod_memcg_page_state(page, idx, -1);
 }

 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
@@ -544,20 +556,8 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,

 rcu_read_lock();
 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
-if (unlikely(!memcg))
-goto out;
-
-switch (idx) {
-case PGFAULT:
-this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
-break;
-case PGMAJFAULT:
-this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
-break;
-default:
-BUG();
-}
-out:
+if (likely(memcg))
+this_cpu_inc(memcg->stat->events[idx]);
 rcu_read_unlock();
 }
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -576,9 +576,8 @@ static inline bool mem_cgroup_disabled(void)
 return true;
 }

-static inline void mem_cgroup_events(struct mem_cgroup *memcg,
-enum mem_cgroup_events_index idx,
-unsigned int nr)
+static inline void mem_cgroup_event(struct mem_cgroup *memcg,
+enum memcg_event_item event)
 {
 }

@@ -740,19 +739,41 @@ static inline bool mem_cgroup_oom_synchronize(bool wait)
 return false;
 }

-static inline void mem_cgroup_update_page_stat(struct page *page,
-enum mem_cgroup_stat_index idx,
-int nr)
+static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
+enum memcg_stat_item idx)
+{
+return 0;
+}
+
+static inline void mod_memcg_state(struct mem_cgroup *memcg,
+enum memcg_stat_item idx,
+int nr)
 {
 }

-static inline void mem_cgroup_inc_page_stat(struct page *page,
-enum mem_cgroup_stat_index idx)
+static inline void inc_memcg_state(struct mem_cgroup *memcg,
+enum memcg_stat_item idx)
 {
 }

-static inline void mem_cgroup_dec_page_stat(struct page *page,
-enum mem_cgroup_stat_index idx)
+static inline void dec_memcg_state(struct mem_cgroup *memcg,
+enum memcg_stat_item idx)
+{
+}
+
+static inline void mod_memcg_page_state(struct page *page,
+enum memcg_stat_item idx,
+int nr)
+{
+}
+
+static inline void inc_memcg_page_state(struct page *page,
+enum memcg_stat_item idx)
+{
+}
+
+static inline void dec_memcg_page_state(struct page *page,
+enum memcg_stat_item idx)
 {
 }

@@ -872,7 +893,7 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
 * @val: number of pages (positive or negative)
 */
 static inline void memcg_kmem_update_page_stat(struct page *page,
-enum mem_cgroup_stat_index idx, int val)
+enum memcg_stat_item idx, int val)
 {
 if (memcg_kmem_enabled() && page->mem_cgroup)
 this_cpu_add(page->mem_cgroup->stat->count[idx], val);
@@ -901,7 +922,7 @@ static inline void memcg_put_cache_ids(void)
 }

 static inline void memcg_kmem_update_page_stat(struct page *page,
-enum mem_cgroup_stat_index idx, int val)
+enum memcg_stat_item idx, int val)
 {
 }
 #endif /* CONFIG_MEMCG && !CONFIG_SLOB */
@@ -33,8 +33,9 @@ extern char *migrate_reason_names[MR_TYPES];
 #ifdef CONFIG_MIGRATION
 
 extern void putback_movable_pages(struct list_head *l);
-extern int migrate_page(struct address_space *,
-			struct page *, struct page *, enum migrate_mode);
+extern int migrate_page(struct address_space *mapping,
+			struct page *newpage, struct page *page,
+			enum migrate_mode mode);
 extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
 		unsigned long private, enum migrate_mode mode, int reason);
 extern int isolate_movable_page(struct page *page, isolate_mode_t mode);

@@ -2487,7 +2487,6 @@ extern long copy_huge_page_from_user(struct page *dst_page,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
 
 extern struct page_ext_operations debug_guardpage_ops;
-extern struct page_ext_operations page_poisoning_ops;
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
 extern unsigned int _debug_guardpage_minorder;

@@ -35,7 +35,7 @@
  */
 #define PAGE_ALLOC_COSTLY_ORDER 3
 
-enum {
+enum migratetype {
 	MIGRATE_UNMOVABLE,
 	MIGRATE_MOVABLE,
 	MIGRATE_RECLAIMABLE,
@@ -149,7 +149,6 @@ enum node_stat_item {
 	NR_UNEVICTABLE,		/*  "     "     "   "       "         */
 	NR_ISOLATED_ANON,	/* Temporary isolated pages from anon lru */
 	NR_ISOLATED_FILE,	/* Temporary isolated pages from file lru */
-	NR_PAGES_SCANNED,	/* pages scanned since last reclaim */
 	WORKINGSET_REFAULT,
 	WORKINGSET_ACTIVATE,
 	WORKINGSET_NODERECLAIM,
@@ -226,6 +225,8 @@ struct lruvec {
 	struct zone_reclaim_stat	reclaim_stat;
 	/* Evictions & activations on the inactive file list */
 	atomic_long_t			inactive_age;
+	/* Refaults at the time of last reclaim cycle */
+	unsigned long			refaults;
 #ifdef CONFIG_MEMCG
 	struct pglist_data *pgdat;
 #endif
@@ -630,6 +631,8 @@ typedef struct pglist_data {
 	int kswapd_order;
 	enum zone_type kswapd_classzone_idx;
 
+	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
+
 #ifdef CONFIG_COMPACTION
 	int kcompactd_max_order;
 	enum zone_type kcompactd_classzone_idx;
@@ -83,19 +83,17 @@ struct anon_vma_chain {
 };
 
 enum ttu_flags {
-	TTU_UNMAP = 1,			/* unmap mode */
-	TTU_MIGRATION = 2,		/* migration mode */
-	TTU_MUNLOCK = 4,		/* munlock mode */
-	TTU_LZFREE = 8,			/* lazy free mode */
-	TTU_SPLIT_HUGE_PMD = 16,	/* split huge PMD if any */
-
-	TTU_IGNORE_MLOCK = (1 << 8),	/* ignore mlock */
-	TTU_IGNORE_ACCESS = (1 << 9),	/* don't age */
-	TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
-	TTU_BATCH_FLUSH = (1 << 11),	/* Batch TLB flushes where possible
+	TTU_MIGRATION		= 0x1,	/* migration mode */
+	TTU_MUNLOCK		= 0x2,	/* munlock mode */
+	TTU_SPLIT_HUGE_PMD	= 0x4,	/* split huge PMD if any */
+	TTU_IGNORE_MLOCK	= 0x8,	/* ignore mlock */
+	TTU_IGNORE_ACCESS	= 0x10,	/* don't age */
+	TTU_IGNORE_HWPOISON	= 0x20,	/* corrupted page is recoverable */
+	TTU_BATCH_FLUSH		= 0x40,	/* Batch TLB flushes where possible
 					 * and caller guarantees they will
 					 * do a final flush if necessary */
-	TTU_RMAP_LOCKED = (1 << 12)	/* do not grab rmap lock:
+	TTU_RMAP_LOCKED		= 0x80	/* do not grab rmap lock:
 					 * caller holds it */
 };
 
@@ -193,9 +191,7 @@ static inline void page_dup_rmap(struct page *page, bool compound)
 int page_referenced(struct page *, int is_locked,
 			struct mem_cgroup *memcg, unsigned long *vm_flags);
 
-#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
-
-int try_to_unmap(struct page *, enum ttu_flags flags);
+bool try_to_unmap(struct page *, enum ttu_flags flags);
 
 /* Avoid racy checks */
 #define PVMW_SYNC		(1 << 0)
@@ -239,7 +235,7 @@ int page_mkclean(struct page *);
  * called in munlock()/munmap() path to check for other vmas holding
  * the page mlocked.
  */
-int try_to_munlock(struct page *);
+void try_to_munlock(struct page *);
 
 void remove_migration_ptes(struct page *old, struct page *new, bool locked);
 
@@ -261,15 +257,19 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
  */
 struct rmap_walk_control {
 	void *arg;
-	int (*rmap_one)(struct page *page, struct vm_area_struct *vma,
+	/*
+	 * Return false if page table scanning in rmap_walk should be stopped.
+	 * Otherwise, return true.
+	 */
+	bool (*rmap_one)(struct page *page, struct vm_area_struct *vma,
 					unsigned long addr, void *arg);
 	int (*done)(struct page *page);
 	struct anon_vma *(*anon_lock)(struct page *page);
 	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
 };
 
-int rmap_walk(struct page *page, struct rmap_walk_control *rwc);
-int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc);
+void rmap_walk(struct page *page, struct rmap_walk_control *rwc);
+void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc);
 
 #else	/* !CONFIG_MMU */
 
@@ -285,7 +285,7 @@ static inline int page_referenced(struct page *page, int is_locked,
 	return 0;
 }
 
-#define try_to_unmap(page, refs) SWAP_FAIL
+#define try_to_unmap(page, refs) false
 
 static inline int page_mkclean(struct page *page)
 {
@@ -295,13 +295,4 @@ static inline int page_mkclean(struct page *page)
 
 #endif	/* CONFIG_MMU */
 
-/*
- * Return values of try_to_unmap
- */
-#define SWAP_SUCCESS	0
-#define SWAP_AGAIN	1
-#define SWAP_FAIL	2
-#define SWAP_MLOCK	3
-#define SWAP_LZFREE	4
-
 #endif /* _LINUX_RMAP_H */
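[Note, not part of the patch: the rmap_walk_control change above makes the per-VMA callback return a bool instead of a SWAP_* code, and rmap_walk() itself now returns void. A minimal sketch of a walker written against the new convention follows; the callback, the counter it keeps, and the wrapper function are hypothetical, only the struct fields and the rmap_walk() signature come from the header above.]

static bool my_rmap_one(struct page *page, struct vm_area_struct *vma,
			unsigned long addr, void *arg)
{
	unsigned long *nr_mappings = arg;	/* hypothetical per-walk counter */

	(*nr_mappings)++;
	return true;	/* keep scanning; return false would stop the walk */
}

static unsigned long count_mappings(struct page *page)	/* hypothetical */
{
	unsigned long nr_mappings = 0;
	struct rmap_walk_control rwc = {
		.arg = &nr_mappings,
		.rmap_one = my_rmap_one,
	};

	rmap_walk(page, &rwc);	/* returns void after this series */
	return nr_mappings;
}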
@@ -14,7 +14,6 @@
 #define _RODATA_TEST_H
 
 #ifdef CONFIG_DEBUG_RODATA_TEST
-extern const int rodata_test_data;
 void rodata_test(void);
 #else
 static inline void rodata_test(void) {}

@@ -1224,9 +1224,9 @@ extern struct pid *cad_pid;
 #define PF_USED_ASYNC		0x00004000	/* Used async_schedule*(), used by module init */
 #define PF_NOFREEZE		0x00008000	/* This thread should not be frozen */
 #define PF_FROZEN		0x00010000	/* Frozen for system suspend */
-#define PF_FSTRANS		0x00020000	/* Inside a filesystem transaction */
-#define PF_KSWAPD		0x00040000	/* I am kswapd */
-#define PF_MEMALLOC_NOIO	0x00080000	/* Allocating memory without IO involved */
+#define PF_KSWAPD		0x00020000	/* I am kswapd */
+#define PF_MEMALLOC_NOFS	0x00040000	/* All allocation requests will inherit GFP_NOFS */
+#define PF_MEMALLOC_NOIO	0x00080000	/* All allocation requests will inherit GFP_NOIO */
 #define PF_LESS_THROTTLE	0x00100000	/* Throttle me less: I clean memory */
 #define PF_KTHREAD		0x00200000	/* I am a kernel thread */
 #define PF_RANDOMIZE		0x00400000	/* Randomize virtual address space */

@@ -149,13 +149,21 @@ static inline bool in_vfork(struct task_struct *tsk)
 	return ret;
 }
 
-/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags
- * __GFP_FS is also cleared as it implies __GFP_IO.
+/*
+ * Applies per-task gfp context to the given allocation flags.
+ * PF_MEMALLOC_NOIO implies GFP_NOIO
+ * PF_MEMALLOC_NOFS implies GFP_NOFS
  */
-static inline gfp_t memalloc_noio_flags(gfp_t flags)
+static inline gfp_t current_gfp_context(gfp_t flags)
 {
+	/*
+	 * NOIO implies both NOIO and NOFS and it is a weaker context
+	 * so always make sure it makes precendence
+	 */
 	if (unlikely(current->flags & PF_MEMALLOC_NOIO))
 		flags &= ~(__GFP_IO | __GFP_FS);
+	else if (unlikely(current->flags & PF_MEMALLOC_NOFS))
+		flags &= ~__GFP_FS;
 	return flags;
 }
 
@@ -171,4 +179,16 @@ static inline void memalloc_noio_restore(unsigned int flags)
 	current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
 }
 
+static inline unsigned int memalloc_nofs_save(void)
+{
+	unsigned int flags = current->flags & PF_MEMALLOC_NOFS;
+	current->flags |= PF_MEMALLOC_NOFS;
+	return flags;
+}
+
+static inline void memalloc_nofs_restore(unsigned int flags)
+{
+	current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
+}
+
 #endif /* _LINUX_SCHED_MM_H */
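[Note, not part of the patch: the memalloc_nofs_save()/memalloc_nofs_restore() pair added above gives callers a scoped way to mark a region as GFP_NOFS; while PF_MEMALLOC_NOFS is set, current_gfp_context() strips __GFP_FS from whatever mask the allocation passes. A minimal sketch of the intended usage; the surrounding function and do_allocations() are hypothetical, only the save/restore helpers come from the hunk above.]

static void fs_do_transaction_work(void)	/* hypothetical caller */
{
	unsigned int nofs_flags;

	nofs_flags = memalloc_nofs_save();	/* enter the NOFS scope */
	/*
	 * Any allocation in here, even one requesting GFP_KERNEL, is
	 * treated as GFP_NOFS because current_gfp_context() clears
	 * __GFP_FS while PF_MEMALLOC_NOFS is set on the task.
	 */
	do_allocations();			/* hypothetical work */
	memalloc_nofs_restore(nofs_flags);	/* leave the NOFS scope */
}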
@@ -279,7 +279,7 @@ extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_all(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void deactivate_file_page(struct page *page);
-extern void deactivate_page(struct page *page);
+extern void mark_page_lazyfree(struct page *page);
 extern void swap_setup(void);
 
 extern void add_page_to_unevictable_list(struct page *page);
@@ -411,9 +411,6 @@ struct backing_dev_info;
 extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
 extern void exit_swap_address_space(unsigned int type);
 
-extern int get_swap_slots(int n, swp_entry_t *slots);
-extern void swapcache_free_batch(swp_entry_t *entries, int n);
-
 #else /* CONFIG_SWAP */
 
 #define swap_address_space(entry)		(NULL)
@@ -25,7 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		FOR_ALL_ZONES(PGALLOC),
 		FOR_ALL_ZONES(ALLOCSTALL),
 		FOR_ALL_ZONES(PGSCAN_SKIP),
-		PGFREE, PGACTIVATE, PGDEACTIVATE,
+		PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE,
 		PGFAULT, PGMAJFAULT,
 		PGLAZYFREED,
 		PGREFILL,
@@ -30,6 +30,7 @@
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/task.h>
+#include <linux/sched/mm.h>
 #include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/proc_fs.h>
@@ -2876,6 +2877,8 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
 	if (unlikely(!debug_locks))
 		return;
 
+	gfp_mask = current_gfp_context(gfp_mask);
+
 	/* no reclaim without waiting on it */
 	if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
 		return;
@@ -2885,7 +2888,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
 		return;
 
 	/* We're only interested __GFP_FS allocations for now */
-	if (!(gfp_mask & __GFP_FS))
+	if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS))
 		return;
 
 	/*
@@ -2894,6 +2897,10 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
 	if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
 		return;
 
+	/* Disable lockdep if explicitly requested */
+	if (gfp_mask & __GFP_NOLOCKDEP)
+		return;
+
 	mark_held_locks(curr, RECLAIM_FS);
 }
 
@@ -3947,7 +3954,7 @@ EXPORT_SYMBOL_GPL(lock_unpin_lock);
 
 void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
 {
-	current->lockdep_reclaim_gfp = gfp_mask;
+	current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask);
 }
 EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state);
@@ -942,21 +942,17 @@ static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry)
 	unsigned long flags;
 	int count = 0, i;
 
-	local_irq_save(flags);
-
 	for (i = 0; i < HASH_SIZE; ++i) {
-		spin_lock(&dma_entry_hash[i].lock);
+		spin_lock_irqsave(&dma_entry_hash[i].lock, flags);
 		list_for_each_entry(entry, &dma_entry_hash[i].list, list) {
 			if (entry->dev == dev) {
 				count += 1;
 				*out_entry = entry;
 			}
 		}
-		spin_unlock(&dma_entry_hash[i].lock);
+		spin_unlock_irqrestore(&dma_entry_hash[i].lock, flags);
 	}
 
-	local_irq_restore(flags);
-
 	return count;
 }
@@ -2284,6 +2284,8 @@ static int radix_tree_cpu_dead(unsigned int cpu)
 void __init radix_tree_init(void)
 {
 	int ret;
 
+	BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32);
 	radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
 			sizeof(struct radix_tree_node), 0,
 			SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
@@ -42,7 +42,6 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT
 
 config PAGE_POISONING
 	bool "Poison pages after freeing"
-	select PAGE_EXTENSION
 	select PAGE_POISONING_NO_SANITY if HIBERNATION
 	---help---
 	  Fill the pages with poison patterns after free_pages() and verify
@@ -992,9 +992,6 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
 static bool suitable_migration_target(struct compact_control *cc,
 							struct page *page)
 {
-	if (cc->ignore_block_suitable)
-		return true;
-
 	/* If the page is a large free page, then disallow migration */
 	if (PageBuddy(page)) {
 		/*
@@ -1006,6 +1003,9 @@ static bool suitable_migration_target(struct compact_control *cc,
 			return false;
 	}
 
+	if (cc->ignore_block_suitable)
+		return true;
+
 	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
 	if (migrate_async_suitable(get_pageblock_migratetype(page)))
 		return true;
42	mm/filemap.c

@@ -2204,12 +2204,12 @@ int filemap_fault(struct vm_fault *vmf)
 	struct file_ra_state *ra = &file->f_ra;
 	struct inode *inode = mapping->host;
 	pgoff_t offset = vmf->pgoff;
+	pgoff_t max_off;
 	struct page *page;
-	loff_t size;
 	int ret = 0;
 
-	size = round_up(i_size_read(inode), PAGE_SIZE);
-	if (offset >= size >> PAGE_SHIFT)
+	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+	if (unlikely(offset >= max_off))
 		return VM_FAULT_SIGBUS;
 
 	/*
@@ -2258,8 +2258,8 @@ retry_find:
 	 * Found the page and have a reference on it.
 	 * We must recheck i_size under page lock.
 	 */
-	size = round_up(i_size_read(inode), PAGE_SIZE);
-	if (unlikely(offset >= size >> PAGE_SHIFT)) {
+	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+	if (unlikely(offset >= max_off)) {
 		unlock_page(page);
 		put_page(page);
 		return VM_FAULT_SIGBUS;
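[Note, not part of the patch: the two hunks above replace the round_up()-then-shift idiom with DIV_ROUND_UP() applied directly to the byte size, keeping the same bounds check with simpler arithmetic. A small standalone illustration that the two forms agree follows; the PAGE_SIZE value and the simplified local macro definitions are assumptions for the sake of the example.]

#include <assert.h>

#define PAGE_SHIFT	12			/* assumed 4K pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define round_up(x, y)		((((x) - 1) | ((y) - 1)) + 1)	/* y must be a power of two */

int main(void)
{
	unsigned long i_size = 5 * PAGE_SIZE + 123;	/* arbitrary file size */

	/* old form: round the byte size up, then convert to a page count */
	unsigned long pages_old = round_up(i_size, PAGE_SIZE) >> PAGE_SHIFT;
	/* new form: divide the byte size by the page size, rounding up */
	unsigned long pages_new = DIV_ROUND_UP(i_size, PAGE_SIZE);

	assert(pages_old == pages_new);	/* both give 6 pages here */
	return 0;
}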
@@ -2325,7 +2325,7 @@ void filemap_map_pages(struct vm_fault *vmf,
 	struct file *file = vmf->vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
 	pgoff_t last_pgoff = start_pgoff;
-	loff_t size;
+	unsigned long max_idx;
 	struct page *head, *page;
 
 	rcu_read_lock();
@@ -2371,8 +2371,8 @@ repeat:
 		if (page->mapping != mapping || !PageUptodate(page))
 			goto unlock;
 
-		size = round_up(i_size_read(mapping->host), PAGE_SIZE);
-		if (page->index >= size >> PAGE_SHIFT)
+		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+		if (page->index >= max_idx)
 			goto unlock;
 
 		if (file->f_ra.mmap_miss > 0)
@@ -2720,18 +2720,16 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
 	 * about to write. We do this *before* the write so that we can return
 	 * without clobbering -EIOCBQUEUED from ->direct_IO().
 	 */
-	if (mapping->nrpages) {
-		written = invalidate_inode_pages2_range(mapping,
+	written = invalidate_inode_pages2_range(mapping,
 					pos >> PAGE_SHIFT, end);
 	/*
 	 * If a page can not be invalidated, return 0 to fall back
 	 * to buffered write.
 	 */
 	if (written) {
 		if (written == -EBUSY)
 			return 0;
 		goto out;
-		}
 	}
 
 	written = mapping->a_ops->direct_IO(iocb, from);
@@ -2744,10 +2742,8 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
 	 * so we don't support it 100%. If this invalidation
 	 * fails, tough, the write still worked...
 	 */
-	if (mapping->nrpages) {
-		invalidate_inode_pages2_range(mapping,
-					pos >> PAGE_SHIFT, end);
-	}
+	invalidate_inode_pages2_range(mapping,
+				pos >> PAGE_SHIFT, end);
 
 	if (written > 0) {
 		pos += written;
2	mm/gup.c

@@ -1575,7 +1575,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	end = start + len;
 
 	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
-					start, len)))
+					(void __user *)start, len)))
 		return 0;
 
 	/*
@@ -1564,9 +1564,6 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	ClearPageDirty(page);
 	unlock_page(page);
 
-	if (PageActive(page))
-		deactivate_page(page);
-
 	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
 		pmdp_invalidate(vma, addr, pmd);
 		orig_pmd = pmd_mkold(orig_pmd);
@@ -1575,6 +1572,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		set_pmd_at(mm, addr, pmd, orig_pmd);
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
 	}
 
+	mark_page_lazyfree(page);
 	ret = true;
 out:
 	spin_unlock(ptl);
@@ -2145,15 +2144,15 @@ static void freeze_page(struct page *page)
 {
 	enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
 		TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
-	int ret;
+	bool unmap_success;
 
 	VM_BUG_ON_PAGE(!PageHead(page), page);
 
 	if (PageAnon(page))
 		ttu_flags |= TTU_MIGRATION;
 
-	ret = try_to_unmap(page, ttu_flags);
-	VM_BUG_ON_PAGE(ret, page);
+	unmap_success = try_to_unmap(page, ttu_flags);
+	VM_BUG_ON_PAGE(!unmap_success, page);
 }
 
 static void unfreeze_page(struct page *page)
@@ -2399,7 +2398,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 
 	VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
-	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
 	if (PageAnon(head)) {
@@ -34,8 +34,7 @@ static int hwpoison_inject(void *data, u64 val)
 	if (!hwpoison_filter_enable)
 		goto inject;
 
-	if (!PageLRU(hpage) && !PageHuge(p))
-		shake_page(hpage, 0);
+	shake_page(hpage, 0);
 	/*
 	 * This implies unable to support non-LRU pages.
 	 */
@@ -80,12 +80,17 @@ static inline void set_page_refcounted(struct page *page)
 
 extern unsigned long highest_memmap_pfn;
 
+/*
+ * Maximum number of reclaim retries without progress before the OOM
+ * killer is consider the only way forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
 /*
  * in mm/vmscan.c:
  */
 extern int isolate_lru_page(struct page *page);
 extern void putback_lru_page(struct page *page);
-extern bool pgdat_reclaimable(struct pglist_data *pgdat);
 
 /*
  * in mm/rmap.c:
@@ -505,4 +510,14 @@ extern const struct trace_print_flags pageflag_names[];
 extern const struct trace_print_flags vmaflag_names[];
 extern const struct trace_print_flags gfpflag_names[];
 
+static inline bool is_migrate_highatomic(enum migratetype migratetype)
+{
+	return migratetype == MIGRATE_HIGHATOMIC;
+}
+
+static inline bool is_migrate_highatomic_page(struct page *page)
+{
+	return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC;
+}
+
 #endif	/* __MM_INTERNAL_H */
@@ -577,7 +577,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object)
 
 	shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object));
 	if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) {
-		kasan_report_double_free(cache, object, shadow_byte);
+		kasan_report_double_free(cache, object,
+				__builtin_return_address(1));
 		return true;
 	}
 
@@ -99,7 +99,7 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
 void kasan_report(unsigned long addr, size_t size,
 		bool is_write, unsigned long ip);
 void kasan_report_double_free(struct kmem_cache *cache, void *object,
-			s8 shadow);
+			void *ip);
 
 #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB)
 void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
@@ -51,7 +51,13 @@ static const void *find_first_bad_addr(const void *addr, size_t size)
 	return first_bad_addr;
 }
 
-static void print_error_description(struct kasan_access_info *info)
+static bool addr_has_shadow(struct kasan_access_info *info)
+{
+	return (info->access_addr >=
+		kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
+}
+
+static const char *get_shadow_bug_type(struct kasan_access_info *info)
 {
 	const char *bug_type = "unknown-crash";
 	u8 *shadow_addr;
@@ -98,12 +104,39 @@ static void print_error_description(struct kasan_access_info *info)
 		break;
 	}
 
-	pr_err("BUG: KASAN: %s in %pS at addr %p\n",
-		bug_type, (void *)info->ip,
-		info->access_addr);
-	pr_err("%s of size %zu by task %s/%d\n",
-		info->is_write ? "Write" : "Read",
-		info->access_size, current->comm, task_pid_nr(current));
+	return bug_type;
+}
+
+const char *get_wild_bug_type(struct kasan_access_info *info)
+{
+	const char *bug_type = "unknown-crash";
+
+	if ((unsigned long)info->access_addr < PAGE_SIZE)
+		bug_type = "null-ptr-deref";
+	else if ((unsigned long)info->access_addr < TASK_SIZE)
+		bug_type = "user-memory-access";
+	else
+		bug_type = "wild-memory-access";
+
+	return bug_type;
+}
+
+static const char *get_bug_type(struct kasan_access_info *info)
+{
+	if (addr_has_shadow(info))
+		return get_shadow_bug_type(info);
+	return get_wild_bug_type(info);
+}
+
+static void print_error_description(struct kasan_access_info *info)
+{
+	const char *bug_type = get_bug_type(info);
+
+	pr_err("BUG: KASAN: %s in %pS\n",
+		bug_type, (void *)info->ip);
+	pr_err("%s of size %zu at addr %p by task %s/%d\n",
+		info->is_write ? "Write" : "Read", info->access_size,
+		info->access_addr, current->comm, task_pid_nr(current));
 }
 
 static inline bool kernel_or_module_addr(const void *addr)
@@ -144,9 +177,9 @@ static void kasan_end_report(unsigned long *flags)
 	kasan_enable_current();
 }
 
-static void print_track(struct kasan_track *track)
+static void print_track(struct kasan_track *track, const char *prefix)
 {
-	pr_err("PID = %u\n", track->pid);
+	pr_err("%s by task %u:\n", prefix, track->pid);
 	if (track->stack) {
 		struct stack_trace trace;
 
@@ -157,59 +190,84 @@ static void print_track(struct kasan_track *track)
 	}
 }
 
-static void kasan_object_err(struct kmem_cache *cache, void *object)
-{
-	struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
-
-	dump_stack();
-	pr_err("Object at %p, in cache %s size: %d\n", object, cache->name,
-		cache->object_size);
-
-	if (!(cache->flags & SLAB_KASAN))
-		return;
-
-	pr_err("Allocated:\n");
-	print_track(&alloc_info->alloc_track);
-	pr_err("Freed:\n");
-	print_track(&alloc_info->free_track);
-}
-
-void kasan_report_double_free(struct kmem_cache *cache, void *object,
-			s8 shadow)
-{
-	unsigned long flags;
-
-	kasan_start_report(&flags);
-	pr_err("BUG: Double free or freeing an invalid pointer\n");
-	pr_err("Unexpected shadow byte: 0x%hhX\n", shadow);
-	kasan_object_err(cache, object);
-	kasan_end_report(&flags);
-}
-
-static void print_address_description(struct kasan_access_info *info)
-{
-	const void *addr = info->access_addr;
-
-	if ((addr >= (void *)PAGE_OFFSET) &&
-			(addr < high_memory)) {
-		struct page *page = virt_to_head_page(addr);
-
-		if (PageSlab(page)) {
-			void *object;
-			struct kmem_cache *cache = page->slab_cache;
-			object = nearest_obj(cache, page,
-				(void *)info->access_addr);
-			kasan_object_err(cache, object);
-			return;
-		}
-		dump_page(page, "kasan: bad access detected");
-	}
-
-	if (kernel_or_module_addr(addr)) {
-		if (!init_task_stack_addr(addr))
-			pr_err("Address belongs to variable %pS\n", addr);
-	}
-	dump_stack();
+static struct page *addr_to_page(const void *addr)
+{
+	if ((addr >= (void *)PAGE_OFFSET) &&
+			(addr < high_memory))
+		return virt_to_head_page(addr);
+	return NULL;
+}
+
+static void describe_object_addr(struct kmem_cache *cache, void *object,
+				const void *addr)
+{
+	unsigned long access_addr = (unsigned long)addr;
+	unsigned long object_addr = (unsigned long)object;
+	const char *rel_type;
+	int rel_bytes;
+
+	pr_err("The buggy address belongs to the object at %p\n"
+	       " which belongs to the cache %s of size %d\n",
+		object, cache->name, cache->object_size);
+
+	if (!addr)
+		return;
+
+	if (access_addr < object_addr) {
+		rel_type = "to the left";
+		rel_bytes = object_addr - access_addr;
+	} else if (access_addr >= object_addr + cache->object_size) {
+		rel_type = "to the right";
+		rel_bytes = access_addr - (object_addr + cache->object_size);
+	} else {
+		rel_type = "inside";
+		rel_bytes = access_addr - object_addr;
+	}
+
+	pr_err("The buggy address is located %d bytes %s of\n"
+	       " %d-byte region [%p, %p)\n",
+		rel_bytes, rel_type, cache->object_size, (void *)object_addr,
+		(void *)(object_addr + cache->object_size));
+}
+
+static void describe_object(struct kmem_cache *cache, void *object,
+				const void *addr)
+{
+	struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
+
+	if (cache->flags & SLAB_KASAN) {
+		print_track(&alloc_info->alloc_track, "Allocated");
+		pr_err("\n");
+		print_track(&alloc_info->free_track, "Freed");
+		pr_err("\n");
+	}
+
+	describe_object_addr(cache, object, addr);
+}
+
+static void print_address_description(void *addr)
+{
+	struct page *page = addr_to_page(addr);
+
+	dump_stack();
+	pr_err("\n");
+
+	if (page && PageSlab(page)) {
+		struct kmem_cache *cache = page->slab_cache;
+		void *object = nearest_obj(cache, page, addr);
+
+		describe_object(cache, object, addr);
+	}
+
+	if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) {
+		pr_err("The buggy address belongs to the variable:\n");
+		pr_err(" %pS\n", addr);
+	}
+
+	if (page) {
+		pr_err("The buggy address belongs to the page:\n");
+		dump_page(page, "kasan: bad access detected");
+	}
 }
 
 static bool row_is_guilty(const void *row, const void *guilty)
@@ -264,31 +322,34 @@ static void print_shadow_for_address(const void *addr)
 	}
 }
 
+void kasan_report_double_free(struct kmem_cache *cache, void *object,
+				void *ip)
+{
+	unsigned long flags;
+
+	kasan_start_report(&flags);
+	pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", ip);
+	pr_err("\n");
+	print_address_description(object);
+	pr_err("\n");
+	print_shadow_for_address(object);
+	kasan_end_report(&flags);
+}
+
 static void kasan_report_error(struct kasan_access_info *info)
 {
 	unsigned long flags;
-	const char *bug_type;
 
 	kasan_start_report(&flags);
 
-	if (info->access_addr <
-			kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) {
-		if ((unsigned long)info->access_addr < PAGE_SIZE)
-			bug_type = "null-ptr-deref";
-		else if ((unsigned long)info->access_addr < TASK_SIZE)
-			bug_type = "user-memory-access";
-		else
-			bug_type = "wild-memory-access";
-		pr_err("BUG: KASAN: %s on address %p\n",
-			bug_type, info->access_addr);
-		pr_err("%s of size %zu by task %s/%d\n",
-			info->is_write ? "Write" : "Read",
-			info->access_size, current->comm,
-			task_pid_nr(current));
+	print_error_description(info);
+	pr_err("\n");
+
+	if (!addr_has_shadow(info)) {
 		dump_stack();
 	} else {
-		print_error_description(info);
-		print_address_description(info);
+		print_address_description((void *)info->access_addr);
+		pr_err("\n");
 		print_shadow_for_address(info->first_bad_addr);
 	}
 
@@ -483,8 +483,7 @@ void __khugepaged_exit(struct mm_struct *mm)
 
 static void release_pte_page(struct page *page)
 {
-	/* 0 stands for page_is_file_cache(page) == false */
-	dec_node_page_state(page, NR_ISOLATED_ANON + 0);
+	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
 	unlock_page(page);
 	putback_lru_page(page);
 }
@@ -532,7 +531,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 
 		VM_BUG_ON_PAGE(PageCompound(page), page);
 		VM_BUG_ON_PAGE(!PageAnon(page), page);
-		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
 
 		/*
 		 * We can do it before isolate_lru_page because the
@@ -550,7 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		 * The page must only be referenced by the scanned process
 		 * and page swap cache.
 		 */
-		if (page_count(page) != 1 + !!PageSwapCache(page)) {
+		if (page_count(page) != 1 + PageSwapCache(page)) {
 			unlock_page(page);
 			result = SCAN_PAGE_COUNT;
 			goto out;
@@ -579,8 +577,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			result = SCAN_DEL_PAGE_LRU;
 			goto out;
 		}
-		/* 0 stands for page_is_file_cache(page) == false */
-		inc_node_page_state(page, NR_ISOLATED_ANON + 0);
+		inc_node_page_state(page,
+				NR_ISOLATED_ANON + page_is_file_cache(page));
 		VM_BUG_ON_PAGE(!PageLocked(page), page);
 		VM_BUG_ON_PAGE(PageLRU(page), page);
 
@@ -1183,7 +1181,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		 * The page must only be referenced by the scanned process
 		 * and page swap cache.
 		 */
-		if (page_count(page) != 1 + !!PageSwapCache(page)) {
+		if (page_count(page) != 1 + PageSwapCache(page)) {
 			result = SCAN_PAGE_COUNT;
 			goto out_unmap;
 		}
16	mm/ksm.c

@@ -1933,11 +1933,10 @@ struct page *ksm_might_need_to_copy(struct page *page,
 	return new_page;
 }
 
-int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
+void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
 {
 	struct stable_node *stable_node;
 	struct rmap_item *rmap_item;
-	int ret = SWAP_AGAIN;
 	int search_new_forks = 0;
 
 	VM_BUG_ON_PAGE(!PageKsm(page), page);
@@ -1950,7 +1949,7 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
 
 	stable_node = page_stable_node(page);
 	if (!stable_node)
-		return ret;
+		return;
 again:
 	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
 		struct anon_vma *anon_vma = rmap_item->anon_vma;
@@ -1978,23 +1977,20 @@ again:
 			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
 				continue;
 
-			ret = rwc->rmap_one(page, vma,
-					rmap_item->address, rwc->arg);
-			if (ret != SWAP_AGAIN) {
+			if (!rwc->rmap_one(page, vma,
+					rmap_item->address, rwc->arg)) {
 				anon_vma_unlock_read(anon_vma);
-				goto out;
+				return;
 			}
 			if (rwc->done && rwc->done(page)) {
 				anon_vma_unlock_read(anon_vma);
-				goto out;
+				return;
 			}
 		}
 		anon_vma_unlock_read(anon_vma);
 	}
 	if (!search_new_forks++)
 		goto again;
-out:
-	return ret;
 }
 
 #ifdef CONFIG_MIGRATION
56	mm/madvise.c

@@ -411,10 +411,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 			ptent = pte_mkold(ptent);
 			ptent = pte_mkclean(ptent);
 			set_pte_at(mm, addr, pte, ptent);
-			if (PageActive(page))
-				deactivate_page(page);
 			tlb_remove_tlb_entry(tlb, pte, addr);
 		}
+		mark_page_lazyfree(page);
 	}
 out:
 	if (nr_swap) {
@@ -606,34 +605,40 @@ static long madvise_remove(struct vm_area_struct *vma,
 /*
  * Error injection support for memory error handling.
  */
-static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
+static int madvise_inject_error(int behavior,
+		unsigned long start, unsigned long end)
 {
-	struct page *p;
+	struct page *page;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	for (; start < end; start += PAGE_SIZE <<
-				compound_order(compound_head(p))) {
+				compound_order(compound_head(page))) {
 		int ret;
 
-		ret = get_user_pages_fast(start, 1, 0, &p);
+		ret = get_user_pages_fast(start, 1, 0, &page);
 		if (ret != 1)
 			return ret;
 
-		if (PageHWPoison(p)) {
-			put_page(p);
+		if (PageHWPoison(page)) {
+			put_page(page);
 			continue;
 		}
-		if (bhv == MADV_SOFT_OFFLINE) {
-			pr_info("Soft offlining page %#lx at %#lx\n",
-				page_to_pfn(p), start);
-			ret = soft_offline_page(p, MF_COUNT_INCREASED);
+
+		if (behavior == MADV_SOFT_OFFLINE) {
+			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
+					page_to_pfn(page), start);
+
+			ret = soft_offline_page(page, MF_COUNT_INCREASED);
 			if (ret)
 				return ret;
 			continue;
 		}
-		pr_info("Injecting memory failure for page %#lx at %#lx\n",
-			page_to_pfn(p), start);
-		ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+		pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
+				page_to_pfn(page), start);
+
+		ret = memory_failure(page_to_pfn(page), 0, MF_COUNT_INCREASED);
 		if (ret)
 			return ret;
 	}
@@ -651,13 +656,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	case MADV_WILLNEED:
 		return madvise_willneed(vma, prev, start, end);
 	case MADV_FREE:
-		/*
-		 * XXX: In this implementation, MADV_FREE works like
-		 * MADV_DONTNEED on swapless system or full swap.
-		 */
-		if (get_nr_swap_pages() > 0)
-			return madvise_free(vma, prev, start, end);
-		/* passthrough */
+		return madvise_free(vma, prev, start, end);
 	case MADV_DONTNEED:
 		return madvise_dontneed(vma, prev, start, end);
 	default:
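[Note, not part of the patch: the hunk above drops the swap-space check, so MADV_FREE no longer falls through to MADV_DONTNEED on swapless systems. A minimal userspace sketch of the call this affects; the mapping size and the write that dirties it are illustrative, and the program assumes a kernel and libc that expose MADV_FREE.]

#include <sys/mman.h>

int main(void)
{
	size_t len = 64 * 4096;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	buf[0] = 1;	/* dirty the range */

	/*
	 * Tell the kernel the contents are disposable: the pages are
	 * freed lazily under memory pressure rather than immediately.
	 */
	if (madvise(buf, len, MADV_FREE))
		return 1;

	munmap(buf, len);
	return 0;
}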
@@ -688,6 +687,10 @@ madvise_behavior_valid(int behavior)
 #endif
 	case MADV_DONTDUMP:
 	case MADV_DODUMP:
+#ifdef CONFIG_MEMORY_FAILURE
+	case MADV_SOFT_OFFLINE:
+	case MADV_HWPOISON:
+#endif
 		return true;
 
 	default:
@@ -761,10 +764,6 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	size_t len;
 	struct blk_plug plug;
 
-#ifdef CONFIG_MEMORY_FAILURE
-	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
-		return madvise_hwpoison(behavior, start, start+len_in);
-#endif
 	if (!madvise_behavior_valid(behavior))
 		return error;
 
@@ -784,6 +783,11 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	if (end == start)
 		return error;
 
+#ifdef CONFIG_MEMORY_FAILURE
+	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
+		return madvise_inject_error(behavior, start, start + len_in);
+#endif
+
 	write = madvise_need_mmap_write(behavior);
 	if (write) {
 		if (down_write_killable(&current->mm->mmap_sem))
248
mm/memcontrol.c
248
mm/memcontrol.c
|
@ -100,24 +100,7 @@ static bool do_memsw_account(void)
|
||||||
return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
|
return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char * const mem_cgroup_stat_names[] = {
|
static const char *const mem_cgroup_lru_names[] = {
|
||||||
"cache",
|
|
||||||
"rss",
|
|
||||||
"rss_huge",
|
|
||||||
"mapped_file",
|
|
||||||
"dirty",
|
|
||||||
"writeback",
|
|
||||||
"swap",
|
|
||||||
};
|
|
||||||
|
|
||||||
static const char * const mem_cgroup_events_names[] = {
|
|
||||||
"pgpgin",
|
|
||||||
"pgpgout",
|
|
||||||
"pgfault",
|
|
||||||
"pgmajfault",
|
|
||||||
};
|
|
||||||
|
|
||||||
static const char * const mem_cgroup_lru_names[] = {
|
|
||||||
"inactive_anon",
|
"inactive_anon",
|
||||||
"active_anon",
|
"active_anon",
|
||||||
"inactive_file",
|
"inactive_file",
|
||||||
|
@ -568,32 +551,15 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
|
||||||
* common workload, threshold and synchronization as vmstat[] should be
|
* common workload, threshold and synchronization as vmstat[] should be
|
||||||
* implemented.
|
* implemented.
|
||||||
*/
|
*/
|
||||||
static unsigned long
|
|
||||||
mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
|
|
||||||
{
|
|
||||||
long val = 0;
|
|
||||||
int cpu;
|
|
||||||
|
|
||||||
/* Per-cpu values can be negative, use a signed accumulator */
|
static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
|
||||||
for_each_possible_cpu(cpu)
|
enum memcg_event_item event)
|
||||||
val += per_cpu(memcg->stat->count[idx], cpu);
|
|
||||||
/*
|
|
||||||
* Summing races with updates, so val may be negative. Avoid exposing
|
|
||||||
* transient negative values.
|
|
||||||
*/
|
|
||||||
if (val < 0)
|
|
||||||
val = 0;
|
|
||||||
return val;
|
|
||||||
}
|
|
||||||
|
|
||||||
static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
|
|
||||||
enum mem_cgroup_events_index idx)
|
|
||||||
{
|
{
|
||||||
unsigned long val = 0;
|
unsigned long val = 0;
|
||||||
int cpu;
|
int cpu;
|
||||||
|
|
||||||
for_each_possible_cpu(cpu)
|
for_each_possible_cpu(cpu)
|
||||||
val += per_cpu(memcg->stat->events[idx], cpu);
|
val += per_cpu(memcg->stat->events[event], cpu);
|
||||||
return val;
|
return val;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -606,23 +572,23 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
|
||||||
* counted as CACHE even if it's on ANON LRU.
|
* counted as CACHE even if it's on ANON LRU.
|
||||||
*/
|
*/
|
||||||
if (PageAnon(page))
|
if (PageAnon(page))
|
||||||
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
|
__this_cpu_add(memcg->stat->count[MEMCG_RSS], nr_pages);
|
||||||
nr_pages);
|
else {
|
||||||
else
|
__this_cpu_add(memcg->stat->count[MEMCG_CACHE], nr_pages);
|
||||||
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
|
if (PageSwapBacked(page))
|
||||||
nr_pages);
|
__this_cpu_add(memcg->stat->count[NR_SHMEM], nr_pages);
|
||||||
|
}
|
||||||
|
|
||||||
if (compound) {
|
if (compound) {
|
||||||
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
|
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
|
||||||
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
|
__this_cpu_add(memcg->stat->count[MEMCG_RSS_HUGE], nr_pages);
|
||||||
nr_pages);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* pagein of a big page is an event. So, ignore page size */
|
/* pagein of a big page is an event. So, ignore page size */
|
||||||
if (nr_pages > 0)
|
if (nr_pages > 0)
|
||||||
__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
|
__this_cpu_inc(memcg->stat->events[PGPGIN]);
|
||||||
else {
|
else {
|
||||||
__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
|
__this_cpu_inc(memcg->stat->events[PGPGOUT]);
|
||||||
nr_pages = -nr_pages; /* for event */
|
nr_pages = -nr_pages; /* for event */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1144,6 +1110,28 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsigned int memcg1_stats[] = {
|
||||||
|
MEMCG_CACHE,
|
||||||
|
MEMCG_RSS,
|
||||||
|
MEMCG_RSS_HUGE,
|
||||||
|
NR_SHMEM,
|
||||||
|
NR_FILE_MAPPED,
|
||||||
|
NR_FILE_DIRTY,
|
||||||
|
NR_WRITEBACK,
|
||||||
|
MEMCG_SWAP,
|
||||||
|
};
|
||||||
|
|
||||||
|
static const char *const memcg1_stat_names[] = {
|
||||||
|
"cache",
|
||||||
|
"rss",
|
||||||
|
"rss_huge",
|
||||||
|
"shmem",
|
||||||
|
"mapped_file",
|
||||||
|
"dirty",
|
||||||
|
"writeback",
|
||||||
|
"swap",
|
||||||
|
};
|
||||||
|
|
||||||
#define K(x) ((x) << (PAGE_SHIFT-10))
|
#define K(x) ((x) << (PAGE_SHIFT-10))
|
||||||
/**
|
/**
|
||||||
* mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
|
* mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
|
||||||
|
@@ -1188,11 +1176,11 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 		pr_cont_cgroup_path(iter->css.cgroup);
 		pr_cont(":");
 
-		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
-			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
+		for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
+			if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
 				continue;
-			pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
-				K(mem_cgroup_read_stat(iter, i)));
+			pr_cont(" %s:%luKB", memcg1_stat_names[i],
+				K(memcg_page_state(iter, memcg1_stats[i])));
 		}
 
 		for (i = 0; i < NR_LRU_LISTS; i++)
@@ -1837,7 +1825,7 @@ static void reclaim_high(struct mem_cgroup *memcg,
 	do {
 		if (page_counter_read(&memcg->memory) <= memcg->high)
 			continue;
-		mem_cgroup_events(memcg, MEMCG_HIGH, 1);
+		mem_cgroup_event(memcg, MEMCG_HIGH);
 		try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
 	} while ((memcg = parent_mem_cgroup(memcg)));
 }
@@ -1928,7 +1916,7 @@ retry:
 	if (!gfpflags_allow_blocking(gfp_mask))
 		goto nomem;
 
-	mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
+	mem_cgroup_event(mem_over_limit, MEMCG_MAX);
 
 	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
 						    gfp_mask, may_swap);
@@ -1971,7 +1959,7 @@ retry:
 	if (fatal_signal_pending(current))
 		goto force;
 
-	mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
+	mem_cgroup_event(mem_over_limit, MEMCG_OOM);
 
 	mem_cgroup_oom(mem_over_limit, gfp_mask,
 		       get_order(nr_pages * PAGE_SIZE));
@@ -2381,7 +2369,7 @@ void mem_cgroup_split_huge_fixup(struct page *head)
 	for (i = 1; i < HPAGE_PMD_NR; i++)
 		head[i].mem_cgroup = head->mem_cgroup;
 
-	__this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
+	__this_cpu_sub(head->mem_cgroup->stat->count[MEMCG_RSS_HUGE],
 		       HPAGE_PMD_NR);
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -2391,7 +2379,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
 					 bool charge)
 {
 	int val = (charge) ? 1 : -1;
-	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
+	this_cpu_add(memcg->stat->count[MEMCG_SWAP], val);
 }
 
 /**
@@ -2725,7 +2713,7 @@ static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
 
 	for_each_mem_cgroup_tree(iter, memcg) {
 		for (i = 0; i < MEMCG_NR_STAT; i++)
-			stat[i] += mem_cgroup_read_stat(iter, i);
+			stat[i] += memcg_page_state(iter, i);
 	}
 }
 
@@ -2738,7 +2726,7 @@ static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
 
 	for_each_mem_cgroup_tree(iter, memcg) {
 		for (i = 0; i < MEMCG_NR_EVENTS; i++)
-			events[i] += mem_cgroup_read_events(iter, i);
+			events[i] += memcg_sum_events(iter, i);
 	}
 }
 
@@ -2750,13 +2738,10 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 		struct mem_cgroup *iter;
 
 		for_each_mem_cgroup_tree(iter, memcg) {
-			val += mem_cgroup_read_stat(iter,
-					MEM_CGROUP_STAT_CACHE);
-			val += mem_cgroup_read_stat(iter,
-					MEM_CGROUP_STAT_RSS);
+			val += memcg_page_state(iter, MEMCG_CACHE);
+			val += memcg_page_state(iter, MEMCG_RSS);
 			if (swap)
-				val += mem_cgroup_read_stat(iter,
-						MEM_CGROUP_STAT_SWAP);
+				val += memcg_page_state(iter, MEMCG_SWAP);
 		}
 	} else {
 		if (!swap)
@@ -3131,6 +3116,21 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
 }
 #endif /* CONFIG_NUMA */
 
+/* Universal VM events cgroup1 shows, original sort order */
+unsigned int memcg1_events[] = {
+	PGPGIN,
+	PGPGOUT,
+	PGFAULT,
+	PGMAJFAULT,
+};
+
+static const char *const memcg1_event_names[] = {
+	"pgpgin",
+	"pgpgout",
+	"pgfault",
+	"pgmajfault",
+};
+
 static int memcg_stat_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -3138,22 +3138,20 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 	struct mem_cgroup *mi;
 	unsigned int i;
 
-	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
-		     MEM_CGROUP_STAT_NSTATS);
-	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
-		     MEM_CGROUP_EVENTS_NSTATS);
+	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
 	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
 
-	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
-		if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
+	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
+		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
 			continue;
-		seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
-			   mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
+		seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
+			   memcg_page_state(memcg, memcg1_stats[i]) *
+			   PAGE_SIZE);
 	}
 
-	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
-		seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
-			   mem_cgroup_read_events(memcg, i));
+	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
+		seq_printf(m, "%s %lu\n", memcg1_event_names[i],
+			   memcg_sum_events(memcg, memcg1_events[i]));
 
 	for (i = 0; i < NR_LRU_LISTS; i++)
 		seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
@@ -3171,23 +3169,23 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 		seq_printf(m, "hierarchical_memsw_limit %llu\n",
 			   (u64)memsw * PAGE_SIZE);
 
-	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
 		unsigned long long val = 0;
 
-		if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
+		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
 			continue;
 		for_each_mem_cgroup_tree(mi, memcg)
-			val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
-		seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val);
+			val += memcg_page_state(mi, memcg1_stats[i]) *
+			PAGE_SIZE;
+		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val);
 	}
 
-	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
+	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) {
 		unsigned long long val = 0;
 
 		for_each_mem_cgroup_tree(mi, memcg)
-			val += mem_cgroup_read_events(mi, i);
-		seq_printf(m, "total_%s %llu\n",
-			   mem_cgroup_events_names[i], val);
+			val += memcg_sum_events(mi, memcg1_events[i]);
+		seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val);
 	}
 
 	for (i = 0; i < NR_LRU_LISTS; i++) {
@@ -3652,10 +3650,10 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
 	struct mem_cgroup *parent;
 
-	*pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+	*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
 
 	/* this should eventually include NR_UNSTABLE_NFS */
-	*pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+	*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
 	*pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
 						     (1 << LRU_ACTIVE_FILE));
 	*pheadroom = PAGE_COUNTER_MAX;
@@ -4511,33 +4509,29 @@ static int mem_cgroup_move_account(struct page *page,
 	spin_lock_irqsave(&from->move_lock, flags);
 
 	if (!anon && page_mapped(page)) {
-		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
-			       nr_pages);
-		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
-			       nr_pages);
+		__this_cpu_sub(from->stat->count[NR_FILE_MAPPED], nr_pages);
+		__this_cpu_add(to->stat->count[NR_FILE_MAPPED], nr_pages);
 	}
 
 	/*
 	 * move_lock grabbed above and caller set from->moving_account, so
-	 * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
+	 * mod_memcg_page_state will serialize updates to PageDirty.
 	 * So mapping should be stable for dirty pages.
 	 */
 	if (!anon && PageDirty(page)) {
 		struct address_space *mapping = page_mapping(page);
 
 		if (mapping_cap_account_dirty(mapping)) {
-			__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
-				       nr_pages);
-			__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
-				       nr_pages);
+			__this_cpu_sub(from->stat->count[NR_FILE_DIRTY],
+				       nr_pages);
+			__this_cpu_add(to->stat->count[NR_FILE_DIRTY],
+				       nr_pages);
 		}
 	}
 
 	if (PageWriteback(page)) {
-		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
-			       nr_pages);
-		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
-			       nr_pages);
+		__this_cpu_sub(from->stat->count[NR_WRITEBACK], nr_pages);
+		__this_cpu_add(to->stat->count[NR_WRITEBACK], nr_pages);
 	}
 
 	/*
@@ -5154,7 +5148,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 			continue;
 		}
 
-		mem_cgroup_events(memcg, MEMCG_OOM, 1);
+		mem_cgroup_event(memcg, MEMCG_OOM);
 		if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
 			break;
 	}
@@ -5167,10 +5161,10 @@ static int memory_events_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
 
-	seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW));
-	seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH));
-	seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX));
-	seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM));
+	seq_printf(m, "low %lu\n", memcg_sum_events(memcg, MEMCG_LOW));
+	seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH));
+	seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX));
+	seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM));
 
 	return 0;
 }
@@ -5197,9 +5191,9 @@ static int memory_stat_show(struct seq_file *m, void *v)
 	tree_events(memcg, events);
 
 	seq_printf(m, "anon %llu\n",
-		   (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE);
+		   (u64)stat[MEMCG_RSS] * PAGE_SIZE);
 	seq_printf(m, "file %llu\n",
-		   (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
+		   (u64)stat[MEMCG_CACHE] * PAGE_SIZE);
 	seq_printf(m, "kernel_stack %llu\n",
 		   (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
 	seq_printf(m, "slab %llu\n",
@@ -5208,12 +5202,14 @@ static int memory_stat_show(struct seq_file *m, void *v)
 	seq_printf(m, "sock %llu\n",
 		   (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
 
+	seq_printf(m, "shmem %llu\n",
+		   (u64)stat[NR_SHMEM] * PAGE_SIZE);
 	seq_printf(m, "file_mapped %llu\n",
-		   (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE);
+		   (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
 	seq_printf(m, "file_dirty %llu\n",
-		   (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE);
+		   (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
 	seq_printf(m, "file_writeback %llu\n",
-		   (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE);
+		   (u64)stat[NR_WRITEBACK] * PAGE_SIZE);
 
 	for (i = 0; i < NR_LRU_LISTS; i++) {
 		struct mem_cgroup *mi;
@@ -5232,10 +5228,15 @@ static int memory_stat_show(struct seq_file *m, void *v)
 
 	/* Accumulated memory events */
 
-	seq_printf(m, "pgfault %lu\n",
-		   events[MEM_CGROUP_EVENTS_PGFAULT]);
-	seq_printf(m, "pgmajfault %lu\n",
-		   events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
+	seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
+	seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
+
+	seq_printf(m, "workingset_refault %lu\n",
+		   stat[WORKINGSET_REFAULT]);
+	seq_printf(m, "workingset_activate %lu\n",
+		   stat[WORKINGSET_ACTIVATE]);
+	seq_printf(m, "workingset_nodereclaim %lu\n",
+		   stat[WORKINGSET_NODERECLAIM]);
 
 	return 0;
 }
@@ -5476,8 +5477,8 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
 
 static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
 			   unsigned long nr_anon, unsigned long nr_file,
-			   unsigned long nr_huge, unsigned long nr_kmem,
-			   struct page *dummy_page)
+			   unsigned long nr_kmem, unsigned long nr_huge,
+			   unsigned long nr_shmem, struct page *dummy_page)
 {
 	unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
 	unsigned long flags;
@@ -5492,10 +5493,11 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
 	}
 
 	local_irq_save(flags);
-	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
-	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
-	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
-	__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
+	__this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon);
+	__this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file);
+	__this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge);
+	__this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem);
+	__this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout);
 	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 	memcg_check_events(memcg, dummy_page);
 	local_irq_restore(flags);
@@ -5507,6 +5509,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
 static void uncharge_list(struct list_head *page_list)
 {
 	struct mem_cgroup *memcg = NULL;
+	unsigned long nr_shmem = 0;
 	unsigned long nr_anon = 0;
 	unsigned long nr_file = 0;
 	unsigned long nr_huge = 0;
@@ -5539,9 +5542,9 @@ static void uncharge_list(struct list_head *page_list)
 		if (memcg != page->mem_cgroup) {
 			if (memcg) {
 				uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
-					       nr_huge, nr_kmem, page);
-				pgpgout = nr_anon = nr_file =
-					nr_huge = nr_kmem = 0;
+					       nr_kmem, nr_huge, nr_shmem, page);
+				pgpgout = nr_anon = nr_file = nr_kmem = 0;
+				nr_huge = nr_shmem = 0;
 			}
 			memcg = page->mem_cgroup;
 		}
@@ -5555,8 +5558,11 @@ static void uncharge_list(struct list_head *page_list)
 			}
 			if (PageAnon(page))
 				nr_anon += nr_pages;
-			else
+			else {
 				nr_file += nr_pages;
+				if (PageSwapBacked(page))
+					nr_shmem += nr_pages;
+			}
 			pgpgout++;
 		} else {
 			nr_kmem += 1 << compound_order(page);
@@ -5568,7 +5574,7 @@ static void uncharge_list(struct list_head *page_list)
 
 	if (memcg)
 		uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
-			       nr_huge, nr_kmem, page);
+			       nr_kmem, nr_huge, nr_shmem, page);
 }
 
 /**
@@ -220,6 +220,9 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
  */
 void shake_page(struct page *p, int access)
 {
+	if (PageHuge(p))
+		return;
+
 	if (!PageSlab(p)) {
 		lru_add_drain_all();
 		if (PageLRU(p))
@@ -322,7 +325,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
  * wrong earlier.
  */
 static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
-			  int fail, struct page *page, unsigned long pfn,
+			  bool fail, struct page *page, unsigned long pfn,
 			  int flags)
 {
 	struct to_kill *tk, *next;
@@ -904,35 +907,36 @@ EXPORT_SYMBOL_GPL(get_hwpoison_page);
  * Do all that is necessary to remove user space mappings. Unmap
  * the pages and send SIGBUS to the processes if the data was dirty.
  */
-static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
+static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 				  int trapno, int flags, struct page **hpagep)
 {
-	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
 	struct address_space *mapping;
 	LIST_HEAD(tokill);
-	int ret;
+	bool unmap_success;
 	int kill = 1, forcekill;
 	struct page *hpage = *hpagep;
+	bool mlocked = PageMlocked(hpage);
 
 	/*
 	 * Here we are interested only in user-mapped pages, so skip any
 	 * other types of pages.
 	 */
 	if (PageReserved(p) || PageSlab(p))
-		return SWAP_SUCCESS;
+		return true;
 	if (!(PageLRU(hpage) || PageHuge(p)))
-		return SWAP_SUCCESS;
+		return true;
 
 	/*
 	 * This check implies we don't kill processes if their pages
 	 * are in the swap cache early. Those are always late kills.
 	 */
 	if (!page_mapped(hpage))
-		return SWAP_SUCCESS;
+		return true;
 
 	if (PageKsm(p)) {
 		pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
-		return SWAP_FAIL;
+		return false;
 	}
 
 	if (PageSwapCache(p)) {
@@ -971,11 +975,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	if (kill)
 		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
 
-	ret = try_to_unmap(hpage, ttu);
-	if (ret != SWAP_SUCCESS)
+	unmap_success = try_to_unmap(hpage, ttu);
+	if (!unmap_success)
 		pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
 		       pfn, page_mapcount(hpage));
 
+	/*
+	 * try_to_unmap() might put mlocked page in lru cache, so call
+	 * shake_page() again to ensure that it's flushed.
+	 */
+	if (mlocked)
+		shake_page(hpage, 0);
+
 	/*
 	 * Now that the dirty bit has been propagated to the
 	 * struct page and all unmaps done we can decide if
@@ -987,10 +998,9 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * any accesses to the poisoned memory.
 	 */
 	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
-	kill_procs(&tokill, forcekill, trapno,
-		      ret != SWAP_SUCCESS, p, pfn, flags);
+	kill_procs(&tokill, forcekill, trapno, !unmap_success, p, pfn, flags);
 
-	return ret;
+	return unmap_success;
 }
 
 static void set_page_hwpoison_huge_page(struct page *hpage)
@@ -1138,22 +1148,14 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	 * The check (unnecessarily) ignores LRU pages being isolated and
 	 * walked by the page reclaim code, however that's not a big loss.
 	 */
-	if (!PageHuge(p)) {
-		if (!PageLRU(p))
-			shake_page(p, 0);
-		if (!PageLRU(p)) {
-			/*
-			 * shake_page could have turned it free.
-			 */
-			if (is_free_buddy_page(p)) {
-				if (flags & MF_COUNT_INCREASED)
-					action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
-				else
-					action_result(pfn, MF_MSG_BUDDY_2ND,
-						      MF_DELAYED);
-				return 0;
-			}
-		}
+	shake_page(p, 0);
+	/* shake_page could have turned it free. */
+	if (!PageLRU(p) && is_free_buddy_page(p)) {
+		if (flags & MF_COUNT_INCREASED)
+			action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
+		else
+			action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
+		return 0;
 	}
 
 	lock_page(hpage);
@@ -1230,8 +1232,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	 * When the raw error page is thp tail page, hpage points to the raw
 	 * page after thp split.
 	 */
-	if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
-	    != SWAP_SUCCESS) {
+	if (!hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)) {
 		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
 		res = -EBUSY;
 		goto out;
@@ -1543,8 +1544,8 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
 		if (ret == 1 && !PageLRU(page)) {
 			/* Drop page reference which is from __get_any_page() */
 			put_hwpoison_page(page);
-			pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
-				pfn, page->flags);
+			pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
+				pfn, page->flags, &page->flags);
 			return -EIO;
 		}
 	}
@@ -1585,8 +1586,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
 	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 				MIGRATE_SYNC, MR_MEMORY_FAILURE);
 	if (ret) {
-		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
-			pfn, ret, page->flags);
+		pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
+			pfn, ret, page->flags, &page->flags);
 		/*
 		 * We know that soft_offline_huge_page() tries to migrate
 		 * only one hugepage pointed to by hpage, so we need not
@@ -1677,14 +1678,14 @@ static int __soft_offline_page(struct page *page, int flags)
 		if (!list_empty(&pagelist))
 			putback_movable_pages(&pagelist);
 
-		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
-			pfn, ret, page->flags);
+		pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
+			pfn, ret, page->flags, &page->flags);
 		if (ret > 0)
 			ret = -EIO;
 		}
 	} else {
-		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
-			pfn, ret, page_count(page), page->flags);
+		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
+			pfn, ret, page_count(page), page->flags, &page->flags);
 	}
 	return ret;
 }
@@ -1208,7 +1208,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 
 		arch_refresh_nodedata(nid, pgdat);
 	} else {
-		/* Reset the nr_zones, order and classzone_idx before reuse */
+		/*
+		 * Reset the nr_zones, order and classzone_idx before reuse.
+		 * Note that kswapd will init kswapd_classzone_idx properly
+		 * when it starts in the near future.
+		 */
 		pgdat->nr_zones = 0;
 		pgdat->kswapd_order = 0;
 		pgdat->kswapd_classzone_idx = 0;
mm/migrate.c: 10 changes
@@ -194,7 +194,7 @@ void putback_movable_pages(struct list_head *l)
 /*
  * Restore a potential migration pte to a working pte entry
  */
-static int remove_migration_pte(struct page *page, struct vm_area_struct *vma,
+static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
 				 unsigned long addr, void *old)
 {
 	struct page_vma_mapped_walk pvmw = {
@@ -253,7 +253,7 @@ static int remove_migration_pte(struct page *page, struct vm_area_struct *vma,
 		update_mmu_cache(vma, pvmw.address, pvmw.pte);
 	}
 
-	return SWAP_AGAIN;
+	return true;
 }
 
 /*
@@ -1722,9 +1722,6 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
 {
 	int z;
 
-	if (!pgdat_reclaimable(pgdat))
-		return false;
-
 	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
 		struct zone *zone = pgdat->node_zones + z;
 
@@ -1947,7 +1944,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 
 	/* Prepare a page as a migration target */
 	__SetPageLocked(new_page);
-	__SetPageSwapBacked(new_page);
+	if (PageSwapBacked(page))
+		__SetPageSwapBacked(new_page);
 
 	/* anon mapping, we can simply copy page->mapping to the new page: */
 	new_page->mapping = page->mapping;
@@ -123,17 +123,15 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
  */
 static void __munlock_isolated_page(struct page *page)
 {
-	int ret = SWAP_AGAIN;
-
 	/*
 	 * Optimization: if the page was mapped just once, that's our mapping
 	 * and we don't need to check all the other vmas.
 	 */
 	if (page_mapcount(page) > 1)
-		ret = try_to_munlock(page);
+		try_to_munlock(page);
 
 	/* Did try_to_unlock() succeed or punt? */
-	if (ret != SWAP_MLOCK)
+	if (!PageMlocked(page))
 		count_vm_event(UNEVICTABLE_PGMUNLOCKED);
 
 	putback_lru_page(page);
@@ -1479,7 +1479,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 		struct user_struct *user = NULL;
 		struct hstate *hs;
 
-		hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK);
+		hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
 		if (!hs)
 			return -EINVAL;
 
@@ -685,6 +685,7 @@ void exit_oom_victim(void)
 void oom_killer_enable(void)
 {
 	oom_killer_disabled = false;
+	pr_info("OOM killer enabled.\n");
 }
 
 /**
@@ -721,6 +722,7 @@ bool oom_killer_disable(signed long timeout)
 		oom_killer_enable();
 		return false;
 	}
+	pr_info("OOM killer disabled.\n");
 
 	return true;
 }
@@ -650,9 +650,8 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
 
 	spin_lock_init(&dom->lock);
 
-	init_timer_deferrable(&dom->period_timer);
-	dom->period_timer.function = writeout_period;
-	dom->period_timer.data = (unsigned long)dom;
+	setup_deferrable_timer(&dom->period_timer, writeout_period,
+			       (unsigned long)dom);
 
 	dom->dirty_limit_tstamp = jiffies;
 
@@ -2428,7 +2427,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
 		inode_attach_wb(inode, page);
 		wb = inode_to_wb(inode);
 
-		mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
+		inc_memcg_page_state(page, NR_FILE_DIRTY);
 		__inc_node_page_state(page, NR_FILE_DIRTY);
 		__inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 		__inc_node_page_state(page, NR_DIRTIED);
@@ -2450,7 +2449,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
 			  struct bdi_writeback *wb)
 {
 	if (mapping_cap_account_dirty(mapping)) {
-		mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
+		dec_memcg_page_state(page, NR_FILE_DIRTY);
 		dec_node_page_state(page, NR_FILE_DIRTY);
 		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 		dec_wb_stat(wb, WB_RECLAIMABLE);
@@ -2707,7 +2706,7 @@ int clear_page_dirty_for_io(struct page *page)
 		 */
 		wb = unlocked_inode_to_wb_begin(inode, &locked);
 		if (TestClearPageDirty(page)) {
-			mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
+			dec_memcg_page_state(page, NR_FILE_DIRTY);
 			dec_node_page_state(page, NR_FILE_DIRTY);
 			dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 			dec_wb_stat(wb, WB_RECLAIMABLE);
@@ -2754,7 +2753,7 @@ int test_clear_page_writeback(struct page *page)
 		ret = TestClearPageWriteback(page);
 	}
 	if (ret) {
-		mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
+		dec_memcg_page_state(page, NR_WRITEBACK);
 		dec_node_page_state(page, NR_WRITEBACK);
 		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 		inc_node_page_state(page, NR_WRITTEN);
@@ -2809,7 +2808,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
 		ret = TestSetPageWriteback(page);
 	}
 	if (!ret) {
-		mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
+		inc_memcg_page_state(page, NR_WRITEBACK);
 		inc_node_page_state(page, NR_WRITEBACK);
 		inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 	}
@@ -1090,14 +1090,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 {
 	int migratetype = 0;
 	int batch_free = 0;
-	unsigned long nr_scanned;
 	bool isolated_pageblocks;
 
 	spin_lock(&zone->lock);
 	isolated_pageblocks = has_isolate_pageblock(zone);
-	nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
-	if (nr_scanned)
-		__mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
 
 	while (count) {
 		struct page *page;
@@ -1150,12 +1146,7 @@ static void free_one_page(struct zone *zone,
 				unsigned int order,
 				int migratetype)
 {
-	unsigned long nr_scanned;
 	spin_lock(&zone->lock);
-	nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
-	if (nr_scanned)
-		__mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
-
 	if (unlikely(has_isolate_pageblock(zone) ||
 		is_migrate_isolate(migratetype))) {
 		migratetype = get_pfnblock_migratetype(page, pfn);
@@ -1698,10 +1689,10 @@ static inline int check_new_page(struct page *page)
 	return 1;
 }
 
-static inline bool free_pages_prezeroed(bool poisoned)
+static inline bool free_pages_prezeroed(void)
 {
 	return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
-		page_poisoning_enabled() && poisoned;
+		page_poisoning_enabled();
 }
 
 #ifdef CONFIG_DEBUG_VM
@@ -1755,17 +1746,10 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
 							unsigned int alloc_flags)
 {
 	int i;
-	bool poisoned = true;
-
-	for (i = 0; i < (1 << order); i++) {
-		struct page *p = page + i;
-		if (poisoned)
-			poisoned &= page_is_poisoned(p);
-	}
 
 	post_alloc_hook(page, order, gfp_flags);
 
-	if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
+	if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
 		for (i = 0; i < (1 << order); i++)
 			clear_highpage(page + i);
 
@@ -2045,8 +2029,8 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
 
 	/* Yoink! */
 	mt = get_pageblock_migratetype(page);
-	if (mt != MIGRATE_HIGHATOMIC &&
-			!is_migrate_isolate(mt) && !is_migrate_cma(mt)) {
+	if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
+	    && !is_migrate_cma(mt)) {
 		zone->nr_reserved_highatomic += pageblock_nr_pages;
 		set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
 		move_freepages_block(zone, page, MIGRATE_HIGHATOMIC);
@@ -2103,8 +2087,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 			 * from highatomic to ac->migratetype. So we should
 			 * adjust the count once.
 			 */
-			if (get_pageblock_migratetype(page) ==
-							MIGRATE_HIGHATOMIC) {
+			if (is_migrate_highatomic_page(page)) {
 				/*
 				 * It should never happen but changes to
 				 * locking could inadvertently allow a per-cpu
@@ -2161,8 +2144,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
 
 		page = list_first_entry(&area->free_list[fallback_mt],
 						struct page, lru);
-		if (can_steal &&
-			get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC)
+		if (can_steal && !is_migrate_highatomic_page(page))
 			steal_suitable_fallback(zone, page, start_migratetype);
 
 		/* Remove the page from the freelists */
@@ -2502,7 +2484,7 @@ void free_hot_cold_page(struct page *page, bool cold)
 	/*
 	 * We only track unmovable, reclaimable and movable on pcp lists.
 	 * Free ISOLATE pages back to the allocator because they are being
-	 * offlined but treat RESERVE as movable pages so we can get those
+	 * offlined but treat HIGHATOMIC as movable pages so we can get those
 	 * areas back if necessary. Otherwise, we may have to free
 	 * excessively into the page allocator
 	 */
@@ -2612,7 +2594,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
 		for (; page < endpage; page += pageblock_nr_pages) {
 			int mt = get_pageblock_migratetype(page);
 			if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
-				&& mt != MIGRATE_HIGHATOMIC)
+			    && !is_migrate_highatomic(mt))
 				set_pageblock_migratetype(page,
 							  MIGRATE_MOVABLE);
 		}
@@ -3110,8 +3092,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 	static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
 
-	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
-	    debug_guardpage_minorder() > 0)
+	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
 		return;
 
 	pr_warn("%s: ", current->comm);
@@ -3521,20 +3502,13 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 	return false;
 }
 
-/*
- * Maximum number of reclaim retries without any progress before OOM killer
- * is consider as the only way to move forward.
- */
-#define MAX_RECLAIM_RETRIES 16
-
 /*
  * Checks whether it makes sense to retry the reclaim to make a forward progress
  * for the given allocation request.
- * The reclaim feedback represented by did_some_progress (any progress during
- * the last reclaim round) and no_progress_loops (number of reclaim rounds without
- * any progress in a row) is considered as well as the reclaimable pages on the
- * applicable zone list (with a backoff mechanism which is a function of
- * no_progress_loops).
+ *
+ * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
+ * without success, or when we couldn't even meet the watermark if we
+ * reclaimed all remaining pages on the LRU lists.
  *
  * Returns true if a retry is viable or false to enter the oom path.
  */
@@ -3579,13 +3553,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 		bool wmark;
 
 		available = reclaimable = zone_reclaimable_pages(zone);
-		available -= DIV_ROUND_UP((*no_progress_loops) * available,
-					  MAX_RECLAIM_RETRIES);
 		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
 
 		/*
-		 * Would the allocation succeed if we reclaimed the whole
-		 * available?
+		 * Would the allocation succeed if we reclaimed all
+		 * reclaimable pages?
 		 */
 		wmark = __zone_watermark_ok(zone, order, min_wmark,
 				ac_classzone_idx(ac), alloc_flags, available);
@@ -3771,7 +3743,7 @@ retry:
 
 	/* Make sure we know about allocations which stall for too long */
 	if (time_after(jiffies, alloc_start + stall_timeout)) {
-		warn_alloc(gfp_mask, ac->nodemask,
+		warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask,
 			"page allocation stalls for %ums, order:%u",
 			jiffies_to_msecs(jiffies-alloc_start), order);
 		stall_timeout += 10 * HZ;
@@ -3971,10 +3943,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 		goto out;
 
 	/*
-	 * Runtime PM, block IO and its error handling path can deadlock
-	 * because I/O on the device might not complete.
+	 * Apply scoped allocation constraints. This is mainly about GFP_NOFS
+	 * resp. GFP_NOIO which has to be inherited for all allocation requests
+	 * from a particular context which has been marked by
+	 * memalloc_no{fs,io}_{save,restore}.
 	 */
-	alloc_mask = memalloc_noio_flags(gfp_mask);
+	alloc_mask = current_gfp_context(gfp_mask);
 	ac.spread_dirty_pages = false;
 
 	/*
@@ -4510,7 +4484,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 #endif
 			" writeback_tmp:%lukB"
 			" unstable:%lukB"
-			" pages_scanned:%lu"
 			" all_unreclaimable? %s"
 			"\n",
 			pgdat->node_id,
@@ -4533,8 +4506,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 #endif
 			K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
 			K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
-			node_page_state(pgdat, NR_PAGES_SCANNED),
-			!pgdat_reclaimable(pgdat) ? "yes" : "no");
+			pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
+				"yes" : "no");
 	}
 
 	for_each_populated_zone(zone) {
@@ -7429,7 +7402,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 		.zone = page_zone(pfn_to_page(start)),
 		.mode = MIGRATE_SYNC,
 		.ignore_skip_hint = true,
-		.gfp_mask = memalloc_noio_flags(gfp_mask),
+		.gfp_mask = current_gfp_context(gfp_mask),
 	};
 	INIT_LIST_HEAD(&cc.migratepages);
 
@@ -59,9 +59,6 @@
 
 static struct page_ext_operations *page_ext_ops[] = {
 	&debug_guardpage_ops,
-#ifdef CONFIG_PAGE_POISONING
-	&page_poisoning_ops,
-#endif
 #ifdef CONFIG_PAGE_OWNER
 	&page_owner_ops,
 #endif
@@ -127,15 +124,12 @@ struct page_ext *lookup_page_ext(struct page *page)
 	struct page_ext *base;
 
 	base = NODE_DATA(page_to_nid(page))->node_page_ext;
-#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
+#if defined(CONFIG_DEBUG_VM)
 	/*
 	 * The sanity checks the page allocator does upon freeing a
 	 * page can reach here before the page_ext arrays are
 	 * allocated when feeding a range of pages to the allocator
 	 * for the first time during bootup or memory hotplug.
-	 *
-	 * This check is also necessary for ensuring page poisoning
-	 * works as expected when enabled
 	 */
 	if (unlikely(!base))
 		return NULL;
@@ -204,15 +198,12 @@ struct page_ext *lookup_page_ext(struct page *page)
 {
 	unsigned long pfn = page_to_pfn(page);
 	struct mem_section *section = __pfn_to_section(pfn);
-#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
+#if defined(CONFIG_DEBUG_VM)
 	/*
 	 * The sanity checks the page allocator does upon freeing a
 	 * page can reach here before the page_ext arrays are
 	 * allocated when feeding a range of pages to the allocator
 	 * for the first time during bootup or memory hotplug.
-	 *
-	 * This check is also necessary for ensuring page poisoning
-	 * works as expected when enabled
 	 */
 	if (!section->page_ext)
 		return NULL;
@@ -50,7 +50,7 @@ static struct page *page_idle_get_page(unsigned long pfn)
 	return page;
 }
 
-static int page_idle_clear_pte_refs_one(struct page *page,
+static bool page_idle_clear_pte_refs_one(struct page *page,
 					struct vm_area_struct *vma,
 					unsigned long addr, void *arg)
 {
@@ -84,7 +84,7 @@ static int page_idle_clear_pte_refs_one(struct page *page,
 		 */
 		set_page_young(page);
 	}
-	return SWAP_AGAIN;
+	return true;
 }
 
 static void page_idle_clear_pte_refs(struct page *page)
@@ -88,7 +88,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 
 	zone = page_zone(page);
 	spin_lock_irqsave(&zone->lock, flags);
-	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+	if (!is_migrate_isolate_page(page))
 		goto out;
 
 	/*
@@ -205,7 +205,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 			pfn < end_pfn;
 			pfn += pageblock_nr_pages) {
 		page = __first_valid_page(pfn, pageblock_nr_pages);
-		if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+		if (!page || !is_migrate_isolate_page(page))
 			continue;
 		unset_migratetype_isolate(page, migratetype);
 	}
@@ -262,7 +262,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
 	 */
 	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
 		page = __first_valid_page(pfn, pageblock_nr_pages);
-		if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+		if (page && !is_migrate_isolate_page(page))
 			break;
 	}
 	page = __first_valid_page(start_pfn, end_pfn - start_pfn);
@@ -6,7 +6,6 @@
 #include <linux/poison.h>
 #include <linux/ratelimit.h>
 
-static bool __page_poisoning_enabled __read_mostly;
 static bool want_page_poisoning __read_mostly;
 
 static int early_page_poison_param(char *buf)
@@ -18,75 +17,22 @@ static int early_page_poison_param(char *buf)
 early_param("page_poison", early_page_poison_param);
 
 bool page_poisoning_enabled(void)
-{
-	return __page_poisoning_enabled;
-}
-
-static bool need_page_poisoning(void)
-{
-	return want_page_poisoning;
-}
-
-static void init_page_poisoning(void)
 {
 	/*
-	 * page poisoning is debug page alloc for some arches. If either
-	 * of those options are enabled, enable poisoning
+	 * Assumes that debug_pagealloc_enabled is set before
+	 * free_all_bootmem.
+	 * Page poisoning is debug page alloc for some arches. If
+	 * either of those options are enabled, enable poisoning.
 	 */
-	if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) {
-		if (!want_page_poisoning && !debug_pagealloc_enabled())
-			return;
-	} else {
-		if (!want_page_poisoning)
-			return;
-	}
-
-	__page_poisoning_enabled = true;
-}
-
-struct page_ext_operations page_poisoning_ops = {
-	.need = need_page_poisoning,
-	.init = init_page_poisoning,
-};
-
-static inline void set_page_poison(struct page *page)
-{
-	struct page_ext *page_ext;
-
-	page_ext = lookup_page_ext(page);
-	if (unlikely(!page_ext))
-		return;
-
-	__set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
-}
-
-static inline void clear_page_poison(struct page *page)
-{
-	struct page_ext *page_ext;
-
-	page_ext = lookup_page_ext(page);
-	if (unlikely(!page_ext))
-		return;
-
-	__clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
-}
-
-bool page_is_poisoned(struct page *page)
-{
-	struct page_ext *page_ext;
-
-	page_ext = lookup_page_ext(page);
-	if (unlikely(!page_ext))
-		return false;
-
-	return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
+	return (want_page_poisoning ||
+		(!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
+		debug_pagealloc_enabled()));
 }
 
 static void poison_page(struct page *page)
 {
 	void *addr = kmap_atomic(page);
 
-	set_page_poison(page);
 	memset(addr, PAGE_POISON, PAGE_SIZE);
 	kunmap_atomic(addr);
 }
@ -140,12 +86,13 @@ static void unpoison_page(struct page *page)
|
||||||
{
|
{
|
||||||
void *addr;
|
void *addr;
|
||||||
|
|
||||||
if (!page_is_poisoned(page))
|
|
||||||
return;
|
|
||||||
|
|
||||||
addr = kmap_atomic(page);
|
addr = kmap_atomic(page);
|
||||||
|
/*
|
||||||
|
* Page poisoning when enabled poisons each and every page
|
||||||
|
* that is freed to buddy. Thus no extra check is done to
|
||||||
|
* see if a page was posioned.
|
||||||
|
*/
|
||||||
check_poison_mem(addr, PAGE_SIZE);
|
check_poison_mem(addr, PAGE_SIZE);
|
||||||
clear_page_poison(page);
|
|
||||||
kunmap_atomic(addr);
|
kunmap_atomic(addr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
148	mm/rmap.c

@@ -724,7 +724,7 @@ struct page_referenced_arg {
 /*
 * arg: page_referenced_arg will be passed
 */
-static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
+static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
 unsigned long address, void *arg)
 {
 struct page_referenced_arg *pra = arg;
@@ -741,7 +741,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
 if (vma->vm_flags & VM_LOCKED) {
 page_vma_mapped_walk_done(&pvmw);
 pra->vm_flags |= VM_LOCKED;
-return SWAP_FAIL; /* To break the loop */
+return false; /* To break the loop */
 }
 
 if (pvmw.pte) {
@@ -781,9 +781,9 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
 }
 
 if (!pra->mapcount)
-return SWAP_SUCCESS; /* To break the loop */
+return false; /* To break the loop */
 
-return SWAP_AGAIN;
+return true;
 }
 
 static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
@@ -812,7 +812,6 @@ int page_referenced(struct page *page,
 struct mem_cgroup *memcg,
 unsigned long *vm_flags)
 {
-int ret;
 int we_locked = 0;
 struct page_referenced_arg pra = {
 .mapcount = total_mapcount(page),
@@ -846,7 +845,7 @@ int page_referenced(struct page *page,
 rwc.invalid_vma = invalid_page_referenced_vma;
 }
 
-ret = rmap_walk(page, &rwc);
+rmap_walk(page, &rwc);
 *vm_flags = pra.vm_flags;
 
 if (we_locked)
@@ -855,7 +854,7 @@ int page_referenced(struct page *page,
 return pra.referenced;
 }
 
-static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
+static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
 unsigned long address, void *arg)
 {
 struct page_vma_mapped_walk pvmw = {
@@ -908,7 +907,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
 }
 }
 
-return SWAP_AGAIN;
+return true;
 }
 
 static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
@@ -1159,7 +1158,7 @@ void page_add_file_rmap(struct page *page, bool compound)
 goto out;
 }
 __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr);
-mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, nr);
+mod_memcg_page_state(page, NR_FILE_MAPPED, nr);
 out:
 unlock_page_memcg(page);
 }
@@ -1199,7 +1198,7 @@ static void page_remove_file_rmap(struct page *page, bool compound)
 * pte lock(a spinlock) is held, which implies preemption disabled.
 */
 __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr);
-mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, -nr);
+mod_memcg_page_state(page, NR_FILE_MAPPED, -nr);
 
 if (unlikely(PageMlocked(page)))
 clear_page_mlock(page);
@@ -1288,15 +1287,10 @@ void page_remove_rmap(struct page *page, bool compound)
 */
 }
 
-struct rmap_private {
-enum ttu_flags flags;
-int lazyfreed;
-};
-
 /*
 * @arg: enum ttu_flags will be passed to this argument
 */
-static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 unsigned long address, void *arg)
 {
 struct mm_struct *mm = vma->vm_mm;
@@ -1307,13 +1301,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 };
 pte_t pteval;
 struct page *subpage;
-int ret = SWAP_AGAIN;
-struct rmap_private *rp = arg;
-enum ttu_flags flags = rp->flags;
+bool ret = true;
+enum ttu_flags flags = (enum ttu_flags)arg;
 
 /* munlock has nothing to gain from examining un-locked vmas */
 if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
-return SWAP_AGAIN;
+return true;
 
 if (flags & TTU_SPLIT_HUGE_PMD) {
 split_huge_pmd_address(vma, address,
@@ -1336,7 +1329,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 */
 mlock_vma_page(page);
 }
-ret = SWAP_MLOCK;
+ret = false;
 page_vma_mapped_walk_done(&pvmw);
 break;
 }
@@ -1354,7 +1347,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 if (!(flags & TTU_IGNORE_ACCESS)) {
 if (ptep_clear_flush_young_notify(vma, address,
 pvmw.pte)) {
-ret = SWAP_FAIL;
+ret = false;
 page_vma_mapped_walk_done(&pvmw);
 break;
 }
@@ -1424,18 +1417,34 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 * Store the swap location in the pte.
 * See handle_pte_fault() ...
 */
-VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
+WARN_ON_ONCE(1);
+ret = false;
+page_vma_mapped_walk_done(&pvmw);
+break;
+}
 
-if (!PageDirty(page) && (flags & TTU_LZFREE)) {
-/* It's a freeable page by MADV_FREE */
-dec_mm_counter(mm, MM_ANONPAGES);
-rp->lazyfreed++;
+/* MADV_FREE page check */
+if (!PageSwapBacked(page)) {
+if (!PageDirty(page)) {
+dec_mm_counter(mm, MM_ANONPAGES);
 goto discard;
+}
+
+/*
+* If the page was redirtied, it cannot be
+* discarded. Remap the page to page table.
+*/
+set_pte_at(mm, address, pvmw.pte, pteval);
+SetPageSwapBacked(page);
+ret = false;
+page_vma_mapped_walk_done(&pvmw);
+break;
 }
 
 if (swap_duplicate(entry) < 0) {
 set_pte_at(mm, address, pvmw.pte, pteval);
-ret = SWAP_FAIL;
+ret = false;
 page_vma_mapped_walk_done(&pvmw);
 break;
 }
@@ -1492,24 +1501,14 @@ static int page_mapcount_is_zero(struct page *page)
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path. Caller must hold the page lock.
-* Return values are:
 *
-* SWAP_SUCCESS - we succeeded in removing all mappings
-* SWAP_AGAIN - we missed a mapping, try again later
-* SWAP_FAIL - the page is unswappable
-* SWAP_MLOCK - page is mlocked.
+* If unmap is successful, return true. Otherwise, false.
 */
-int try_to_unmap(struct page *page, enum ttu_flags flags)
+bool try_to_unmap(struct page *page, enum ttu_flags flags)
 {
-int ret;
-struct rmap_private rp = {
-.flags = flags,
-.lazyfreed = 0,
-};
-
 struct rmap_walk_control rwc = {
 .rmap_one = try_to_unmap_one,
-.arg = &rp,
+.arg = (void *)flags,
 .done = page_mapcount_is_zero,
 .anon_lock = page_lock_anon_vma_read,
 };
@@ -1526,16 +1525,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
 rwc.invalid_vma = invalid_migration_vma;
 
 if (flags & TTU_RMAP_LOCKED)
-ret = rmap_walk_locked(page, &rwc);
+rmap_walk_locked(page, &rwc);
 else
-ret = rmap_walk(page, &rwc);
+rmap_walk(page, &rwc);
 
-if (ret != SWAP_MLOCK && !page_mapcount(page)) {
-ret = SWAP_SUCCESS;
-if (rp.lazyfreed && !PageDirty(page))
-ret = SWAP_LZFREE;
-}
-return ret;
+return !page_mapcount(page) ? true : false;
 }
 
 static int page_not_mapped(struct page *page)
@@ -1550,34 +1544,22 @@ static int page_not_mapped(struct page *page)
 * Called from munlock code. Checks all of the VMAs mapping the page
 * to make sure nobody else has this page mlocked. The page will be
 * returned with PG_mlocked cleared if no other vmas have it mlocked.
-*
-* Return values are:
-*
-* SWAP_AGAIN - no vma is holding page mlocked, or,
-* SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
-* SWAP_FAIL - page cannot be located at present
-* SWAP_MLOCK - page is now mlocked.
 */
-int try_to_munlock(struct page *page)
-{
-int ret;
-struct rmap_private rp = {
-.flags = TTU_MUNLOCK,
-.lazyfreed = 0,
-};
-
+void try_to_munlock(struct page *page)
+{
 struct rmap_walk_control rwc = {
 .rmap_one = try_to_unmap_one,
-.arg = &rp,
+.arg = (void *)TTU_MUNLOCK,
 .done = page_not_mapped,
 .anon_lock = page_lock_anon_vma_read,
 
 };
 
 VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
+VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
 
-ret = rmap_walk(page, &rwc);
-return ret;
+rmap_walk(page, &rwc);
 }
 
 void __put_anon_vma(struct anon_vma *anon_vma)
@@ -1625,13 +1607,12 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
 * LOCKED.
 */
-static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
+static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
 bool locked)
 {
 struct anon_vma *anon_vma;
 pgoff_t pgoff_start, pgoff_end;
 struct anon_vma_chain *avc;
-int ret = SWAP_AGAIN;
 
 if (locked) {
 anon_vma = page_anon_vma(page);
@@ -1641,7 +1622,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
 anon_vma = rmap_walk_anon_lock(page, rwc);
 }
 if (!anon_vma)
-return ret;
+return;
 
 pgoff_start = page_to_pgoff(page);
 pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
@@ -1655,8 +1636,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
 continue;
 
-ret = rwc->rmap_one(page, vma, address, rwc->arg);
-if (ret != SWAP_AGAIN)
+if (!rwc->rmap_one(page, vma, address, rwc->arg))
 break;
 if (rwc->done && rwc->done(page))
 break;
@@ -1664,7 +1644,6 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
 
 if (!locked)
 anon_vma_unlock_read(anon_vma);
-return ret;
 }
 
 /*
@@ -1680,13 +1659,12 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
 * LOCKED.
 */
-static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
+static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
 bool locked)
 {
 struct address_space *mapping = page_mapping(page);
 pgoff_t pgoff_start, pgoff_end;
 struct vm_area_struct *vma;
-int ret = SWAP_AGAIN;
 
 /*
 * The page lock not only makes sure that page->mapping cannot
@@ -1697,7 +1675,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
 VM_BUG_ON_PAGE(!PageLocked(page), page);
 
 if (!mapping)
-return ret;
+return;
 
 pgoff_start = page_to_pgoff(page);
 pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
@@ -1712,8 +1690,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
 continue;
 
-ret = rwc->rmap_one(page, vma, address, rwc->arg);
-if (ret != SWAP_AGAIN)
+if (!rwc->rmap_one(page, vma, address, rwc->arg))
 goto done;
 if (rwc->done && rwc->done(page))
 goto done;
@@ -1722,28 +1699,27 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
 done:
 if (!locked)
 i_mmap_unlock_read(mapping);
-return ret;
 }
 
-int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
+void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
 {
 if (unlikely(PageKsm(page)))
-return rmap_walk_ksm(page, rwc);
+rmap_walk_ksm(page, rwc);
 else if (PageAnon(page))
-return rmap_walk_anon(page, rwc, false);
+rmap_walk_anon(page, rwc, false);
 else
-return rmap_walk_file(page, rwc, false);
+rmap_walk_file(page, rwc, false);
 }
 
 /* Like rmap_walk, but caller holds relevant rmap lock */
-int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
+void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
 {
 /* no ksm support for now */
 VM_BUG_ON_PAGE(PageKsm(page), page);
 if (PageAnon(page))
-return rmap_walk_anon(page, rwc, true);
+rmap_walk_anon(page, rwc, true);
 else
-return rmap_walk_file(page, rwc, true);
+rmap_walk_file(page, rwc, true);
 }
 
 #ifdef CONFIG_HUGETLB_PAGE
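The rmap changes above switch the rmap_one callbacks and the walk entry points from the old SWAP_* return codes to plain bool ("keep walking" vs "stop"), and rmap_walk()/rmap_walk_locked() no longer return anything. A minimal caller sketch under that new convention follows; it is illustrative only, and the callback and counter names are made up rather than taken from this commit.

/* Illustrative walker using the new bool convention: rmap_one returns
 * false to stop the walk early, and rmap_walk() itself returns nothing. */
static bool count_one_mapping(struct page *page, struct vm_area_struct *vma,
                              unsigned long address, void *arg)
{
    int *nr_mappings = arg;

    (*nr_mappings)++;
    return true;            /* true == keep walking the remaining VMAs */
}

static int count_mappings(struct page *page)
{
    int nr_mappings = 0;
    struct rmap_walk_control rwc = {
        .rmap_one = count_one_mapping,
        .arg = &nr_mappings,
    };

    rmap_walk(page, &rwc);  /* no SWAP_* result to inspect anymore */
    return nr_mappings;
}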
@@ -9,11 +9,12 @@
 * as published by the Free Software Foundation; version 2
 * of the License.
 */
+#define pr_fmt(fmt) "rodata_test: " fmt
+
 #include <linux/uaccess.h>
 #include <asm/sections.h>
 
 const int rodata_test_data = 0xC3;
-EXPORT_SYMBOL_GPL(rodata_test_data);
 
 void rodata_test(void)
 {
@@ -23,20 +24,20 @@ void rodata_test(void)
 /* test 1: read the value */
 /* If this test fails, some previous testrun has clobbered the state */
 if (!rodata_test_data) {
-pr_err("rodata_test: test 1 fails (start data)\n");
+pr_err("test 1 fails (start data)\n");
 return;
 }
 
 /* test 2: write to the variable; this should fault */
 if (!probe_kernel_write((void *)&rodata_test_data,
 (void *)&zero, sizeof(zero))) {
-pr_err("rodata_test: test data was not read only\n");
+pr_err("test data was not read only\n");
 return;
 }
 
 /* test 3: check the value hasn't changed */
 if (rodata_test_data == zero) {
-pr_err("rodata_test: test data was changed\n");
+pr_err("test data was changed\n");
 return;
 }
 
@@ -44,13 +45,13 @@ void rodata_test(void)
 start = (unsigned long)__start_rodata;
 end = (unsigned long)__end_rodata;
 if (start & (PAGE_SIZE - 1)) {
-pr_err("rodata_test: start of .rodata is not page size aligned\n");
+pr_err("start of .rodata is not page size aligned\n");
 return;
 }
 if (end & (PAGE_SIZE - 1)) {
-pr_err("rodata_test: end of .rodata is not page size aligned\n");
+pr_err("end of .rodata is not page size aligned\n");
 return;
 }
 
-pr_info("rodata_test: all tests were successful\n");
+pr_info("all tests were successful\n");
 }
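The rodata_test.c hunk relies on the standard pr_fmt() convention: when pr_fmt is defined before the printk helpers are pulled in, every pr_err()/pr_info() in that file is prefixed automatically, which is why the hand-written "rodata_test: " literals can be dropped. A minimal sketch of the mechanism (assuming the usual <linux/printk.h> behaviour; this is not code from the commit):

#define pr_fmt(fmt) "rodata_test: " fmt    /* must come before printk.h */
#include <linux/printk.h>

static void pr_fmt_demo(void)
{
    /* Emits: "rodata_test: test data was changed" */
    pr_err("test data was changed\n");
}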
@@ -3879,7 +3879,12 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
 
 prev = cachep->cpu_cache;
 cachep->cpu_cache = cpu_cache;
-kick_all_cpus_sync();
+/*
+* Without a previous cpu_cache there's no need to synchronize remote
+* cpus, so skip the IPIs.
+*/
+if (prev)
+kick_all_cpus_sync();
 
 check_irq_on();
 cachep->batchcount = batchcount;
@@ -248,10 +248,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms,
 
 unsigned long usemap_size(void)
 {
-unsigned long size_bytes;
-size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8;
-size_bytes = roundup(size_bytes, sizeof(unsigned long));
-return size_bytes;
+return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
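The sparse.c hunk is a pure refactor: rounding a bit count up to bytes and then up to a multiple of sizeof(unsigned long) is the same as BITS_TO_LONGS(n) * sizeof(unsigned long). A quick userspace check of that equivalence (a sketch; the kernel macros are re-declared locally here and are not pulled from kernel headers):

#include <assert.h>
#include <stdio.h>

#define BITS_PER_LONG        (8 * sizeof(unsigned long))
#define DIV_ROUND_UP(n, d)   (((n) + (d) - 1) / (d))
#define BITS_TO_LONGS(n)     DIV_ROUND_UP(n, BITS_PER_LONG)
#define local_roundup(x, y)  ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
    for (unsigned long n = 1; n <= 4096; n++) {
        /* old usemap_size(): bits -> bytes, then round up to a long */
        unsigned long old_sz = local_roundup(local_roundup(n, 8UL) / 8,
                                             sizeof(unsigned long));
        /* new usemap_size(): whole longs needed for n bits */
        unsigned long new_sz = BITS_TO_LONGS(n) * sizeof(unsigned long);

        assert(old_sz == new_sz);
    }
    printf("old and new usemap_size() formulas agree\n");
    return 0;
}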
49	mm/swap.c

@@ -46,7 +46,7 @@ int page_cluster;
 static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
-static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
 #endif
@@ -571,20 +571,27 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
 }
 
 
-static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
 void *arg)
 {
-if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
-int file = page_is_file_cache(page);
-int lru = page_lru_base_type(page);
+if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
+!PageUnevictable(page)) {
+bool active = PageActive(page);
 
-del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+del_page_from_lru_list(page, lruvec,
+LRU_INACTIVE_ANON + active);
 ClearPageActive(page);
 ClearPageReferenced(page);
-add_page_to_lru_list(page, lruvec, lru);
+/*
+* lazyfree pages are clean anonymous pages. They have
+* SwapBacked flag cleared to distinguish normal anonymous
+* pages
+*/
+ClearPageSwapBacked(page);
+add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
 
-__count_vm_event(PGDEACTIVATE);
-update_page_reclaim_stat(lruvec, file, 0);
+__count_vm_events(PGLAZYFREE, hpage_nr_pages(page));
+update_page_reclaim_stat(lruvec, 1, 0);
 }
 }
 
@@ -614,9 +621,9 @@ void lru_add_drain_cpu(int cpu)
 if (pagevec_count(pvec))
 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
 
-pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
 if (pagevec_count(pvec))
-pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
 
 activate_page_drain(cpu);
 }
@@ -648,22 +655,22 @@ void deactivate_file_page(struct page *page)
 }
 
 /**
-* deactivate_page - deactivate a page
+* mark_page_lazyfree - make an anon page lazyfree
 * @page: page to deactivate
 *
-* deactivate_page() moves @page to the inactive list if @page was on the active
-* list and was not an unevictable page. This is done to accelerate the reclaim
-* of @page.
+* mark_page_lazyfree() moves @page to the inactive file list.
+* This is done to accelerate the reclaim of @page.
 */
-void deactivate_page(struct page *page)
+void mark_page_lazyfree(struct page *page)
 {
-if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
-struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
+!PageUnevictable(page)) {
+struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
 
 get_page(page);
 if (!pagevec_add(pvec, page) || PageCompound(page))
-pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
-put_cpu_var(lru_deactivate_pvecs);
+pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
+put_cpu_var(lru_lazyfree_pvecs);
 }
 }
 
@@ -703,7 +710,7 @@ void lru_add_drain_all(void)
 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
 pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
-pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
 need_activate_page_drain(cpu)) {
 INIT_WORK(work, lru_add_drain_per_cpu);
 queue_work_on(cpu, mm_percpu_wq, work);
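The swap.c (and rmap/vmscan) changes above are the kernel side of MADV_FREE lazy freeing; from userspace the path is driven with madvise(). A minimal illustration follows, assuming Linux 4.5+ headers that define MADV_FREE; it is not part of this commit.

#include <string.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 64 << 20;
    char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED)
        return 1;

    memset(buf, 0xa5, len);             /* dirty the anonymous pages */

    /* Done with the data: let reclaim drop the clean pages lazily. */
    if (madvise(buf, len, MADV_FREE) != 0)
        return 1;                       /* e.g. kernel/libc without MADV_FREE */

    /* A later write cancels the lazy free for the touched page. */
    buf[0] = 1;
    munmap(buf, len);
    return 0;
}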
@@ -241,8 +241,10 @@ int enable_swap_slots_cache(void)
 
 ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
 alloc_swap_slot_cache, free_slot_cache);
-if (ret < 0)
+if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
+"without swap slots cache.\n", __func__))
 goto out_unlock;
 
 swap_slot_cache_initialized = true;
 __reenable_swap_slots_cache();
 out_unlock:
@@ -360,17 +360,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 /*
 * We might race against get_swap_page() and stumble
 * across a SWAP_HAS_CACHE swap_map entry whose page
-* has not been brought into the swapcache yet, while
-* the other end is scheduled away waiting on discard
-* I/O completion at scan_swap_map().
-*
-* In order to avoid turning this transitory state
-* into a permanent loop around this -EEXIST case
-* if !CONFIG_PREEMPT and the I/O completion happens
-* to be waiting on the CPU waitqueue where we are now
-* busy looping, we just conditionally invoke the
-* scheduler here, if there are some more important
-* tasks to run.
+* has not been brought into the swapcache yet.
 */
 cond_resched();
 continue;
@@ -335,7 +335,7 @@ static void cluster_list_add_tail(struct swap_cluster_list *list,
 ci_tail = ci + tail;
 spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
 cluster_set_next(ci_tail, idx);
-unlock_cluster(ci_tail);
+spin_unlock(&ci_tail->lock);
 cluster_set_next_flag(&list->tail, idx, 0);
 }
 }
@@ -672,6 +672,9 @@ checks:
 else
 goto done;
 }
+si->swap_map[offset] = usage;
+inc_cluster_info_page(si, si->cluster_info, offset);
+unlock_cluster(ci);
 
 if (offset == si->lowest_bit)
 si->lowest_bit++;
@@ -685,9 +688,6 @@ checks:
 plist_del(&si->avail_list, &swap_avail_head);
 spin_unlock(&swap_avail_lock);
 }
-si->swap_map[offset] = usage;
-inc_cluster_info_page(si, si->cluster_info, offset);
-unlock_cluster(ci);
 si->cluster_next = offset + 1;
 slots[n_ret++] = swp_entry(si->type, offset);
 
@@ -1079,8 +1079,6 @@ void swapcache_free_entries(swp_entry_t *entries, int n)
 p = swap_info_get_cont(entries[i], prev);
 if (p)
 swap_entry_free(p, entries[i]);
-else
-break;
 prev = p;
 }
 if (p)
@@ -1111,6 +1109,18 @@ int page_swapcount(struct page *page)
 return count;
 }
 
+static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
+{
+int count = 0;
+pgoff_t offset = swp_offset(entry);
+struct swap_cluster_info *ci;
+
+ci = lock_cluster_or_swap_info(si, offset);
+count = swap_count(si->swap_map[offset]);
+unlock_cluster_or_swap_info(si, ci);
+return count;
+}
+
 /*
 * How many references to @entry are currently swapped out?
 * This does not give an exact answer when swap count is continued,
@@ -1119,17 +1129,11 @@ int page_swapcount(struct page *page)
 int __swp_swapcount(swp_entry_t entry)
 {
 int count = 0;
-pgoff_t offset;
 struct swap_info_struct *si;
-struct swap_cluster_info *ci;
 
 si = __swap_info_get(entry);
-if (si) {
-offset = swp_offset(entry);
-ci = lock_cluster_or_swap_info(si, offset);
-count = swap_count(si->swap_map[offset]);
-unlock_cluster_or_swap_info(si, ci);
-}
+if (si)
+count = swap_swapcount(si, entry);
 return count;
 }
 
@@ -1291,7 +1295,8 @@ int free_swap_and_cache(swp_entry_t entry)
 * Also recheck PageSwapCache now page is locked (above).
 */
 if (PageSwapCache(page) && !PageWriteback(page) &&
-(!page_mapped(page) || mem_cgroup_swap_full(page))) {
+(!page_mapped(page) || mem_cgroup_swap_full(page)) &&
+!swap_swapcount(p, entry)) {
 delete_from_swap_cache(page);
 SetPageDirty(page);
 }
@@ -266,9 +266,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
 pgoff_t index;
 int i;
 
-cleancache_invalidate_inode(mapping);
 if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
-return;
+goto out;
 
 /* Offsets within partial pages */
 partial_start = lstart & (PAGE_SIZE - 1);
@@ -363,7 +362,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 * will be released, just zeroed, so we can bail out now.
 */
 if (start >= end)
-return;
+goto out;
 
 index = start;
 for ( ; ; ) {
@@ -410,6 +409,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
 pagevec_release(&pvec);
 index++;
 }
+
+out:
 cleancache_invalidate_inode(mapping);
 }
 EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -623,7 +624,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 int ret2 = 0;
 int did_range_unmap = 0;
 
-cleancache_invalidate_inode(mapping);
+if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+goto out;
+
 pagevec_init(&pvec, 0);
 index = start;
 while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
@@ -686,6 +689,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 cond_resched();
 index++;
 }
+
+out:
 cleancache_invalidate_inode(mapping);
 return ret;
 }
508	mm/vmscan.c

@@ -97,8 +97,13 @@ struct scan_control {
 /* Can pages be swapped as part of reclaim? */
 unsigned int may_swap:1;
 
-/* Can cgroups be reclaimed below their normal consumption range? */
-unsigned int may_thrash:1;
+/*
+* Cgroups are not reclaimed below their configured memory.low,
+* unless we threaten to OOM. If any cgroups are skipped due to
+* memory.low and nothing was reclaimed, go back for memory.low.
+*/
+unsigned int memcg_low_reclaim:1;
+unsigned int memcg_low_skipped:1;
 
 unsigned int hibernation_mode:1;
 
@@ -230,12 +235,6 @@ unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
 return nr;
 }
 
-bool pgdat_reclaimable(struct pglist_data *pgdat)
-{
-return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) <
-pgdat_reclaimable_pages(pgdat) * 6;
-}
-
 /**
 * lruvec_lru_size - Returns the number of pages on the given LRU list.
 * @lruvec: lru vector
@@ -912,7 +911,8 @@ static void page_check_dirty_writeback(struct page *page,
 * Anonymous pages are not handled by flushers and must be written
 * from reclaim context. Do not stall reclaim based on them
 */
-if (!page_is_file_cache(page)) {
+if (!page_is_file_cache(page) ||
+(PageAnon(page) && !PageSwapBacked(page))) {
 *dirty = false;
 *writeback = false;
 return;
@@ -972,8 +972,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 int may_enter_fs;
 enum page_references references = PAGEREF_RECLAIM_CLEAN;
 bool dirty, writeback;
-bool lazyfree = false;
-int ret = SWAP_SUCCESS;
 
 cond_resched();
 
@@ -988,13 +986,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 sc->nr_scanned++;
 
 if (unlikely(!page_evictable(page)))
-goto cull_mlocked;
+goto activate_locked;
 
 if (!sc->may_unmap && page_mapped(page))
 goto keep_locked;
 
 /* Double the slab pressure for mapped and swapcache pages */
-if (page_mapped(page) || PageSwapCache(page))
+if ((page_mapped(page) || PageSwapCache(page)) &&
+!(PageAnon(page) && !PageSwapBacked(page)))
 sc->nr_scanned++;
 
 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
@@ -1120,13 +1119,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 /*
 * Anonymous process memory has backing store?
 * Try to allocate it some swap space here.
+* Lazyfree page could be freed directly
 */
-if (PageAnon(page) && !PageSwapCache(page)) {
+if (PageAnon(page) && PageSwapBacked(page) &&
+!PageSwapCache(page)) {
 if (!(sc->gfp_mask & __GFP_IO))
 goto keep_locked;
 if (!add_to_swap(page, page_list))
 goto activate_locked;
-lazyfree = true;
 may_enter_fs = 1;
 
 /* Adding to swap updated mapping */
@@ -1143,21 +1143,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 * The page is mapped into the page tables of one or more
 * processes. Try to unmap it here.
 */
-if (page_mapped(page) && mapping) {
-switch (ret = try_to_unmap(page, lazyfree ?
-(ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
-(ttu_flags | TTU_BATCH_FLUSH))) {
-case SWAP_FAIL:
+if (page_mapped(page)) {
+if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) {
 nr_unmap_fail++;
 goto activate_locked;
-case SWAP_AGAIN:
-goto keep_locked;
-case SWAP_MLOCK:
-goto cull_mlocked;
-case SWAP_LZFREE:
-goto lazyfree;
-case SWAP_SUCCESS:
-; /* try to free the page below */
 }
 }
 
@@ -1267,10 +1256,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 }
 }
 
-lazyfree:
-if (!mapping || !__remove_mapping(mapping, page, true))
-goto keep_locked;
+if (PageAnon(page) && !PageSwapBacked(page)) {
+/* follow __remove_mapping for reference */
+if (!page_ref_freeze(page, 1))
+goto keep_locked;
+if (PageDirty(page)) {
+page_ref_unfreeze(page, 1);
+goto keep_locked;
+}
+
+count_vm_event(PGLAZYFREED);
+} else if (!mapping || !__remove_mapping(mapping, page, true))
+goto keep_locked;
 /*
 * At this point, we have no other references and there is
 * no way to pick any more up (removed from LRU, removed
@@ -1280,9 +1277,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 */
 __ClearPageLocked(page);
 free_it:
-if (ret == SWAP_LZFREE)
-count_vm_event(PGLAZYFREED);
-
 nr_reclaimed++;
 
 /*
@@ -1292,20 +1286,16 @@ free_it:
 list_add(&page->lru, &free_pages);
 continue;
 
-cull_mlocked:
-if (PageSwapCache(page))
-try_to_free_swap(page);
-unlock_page(page);
-list_add(&page->lru, &ret_pages);
-continue;
-
 activate_locked:
 /* Not a candidate for swapping, so reclaim swap space. */
-if (PageSwapCache(page) && mem_cgroup_swap_full(page))
+if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
+PageMlocked(page)))
 try_to_free_swap(page);
 VM_BUG_ON_PAGE(PageActive(page), page);
-SetPageActive(page);
-pgactivate++;
+if (!PageMlocked(page)) {
+SetPageActive(page);
+pgactivate++;
+}
 keep_locked:
 unlock_page(page);
 keep:
@@ -1354,7 +1344,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 }
 
 ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
-TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true);
+TTU_IGNORE_ACCESS, NULL, true);
 list_splice(&clean_pages, page_list);
 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
 return ret;
@@ -1478,12 +1468,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 unsigned long nr_taken = 0;
 unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
 unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
-unsigned long skipped = 0, total_skipped = 0;
+unsigned long skipped = 0;
 unsigned long scan, nr_pages;
 LIST_HEAD(pages_skipped);
 
 for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
-!list_empty(src);) {
+!list_empty(src); scan++) {
 struct page *page;
 
 page = lru_to_page(src);
@@ -1497,12 +1487,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 continue;
 }
 
-/*
-* Account for scanned and skipped separetly to avoid the pgdat
-* being prematurely marked unreclaimable by pgdat_reclaimable.
-*/
-scan++;
-
 switch (__isolate_lru_page(page, mode)) {
 case 0:
 nr_pages = hpage_nr_pages(page);
@@ -1531,6 +1515,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 if (!list_empty(&pages_skipped)) {
 int zid;
 
+list_splice(&pages_skipped, src);
 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 if (!nr_skipped[zid])
 continue;
@@ -1538,17 +1523,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
 skipped += nr_skipped[zid];
 }
-
-/*
-* Account skipped pages as a partial scan as the pgdat may be
-* close to unreclaimable. If the LRU list is empty, account
-* skipped pages as a full scan.
-*/
-total_skipped = list_empty(src) ? skipped : skipped >> 2;
-
-list_splice(&pages_skipped, src);
 }
-*nr_scanned = scan + total_skipped;
+*nr_scanned = scan;
 trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
 scan, skipped, nr_taken, mode, lru);
 update_lru_sizes(lruvec, lru, nr_zone_taken);
@@ -1750,7 +1726,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 reclaim_stat->recent_scanned[file] += nr_taken;
 
 if (global_reclaim(sc)) {
-__mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
 if (current_is_kswapd())
 __count_vm_events(PGSCAN_KSWAPD, nr_scanned);
 else
@@ -1761,7 +1736,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 if (nr_taken == 0)
 return 0;
 
-nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP,
+nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
 &stat, false);
 
 spin_lock_irq(&pgdat->lru_lock);
@@ -1953,8 +1928,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
 reclaim_stat->recent_scanned[file] += nr_taken;
 
-if (global_reclaim(sc))
-__mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
 __count_vm_events(PGREFILL, nr_scanned);
 
 spin_unlock_irq(&pgdat->lru_lock);
@@ -2033,6 +2006,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
 * Both inactive lists should also be large enough that each inactive
 * page has a chance to be referenced again before it is reclaimed.
 *
+* If that fails and refaulting is observed, the inactive list grows.
+*
 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
 * on this LRU, maintained by the pageout code. A zone->inactive_ratio
 * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
@@ -2049,12 +2024,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
 * 10TB 320 32GB
 */
 static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
-struct scan_control *sc, bool trace)
+struct mem_cgroup *memcg,
+struct scan_control *sc, bool actual_reclaim)
 {
-unsigned long inactive_ratio;
-unsigned long inactive, active;
-enum lru_list inactive_lru = file * LRU_FILE;
 enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
+struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+enum lru_list inactive_lru = file * LRU_FILE;
+unsigned long inactive, active;
+unsigned long inactive_ratio;
+unsigned long refaults;
 unsigned long gb;
 
 /*
@@ -2067,27 +2045,42 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
 active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
 
-gb = (inactive + active) >> (30 - PAGE_SHIFT);
-if (gb)
-inactive_ratio = int_sqrt(10 * gb);
+if (memcg)
+refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
 else
-inactive_ratio = 1;
+refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
 
-if (trace)
-trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id,
-sc->reclaim_idx,
-lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
-lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
-inactive_ratio, file);
+/*
+* When refaults are being observed, it means a new workingset
+* is being established. Disable active list protection to get
+* rid of the stale workingset quickly.
+*/
+if (file && actual_reclaim && lruvec->refaults != refaults) {
+inactive_ratio = 0;
+} else {
+gb = (inactive + active) >> (30 - PAGE_SHIFT);
+if (gb)
+inactive_ratio = int_sqrt(10 * gb);
+else
+inactive_ratio = 1;
+}
+
+if (actual_reclaim)
+trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
+lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
+lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
+inactive_ratio, file);
 
 return inactive * inactive_ratio < active;
 }
 
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
-struct lruvec *lruvec, struct scan_control *sc)
+struct lruvec *lruvec, struct mem_cgroup *memcg,
+struct scan_control *sc)
 {
 if (is_active_lru(lru)) {
-if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
+if (inactive_list_is_low(lruvec, is_file_lru(lru),
+memcg, sc, true))
 shrink_active_list(nr_to_scan, lruvec, sc, lru);
 return 0;
 }
@@ -2123,30 +2116,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 unsigned long anon_prio, file_prio;
 enum scan_balance scan_balance;
 unsigned long anon, file;
-bool force_scan = false;
 unsigned long ap, fp;
 enum lru_list lru;
-bool some_scanned;
-int pass;
-
-/*
-* If the zone or memcg is small, nr[l] can be 0. This
-* results in no scanning on this priority and a potential
-* priority drop. Global direct reclaim can go to the next
-* zone and tends to have no problems. Global kswapd is for
-* zone balancing and it needs to scan a minimum amount. When
-* reclaiming for a memcg, a priority drop can cause high
-* latencies, so it's better to scan a minimum amount there as
-* well.
-*/
-if (current_is_kswapd()) {
-if (!pgdat_reclaimable(pgdat))
-force_scan = true;
-if (!mem_cgroup_online(memcg))
-force_scan = true;
-}
-if (!global_reclaim(sc))
-force_scan = true;
 
 /* If we have no swap space, do not bother scanning anon pages. */
 if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
@@ -2218,7 +2189,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 * lruvec even if it has plenty of old anonymous pages unless the
 * system is under heavy pressure.
 */
-if (!inactive_list_is_low(lruvec, true, sc, false) &&
+if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
 scan_balance = SCAN_FILE;
 goto out;
@@ -2277,55 +2248,48 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 fraction[1] = fp;
 denominator = ap + fp + 1;
 out:
-some_scanned = false;
-/* Only use force_scan on second pass. */
-for (pass = 0; !some_scanned && pass < 2; pass++) {
-*lru_pages = 0;
-for_each_evictable_lru(lru) {
-int file = is_file_lru(lru);
-unsigned long size;
-unsigned long scan;
+*lru_pages = 0;
+for_each_evictable_lru(lru) {
+int file = is_file_lru(lru);
+unsigned long size;
+unsigned long scan;
 
 size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
 scan = size >> sc->priority;
-
-if (!scan && pass && force_scan)
-scan = min(size, SWAP_CLUSTER_MAX);
-
-switch (scan_balance) {
-case SCAN_EQUAL:
-/* Scan lists relative to size */
-break;
-case SCAN_FRACT:
-/*
-* Scan types proportional to swappiness and
-* their relative recent reclaim efficiency.
-*/
-scan = div64_u64(scan * fraction[file],
-denominator);
-break;
-case SCAN_FILE:
-case SCAN_ANON:
-/* Scan one type exclusively */
-if ((scan_balance == SCAN_FILE) != file) {
-size = 0;
-scan = 0;
-}
-break;
-default:
-/* Look ma, no brain */
-BUG();
-}
+/*
+* If the cgroup's already been deleted, make sure to
+* scrape out the remaining cache.
+*/
+if (!scan && !mem_cgroup_online(memcg))
+scan = min(size, SWAP_CLUSTER_MAX);
 
-*lru_pages += size;
-nr[lru] = scan;
 
+switch (scan_balance) {
+case SCAN_EQUAL:
+/* Scan lists relative to size */
+break;
+case SCAN_FRACT:
 /*
-* Skip the second pass and don't force_scan,
-* if we found something to scan.
+* Scan types proportional to swappiness and
+* their relative recent reclaim efficiency.
 */
-some_scanned |= !!scan;
+scan = div64_u64(scan * fraction[file],
+denominator);
+break;
+case SCAN_FILE:
+case SCAN_ANON:
+/* Scan one type exclusively */
+if ((scan_balance == SCAN_FILE) != file) {
+size = 0;
+scan = 0;
+}
+break;
+default:
+/* Look ma, no brain */
+BUG();
 }
+
+*lru_pages += size;
+nr[lru] = scan;
 }
 }
 
@@ -2376,7 +2340,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
 nr[lru] -= nr_to_scan;
 
 nr_reclaimed += shrink_list(lru, nr_to_scan,
-lruvec, sc);
+lruvec, memcg, sc);
 }
 }
 
@@ -2443,7 +2407,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
 * Even if we did not try to evict anon pages at all, we want to
 * rebalance the anon lru active/inactive ratio.
 */
-if (inactive_list_is_low(lruvec, false, sc, true))
+if (inactive_list_is_low(lruvec, false, memcg, sc, true))
if (inactive_list_is_low(lruvec, false, memcg, sc, true))
|
||||||
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
|
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
|
||||||
sc, LRU_ACTIVE_ANON);
|
sc, LRU_ACTIVE_ANON);
|
||||||
}
|
}
|
||||||
|
@ -2557,9 +2521,11 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
|
||||||
unsigned long scanned;
|
unsigned long scanned;
|
||||||
|
|
||||||
if (mem_cgroup_low(root, memcg)) {
|
if (mem_cgroup_low(root, memcg)) {
|
||||||
if (!sc->may_thrash)
|
if (!sc->memcg_low_reclaim) {
|
||||||
|
sc->memcg_low_skipped = 1;
|
||||||
continue;
|
continue;
|
||||||
mem_cgroup_events(memcg, MEMCG_LOW, 1);
|
}
|
||||||
|
mem_cgroup_event(memcg, MEMCG_LOW);
|
||||||
}
|
}
|
||||||
|
|
||||||
reclaimed = sc->nr_reclaimed;
|
reclaimed = sc->nr_reclaimed;
|
||||||
|
@ -2620,6 +2586,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
|
||||||
} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
|
} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
|
||||||
sc->nr_scanned - nr_scanned, sc));
|
sc->nr_scanned - nr_scanned, sc));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Kswapd gives up on balancing particular nodes after too
|
||||||
|
* many failures to reclaim anything from them and goes to
|
||||||
|
* sleep. On reclaim progress, reset the failure counter. A
|
||||||
|
* successful direct reclaim run will revive a dormant kswapd.
|
||||||
|
*/
|
||||||
|
if (reclaimable)
|
||||||
|
pgdat->kswapd_failures = 0;
|
||||||
|
|
||||||
return reclaimable;
|
return reclaimable;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2694,10 +2669,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
|
||||||
GFP_KERNEL | __GFP_HARDWALL))
|
GFP_KERNEL | __GFP_HARDWALL))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if (sc->priority != DEF_PRIORITY &&
|
|
||||||
!pgdat_reclaimable(zone->zone_pgdat))
|
|
||||||
continue; /* Let kswapd poll it */
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If we already have plenty of memory free for
|
* If we already have plenty of memory free for
|
||||||
* compaction in this zone, don't free any more.
|
* compaction in this zone, don't free any more.
|
||||||
|
@ -2752,6 +2723,25 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
|
||||||
sc->gfp_mask = orig_mask;
|
sc->gfp_mask = orig_mask;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
|
||||||
|
{
|
||||||
|
struct mem_cgroup *memcg;
|
||||||
|
|
||||||
|
memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
|
||||||
|
do {
|
||||||
|
unsigned long refaults;
|
||||||
|
struct lruvec *lruvec;
|
||||||
|
|
||||||
|
if (memcg)
|
||||||
|
refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
|
||||||
|
else
|
||||||
|
refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
|
||||||
|
|
||||||
|
lruvec = mem_cgroup_lruvec(pgdat, memcg);
|
||||||
|
lruvec->refaults = refaults;
|
||||||
|
} while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This is the main entry point to direct page reclaim.
|
* This is the main entry point to direct page reclaim.
|
||||||
*
|
*
|
||||||
|
@ -2772,6 +2762,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
|
||||||
struct scan_control *sc)
|
struct scan_control *sc)
|
||||||
{
|
{
|
||||||
int initial_priority = sc->priority;
|
int initial_priority = sc->priority;
|
||||||
|
pg_data_t *last_pgdat;
|
||||||
|
struct zoneref *z;
|
||||||
|
struct zone *zone;
|
||||||
retry:
|
retry:
|
||||||
delayacct_freepages_start();
|
delayacct_freepages_start();
|
||||||
|
|
||||||
|
@ -2798,6 +2791,15 @@ retry:
|
||||||
sc->may_writepage = 1;
|
sc->may_writepage = 1;
|
||||||
} while (--sc->priority >= 0);
|
} while (--sc->priority >= 0);
|
||||||
|
|
||||||
|
last_pgdat = NULL;
|
||||||
|
for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
|
||||||
|
sc->nodemask) {
|
||||||
|
if (zone->zone_pgdat == last_pgdat)
|
||||||
|
continue;
|
||||||
|
last_pgdat = zone->zone_pgdat;
|
||||||
|
snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
|
||||||
|
}
|
||||||
|
|
||||||
delayacct_freepages_end();
|
delayacct_freepages_end();
|
||||||
|
|
||||||
if (sc->nr_reclaimed)
|
if (sc->nr_reclaimed)
|
||||||
|
@ -2808,16 +2810,17 @@ retry:
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
/* Untapped cgroup reserves? Don't OOM, retry. */
|
/* Untapped cgroup reserves? Don't OOM, retry. */
|
||||||
if (!sc->may_thrash) {
|
if (sc->memcg_low_skipped) {
|
||||||
sc->priority = initial_priority;
|
sc->priority = initial_priority;
|
||||||
sc->may_thrash = 1;
|
sc->memcg_low_reclaim = 1;
|
||||||
|
sc->memcg_low_skipped = 0;
|
||||||
goto retry;
|
goto retry;
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
|
static bool allow_direct_reclaim(pg_data_t *pgdat)
|
||||||
{
|
{
|
||||||
struct zone *zone;
|
struct zone *zone;
|
||||||
unsigned long pfmemalloc_reserve = 0;
|
unsigned long pfmemalloc_reserve = 0;
|
||||||
|
@ -2825,10 +2828,15 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
|
||||||
int i;
|
int i;
|
||||||
bool wmark_ok;
|
bool wmark_ok;
|
||||||
|
|
||||||
|
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
|
||||||
|
return true;
|
||||||
|
|
||||||
for (i = 0; i <= ZONE_NORMAL; i++) {
|
for (i = 0; i <= ZONE_NORMAL; i++) {
|
||||||
zone = &pgdat->node_zones[i];
|
zone = &pgdat->node_zones[i];
|
||||||
if (!managed_zone(zone) ||
|
if (!managed_zone(zone))
|
||||||
pgdat_reclaimable_pages(pgdat) == 0)
|
continue;
|
||||||
|
|
||||||
|
if (!zone_reclaimable_pages(zone))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
pfmemalloc_reserve += min_wmark_pages(zone);
|
pfmemalloc_reserve += min_wmark_pages(zone);
|
||||||
|
@ -2905,7 +2913,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
|
||||||
|
|
||||||
/* Throttle based on the first usable node */
|
/* Throttle based on the first usable node */
|
||||||
pgdat = zone->zone_pgdat;
|
pgdat = zone->zone_pgdat;
|
||||||
if (pfmemalloc_watermark_ok(pgdat))
|
if (allow_direct_reclaim(pgdat))
|
||||||
goto out;
|
goto out;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -2927,14 +2935,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
|
||||||
*/
|
*/
|
||||||
if (!(gfp_mask & __GFP_FS)) {
|
if (!(gfp_mask & __GFP_FS)) {
|
||||||
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
|
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
|
||||||
pfmemalloc_watermark_ok(pgdat), HZ);
|
allow_direct_reclaim(pgdat), HZ);
|
||||||
|
|
||||||
goto check_pending;
|
goto check_pending;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Throttle until kswapd wakes the process */
|
/* Throttle until kswapd wakes the process */
|
||||||
wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
|
wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
|
||||||
pfmemalloc_watermark_ok(pgdat));
|
allow_direct_reclaim(pgdat));
|
||||||
|
|
||||||
check_pending:
|
check_pending:
|
||||||
if (fatal_signal_pending(current))
|
if (fatal_signal_pending(current))
|
||||||
|
@ -2950,7 +2958,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
|
||||||
unsigned long nr_reclaimed;
|
unsigned long nr_reclaimed;
|
||||||
struct scan_control sc = {
|
struct scan_control sc = {
|
||||||
.nr_to_reclaim = SWAP_CLUSTER_MAX,
|
.nr_to_reclaim = SWAP_CLUSTER_MAX,
|
||||||
.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
|
.gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
|
||||||
.reclaim_idx = gfp_zone(gfp_mask),
|
.reclaim_idx = gfp_zone(gfp_mask),
|
||||||
.order = order,
|
.order = order,
|
||||||
.nodemask = nodemask,
|
.nodemask = nodemask,
|
||||||
|
@ -3030,7 +3038,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
|
||||||
int nid;
|
int nid;
|
||||||
struct scan_control sc = {
|
struct scan_control sc = {
|
||||||
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
|
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
|
||||||
.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
|
.gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
|
||||||
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
|
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
|
||||||
.reclaim_idx = MAX_NR_ZONES - 1,
|
.reclaim_idx = MAX_NR_ZONES - 1,
|
||||||
.target_mem_cgroup = memcg,
|
.target_mem_cgroup = memcg,
|
||||||
|
@ -3076,7 +3084,7 @@ static void age_active_anon(struct pglist_data *pgdat,
|
||||||
do {
|
do {
|
||||||
struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
|
struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
|
||||||
|
|
||||||
if (inactive_list_is_low(lruvec, false, sc, true))
|
if (inactive_list_is_low(lruvec, false, memcg, sc, true))
|
||||||
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
|
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
|
||||||
sc, LRU_ACTIVE_ANON);
|
sc, LRU_ACTIVE_ANON);
|
||||||
|
|
||||||
|
@ -3084,22 +3092,44 @@ static void age_active_anon(struct pglist_data *pgdat,
|
||||||
} while (memcg);
|
} while (memcg);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
|
/*
|
||||||
|
* Returns true if there is an eligible zone balanced for the request order
|
||||||
|
* and classzone_idx
|
||||||
|
*/
|
||||||
|
static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
|
||||||
{
|
{
|
||||||
unsigned long mark = high_wmark_pages(zone);
|
int i;
|
||||||
|
unsigned long mark = -1;
|
||||||
|
struct zone *zone;
|
||||||
|
|
||||||
if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx))
|
for (i = 0; i <= classzone_idx; i++) {
|
||||||
return false;
|
zone = pgdat->node_zones + i;
|
||||||
|
|
||||||
|
if (!managed_zone(zone))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
mark = high_wmark_pages(zone);
|
||||||
|
if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If any eligible zone is balanced then the node is not considered
|
* If a node has no populated zone within classzone_idx, it does not
|
||||||
* to be congested or dirty
|
* need balancing by definition. This can happen if a zone-restricted
|
||||||
|
* allocation tries to wake a remote kswapd.
|
||||||
*/
|
*/
|
||||||
clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
|
if (mark == -1)
|
||||||
clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
|
return true;
|
||||||
clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags);
|
|
||||||
|
|
||||||
return true;
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Clear pgdat state for congested, dirty or under writeback. */
|
||||||
|
static void clear_pgdat_congested(pg_data_t *pgdat)
|
||||||
|
{
|
||||||
|
clear_bit(PGDAT_CONGESTED, &pgdat->flags);
|
||||||
|
clear_bit(PGDAT_DIRTY, &pgdat->flags);
|
||||||
|
clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -3110,11 +3140,9 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
|
||||||
*/
|
*/
|
||||||
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
|
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
|
||||||
{
|
{
|
||||||
int i;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The throttled processes are normally woken up in balance_pgdat() as
|
* The throttled processes are normally woken up in balance_pgdat() as
|
||||||
* soon as pfmemalloc_watermark_ok() is true. But there is a potential
|
* soon as allow_direct_reclaim() is true. But there is a potential
|
||||||
* race between when kswapd checks the watermarks and a process gets
|
* race between when kswapd checks the watermarks and a process gets
|
||||||
* throttled. There is also a potential race if processes get
|
* throttled. There is also a potential race if processes get
|
||||||
* throttled, kswapd wakes, a large process exits thereby balancing the
|
* throttled, kswapd wakes, a large process exits thereby balancing the
|
||||||
|
@ -3128,17 +3156,16 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
|
||||||
if (waitqueue_active(&pgdat->pfmemalloc_wait))
|
if (waitqueue_active(&pgdat->pfmemalloc_wait))
|
||||||
wake_up_all(&pgdat->pfmemalloc_wait);
|
wake_up_all(&pgdat->pfmemalloc_wait);
|
||||||
|
|
||||||
for (i = 0; i <= classzone_idx; i++) {
|
/* Hopeless node, leave it to direct reclaim */
|
||||||
struct zone *zone = pgdat->node_zones + i;
|
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
|
||||||
|
return true;
|
||||||
|
|
||||||
if (!managed_zone(zone))
|
if (pgdat_balanced(pgdat, order, classzone_idx)) {
|
||||||
continue;
|
clear_pgdat_congested(pgdat);
|
||||||
|
return true;
|
||||||
if (!zone_balanced(zone, order, classzone_idx))
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -3214,9 +3241,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
|
||||||
count_vm_event(PAGEOUTRUN);
|
count_vm_event(PAGEOUTRUN);
|
||||||
|
|
||||||
do {
|
do {
|
||||||
|
unsigned long nr_reclaimed = sc.nr_reclaimed;
|
||||||
bool raise_priority = true;
|
bool raise_priority = true;
|
||||||
|
|
||||||
sc.nr_reclaimed = 0;
|
|
||||||
sc.reclaim_idx = classzone_idx;
|
sc.reclaim_idx = classzone_idx;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -3241,23 +3268,12 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Only reclaim if there are no eligible zones. Check from
|
* Only reclaim if there are no eligible zones. Note that
|
||||||
* high to low zone as allocations prefer higher zones.
|
* sc.reclaim_idx is not used as buffer_heads_over_limit may
|
||||||
* Scanning from low to high zone would allow congestion to be
|
* have adjusted it.
|
||||||
* cleared during a very small window when a small low
|
|
||||||
* zone was balanced even under extreme pressure when the
|
|
||||||
* overall node may be congested. Note that sc.reclaim_idx
|
|
||||||
* is not used as buffer_heads_over_limit may have adjusted
|
|
||||||
* it.
|
|
||||||
*/
|
*/
|
||||||
for (i = classzone_idx; i >= 0; i--) {
|
if (pgdat_balanced(pgdat, sc.order, classzone_idx))
|
||||||
zone = pgdat->node_zones + i;
|
goto out;
|
||||||
if (!managed_zone(zone))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (zone_balanced(zone, sc.order, classzone_idx))
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Do some background aging of the anon list, to give
|
* Do some background aging of the anon list, to give
|
||||||
|
@ -3271,7 +3287,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
|
||||||
* If we're getting trouble reclaiming, start doing writepage
|
* If we're getting trouble reclaiming, start doing writepage
|
||||||
* even in laptop mode.
|
* even in laptop mode.
|
||||||
*/
|
*/
|
||||||
if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat))
|
if (sc.priority < DEF_PRIORITY - 2)
|
||||||
sc.may_writepage = 1;
|
sc.may_writepage = 1;
|
||||||
|
|
||||||
/* Call soft limit reclaim before calling shrink_node. */
|
/* Call soft limit reclaim before calling shrink_node. */
|
||||||
|
@ -3295,7 +3311,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
|
||||||
* able to safely make forward progress. Wake them
|
* able to safely make forward progress. Wake them
|
||||||
*/
|
*/
|
||||||
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
|
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
|
||||||
pfmemalloc_watermark_ok(pgdat))
|
allow_direct_reclaim(pgdat))
|
||||||
wake_up_all(&pgdat->pfmemalloc_wait);
|
wake_up_all(&pgdat->pfmemalloc_wait);
|
||||||
|
|
||||||
/* Check if kswapd should be suspending */
|
/* Check if kswapd should be suspending */
|
||||||
|
@ -3306,11 +3322,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
|
||||||
* Raise priority if scanning rate is too low or there was no
|
* Raise priority if scanning rate is too low or there was no
|
||||||
* progress in reclaiming pages
|
* progress in reclaiming pages
|
||||||
*/
|
*/
|
||||||
if (raise_priority || !sc.nr_reclaimed)
|
nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
|
||||||
|
if (raise_priority || !nr_reclaimed)
|
||||||
sc.priority--;
|
sc.priority--;
|
||||||
} while (sc.priority >= 1);
|
} while (sc.priority >= 1);
|
||||||
|
|
||||||
|
if (!sc.nr_reclaimed)
|
||||||
|
pgdat->kswapd_failures++;
|
||||||
|
|
||||||
out:
|
out:
|
||||||
|
snapshot_refaults(NULL, pgdat);
|
||||||
/*
|
/*
|
||||||
* Return the order kswapd stopped reclaiming at as
|
* Return the order kswapd stopped reclaiming at as
|
||||||
* prepare_kswapd_sleep() takes it into account. If another caller
|
* prepare_kswapd_sleep() takes it into account. If another caller
|
||||||
|
@ -3320,6 +3341,22 @@ out:
|
||||||
return sc.order;
|
return sc.order;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* pgdat->kswapd_classzone_idx is the highest zone index that a recent
|
||||||
|
* allocation request woke kswapd for. When kswapd has not woken recently,
|
||||||
|
* the value is MAX_NR_ZONES which is not a valid index. This compares a
|
||||||
|
* given classzone and returns it or the highest classzone index kswapd
|
||||||
|
* was recently woke for.
|
||||||
|
*/
|
||||||
|
static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
|
||||||
|
enum zone_type classzone_idx)
|
||||||
|
{
|
||||||
|
if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
|
||||||
|
return classzone_idx;
|
||||||
|
|
||||||
|
return max(pgdat->kswapd_classzone_idx, classzone_idx);
|
||||||
|
}
|
||||||
|
|
||||||
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
|
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
|
||||||
unsigned int classzone_idx)
|
unsigned int classzone_idx)
|
||||||
{
|
{
|
||||||
|
@ -3331,7 +3368,13 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
|
||||||
|
|
||||||
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
|
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
|
||||||
|
|
||||||
/* Try to sleep for a short interval */
|
/*
|
||||||
|
* Try to sleep for a short interval. Note that kcompactd will only be
|
||||||
|
* woken if it is possible to sleep for a short interval. This is
|
||||||
|
* deliberate on the assumption that if reclaim cannot keep an
|
||||||
|
* eligible zone balanced that it's also unlikely that compaction will
|
||||||
|
* succeed.
|
||||||
|
*/
|
||||||
if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
|
if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
|
||||||
/*
|
/*
|
||||||
* Compaction records what page blocks it recently failed to
|
* Compaction records what page blocks it recently failed to
|
||||||
|
@ -3355,7 +3398,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
|
||||||
* the previous request that slept prematurely.
|
* the previous request that slept prematurely.
|
||||||
*/
|
*/
|
||||||
if (remaining) {
|
if (remaining) {
|
||||||
pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
|
pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
|
||||||
pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
|
pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3409,7 +3452,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
|
||||||
*/
|
*/
|
||||||
static int kswapd(void *p)
|
static int kswapd(void *p)
|
||||||
{
|
{
|
||||||
unsigned int alloc_order, reclaim_order, classzone_idx;
|
unsigned int alloc_order, reclaim_order;
|
||||||
|
unsigned int classzone_idx = MAX_NR_ZONES - 1;
|
||||||
pg_data_t *pgdat = (pg_data_t*)p;
|
pg_data_t *pgdat = (pg_data_t*)p;
|
||||||
struct task_struct *tsk = current;
|
struct task_struct *tsk = current;
|
||||||
|
|
||||||
|
@ -3439,20 +3483,23 @@ static int kswapd(void *p)
|
||||||
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
|
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
|
||||||
set_freezable();
|
set_freezable();
|
||||||
|
|
||||||
pgdat->kswapd_order = alloc_order = reclaim_order = 0;
|
pgdat->kswapd_order = 0;
|
||||||
pgdat->kswapd_classzone_idx = classzone_idx = 0;
|
pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
|
||||||
for ( ; ; ) {
|
for ( ; ; ) {
|
||||||
bool ret;
|
bool ret;
|
||||||
|
|
||||||
|
alloc_order = reclaim_order = pgdat->kswapd_order;
|
||||||
|
classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
|
||||||
|
|
||||||
kswapd_try_sleep:
|
kswapd_try_sleep:
|
||||||
kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
|
kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
|
||||||
classzone_idx);
|
classzone_idx);
|
||||||
|
|
||||||
/* Read the new order and classzone_idx */
|
/* Read the new order and classzone_idx */
|
||||||
alloc_order = reclaim_order = pgdat->kswapd_order;
|
alloc_order = reclaim_order = pgdat->kswapd_order;
|
||||||
classzone_idx = pgdat->kswapd_classzone_idx;
|
classzone_idx = kswapd_classzone_idx(pgdat, 0);
|
||||||
pgdat->kswapd_order = 0;
|
pgdat->kswapd_order = 0;
|
||||||
pgdat->kswapd_classzone_idx = 0;
|
pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
|
||||||
|
|
||||||
ret = try_to_freeze();
|
ret = try_to_freeze();
|
||||||
if (kthread_should_stop())
|
if (kthread_should_stop())
|
||||||
|
@ -3478,9 +3525,6 @@ kswapd_try_sleep:
|
||||||
reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
|
reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
|
||||||
if (reclaim_order < alloc_order)
|
if (reclaim_order < alloc_order)
|
||||||
goto kswapd_try_sleep;
|
goto kswapd_try_sleep;
|
||||||
|
|
||||||
alloc_order = reclaim_order = pgdat->kswapd_order;
|
|
||||||
classzone_idx = pgdat->kswapd_classzone_idx;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
|
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
|
||||||
|
@ -3496,7 +3540,6 @@ kswapd_try_sleep:
|
||||||
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
|
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
|
||||||
{
|
{
|
||||||
pg_data_t *pgdat;
|
pg_data_t *pgdat;
|
||||||
int z;
|
|
||||||
|
|
||||||
if (!managed_zone(zone))
|
if (!managed_zone(zone))
|
||||||
return;
|
return;
|
||||||
|
@ -3504,22 +3547,20 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
|
||||||
if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
|
if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
|
||||||
return;
|
return;
|
||||||
pgdat = zone->zone_pgdat;
|
pgdat = zone->zone_pgdat;
|
||||||
pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
|
pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
|
||||||
|
classzone_idx);
|
||||||
pgdat->kswapd_order = max(pgdat->kswapd_order, order);
|
pgdat->kswapd_order = max(pgdat->kswapd_order, order);
|
||||||
if (!waitqueue_active(&pgdat->kswapd_wait))
|
if (!waitqueue_active(&pgdat->kswapd_wait))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
/* Only wake kswapd if all zones are unbalanced */
|
/* Hopeless node, leave it to direct reclaim */
|
||||||
for (z = 0; z <= classzone_idx; z++) {
|
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
|
||||||
zone = pgdat->node_zones + z;
|
return;
|
||||||
if (!managed_zone(zone))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (zone_balanced(zone, order, classzone_idx))
|
if (pgdat_balanced(pgdat, order, classzone_idx))
|
||||||
return;
|
return;
|
||||||
}
|
|
||||||
|
|
||||||
trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
|
trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
|
||||||
wake_up_interruptible(&pgdat->kswapd_wait);
|
wake_up_interruptible(&pgdat->kswapd_wait);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3725,7 +3766,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
|
||||||
int classzone_idx = gfp_zone(gfp_mask);
|
int classzone_idx = gfp_zone(gfp_mask);
|
||||||
struct scan_control sc = {
|
struct scan_control sc = {
|
||||||
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
|
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
|
||||||
.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
|
.gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
|
||||||
.order = order,
|
.order = order,
|
||||||
.priority = NODE_RECLAIM_PRIORITY,
|
.priority = NODE_RECLAIM_PRIORITY,
|
||||||
.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
|
.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
|
||||||
|
@ -3779,9 +3820,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
|
||||||
sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
|
sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
|
||||||
return NODE_RECLAIM_FULL;
|
return NODE_RECLAIM_FULL;
|
||||||
|
|
||||||
if (!pgdat_reclaimable(pgdat))
|
|
||||||
return NODE_RECLAIM_FULL;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Do not scan if the allocation should not be delayed.
|
* Do not scan if the allocation should not be delayed.
|
||||||
*/
|
*/
|
||||||
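The mm/vmscan.c hunks above revolve around the new pgdat->kswapd_failures counter: balance_pgdat() bumps it when a full reclaim cycle makes no progress, wakeup_kswapd() and prepare_kswapd_sleep() treat a node as hopeless once the counter reaches MAX_RECLAIM_RETRIES, and any run that does reclaim something (including direct reclaim through shrink_node()) resets it. The stand-alone sketch below mirrors only that back-off/reset pattern; the type and function names are invented for illustration and none of it is the kernel implementation.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative constant and type, not the kernel's. */
#define MAX_RECLAIM_RETRIES 16

struct node_state {
    unsigned int kswapd_failures;   /* consecutive runs that reclaimed nothing */
};

/* Mirrors the "hopeless node" checks: once the limit is hit,
 * background reclaim is skipped and direct reclaim is not throttled. */
static bool kswapd_worth_waking(const struct node_state *node)
{
    return node->kswapd_failures < MAX_RECLAIM_RETRIES;
}

/* Mirrors the counter updates: a failed run increments, progress resets. */
static void account_reclaim_run(struct node_state *node, unsigned long nr_reclaimed)
{
    if (nr_reclaimed)
        node->kswapd_failures = 0;      /* a successful run revives kswapd */
    else
        node->kswapd_failures++;        /* another fruitless pass */
}

int main(void)
{
    struct node_state node = { 0 };

    for (int run = 0; run < 20; run++)
        account_reclaim_run(&node, 0);  /* simulate repeated failed runs */

    printf("wake kswapd? %s\n", kswapd_worth_waking(&node) ? "yes" : "no");

    account_reclaim_run(&node, 128);    /* direct reclaim makes progress */
    printf("wake kswapd? %s\n", kswapd_worth_waking(&node) ? "yes" : "no");
    return 0;
}

The point of the design, as the added comment in shrink_node() states, is that a dormant kswapd costs nothing until someone demonstrates that reclaim on that node can succeed again.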
mm/vmstat.c (72 changed lines)

@@ -954,7 +954,6 @@ const char * const vmstat_text[] = {
 	"nr_unevictable",
 	"nr_isolated_anon",
 	"nr_isolated_file",
-	"nr_pages_scanned",
 	"workingset_refault",
 	"workingset_activate",
 	"workingset_nodereclaim",

@@ -992,6 +991,7 @@ const char * const vmstat_text[] = {
 	"pgfree",
 	"pgactivate",
 	"pgdeactivate",
+	"pglazyfree",

 	"pgfault",
 	"pgmajfault",

@@ -1124,8 +1124,12 @@ static void frag_stop(struct seq_file *m, void *arg)
 {
 }

-/* Walk all the zones in a node and print using a callback */
+/*
+ * Walk zones in a node and print using a callback.
+ * If @assert_populated is true, only use callback for zones that are populated.
+ */
 static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
+		bool assert_populated,
 		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
 {
 	struct zone *zone;

@@ -1133,7 +1137,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
 	unsigned long flags;

 	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
-		if (!populated_zone(zone))
+		if (assert_populated && !populated_zone(zone))
 			continue;

 		spin_lock_irqsave(&zone->lock, flags);

@@ -1161,7 +1165,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
 static int frag_show(struct seq_file *m, void *arg)
 {
 	pg_data_t *pgdat = (pg_data_t *)arg;
-	walk_zones_in_node(m, pgdat, frag_show_print);
+	walk_zones_in_node(m, pgdat, true, frag_show_print);
 	return 0;
 }

@@ -1202,7 +1206,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
 		seq_printf(m, "%6d ", order);
 	seq_putc(m, '\n');

-	walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);
+	walk_zones_in_node(m, pgdat, true, pagetypeinfo_showfree_print);

 	return 0;
 }

@@ -1254,7 +1258,7 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
 	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
 		seq_printf(m, "%12s ", migratetype_names[mtype]);
 	seq_putc(m, '\n');
-	walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);
+	walk_zones_in_node(m, pgdat, true, pagetypeinfo_showblockcount_print);

 	return 0;
 }

@@ -1280,7 +1284,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
 		seq_printf(m, "%12s ", migratetype_names[mtype]);
 	seq_putc(m, '\n');

 	walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print);
+	walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print);
 #endif /* CONFIG_PAGE_OWNER */
 }

@@ -1378,7 +1382,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n min %lu"
 		   "\n low %lu"
 		   "\n high %lu"
-		   "\n node_scanned %lu"
 		   "\n spanned %lu"
 		   "\n present %lu"
 		   "\n managed %lu",

@@ -1386,23 +1389,28 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   min_wmark_pages(zone),
 		   low_wmark_pages(zone),
 		   high_wmark_pages(zone),
-		   node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED),
 		   zone->spanned_pages,
 		   zone->present_pages,
 		   zone->managed_pages);

-	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-		seq_printf(m, "\n %-12s %lu", vmstat_text[i],
-				zone_page_state(zone, i));
-
 	seq_printf(m,
 		   "\n protection: (%ld",
 		   zone->lowmem_reserve[0]);
 	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
 		seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
-	seq_printf(m,
-		   ")"
-		   "\n pagesets");
+	seq_putc(m, ')');
+
+	/* If unpopulated, no other information is useful */
+	if (!populated_zone(zone)) {
+		seq_putc(m, '\n');
+		return;
+	}
+
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+		seq_printf(m, "\n %-12s %lu", vmstat_text[i],
+			   zone_page_state(zone, i));
+
+	seq_printf(m, "\n pagesets");
 	for_each_online_cpu(i) {
 		struct per_cpu_pageset *pageset;

@@ -1425,19 +1433,22 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n node_unreclaimable: %u"
 		   "\n start_pfn: %lu"
 		   "\n node_inactive_ratio: %u",
-		   !pgdat_reclaimable(zone->zone_pgdat),
+		   pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
 		   zone->zone_start_pfn,
 		   zone->zone_pgdat->inactive_ratio);
 	seq_putc(m, '\n');
 }

 /*
- * Output information about zones in @pgdat.
+ * Output information about zones in @pgdat. All zones are printed regardless
+ * of whether they are populated or not: lowmem_reserve_ratio operates on the
+ * set of all zones and userspace would not be aware of such zones if they are
+ * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
  */
 static int zoneinfo_show(struct seq_file *m, void *arg)
 {
 	pg_data_t *pgdat = (pg_data_t *)arg;
-	walk_zones_in_node(m, pgdat, zoneinfo_show_print);
+	walk_zones_in_node(m, pgdat, false, zoneinfo_show_print);
 	return 0;
 }

@@ -1586,22 +1597,9 @@ int vmstat_refresh(struct ctl_table *table, int write,
 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
 		val = atomic_long_read(&vm_zone_stat[i]);
 		if (val < 0) {
-			switch (i) {
-			case NR_PAGES_SCANNED:
-				/*
-				 * This is often seen to go negative in
-				 * recent kernels, but not to go permanently
-				 * negative. Whilst it would be nicer not to
-				 * have exceptions, rooting them out would be
-				 * another task, of rather low priority.
-				 */
-				break;
-			default:
-				pr_warn("%s: %s %ld\n",
-					__func__, vmstat_text[i], val);
-				err = -EINVAL;
-				break;
-			}
+			pr_warn("%s: %s %ld\n",
+				__func__, vmstat_text[i], val);
+			err = -EINVAL;
 		}
 	}
 	if (err)

@@ -1856,7 +1854,7 @@ static int unusable_show(struct seq_file *m, void *arg)
 	if (!node_state(pgdat->node_id, N_MEMORY))
 		return 0;

-	walk_zones_in_node(m, pgdat, unusable_show_print);
+	walk_zones_in_node(m, pgdat, true, unusable_show_print);

 	return 0;
 }

@@ -1908,7 +1906,7 @@ static int extfrag_show(struct seq_file *m, void *arg)
 {
 	pg_data_t *pgdat = (pg_data_t *)arg;

-	walk_zones_in_node(m, pgdat, extfrag_show_print);
+	walk_zones_in_node(m, pgdat, true, extfrag_show_print);

 	return 0;
 }
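The mm/vmstat.c change threads a single assert_populated flag through walk_zones_in_node() so that /proc/zoneinfo can print every zone, populated or not, while the fragmentation and pagetype reports keep skipping empty ones. Below is a minimal user-space sketch of the same walker-plus-flag shape; the types and names are invented for illustration and are not the kernel API.

#include <stdio.h>
#include <stdbool.h>

/* Toy stand-in for a zone; only the "populated" property matters here. */
struct zone { const char *name; unsigned long present_pages; };

static bool populated(const struct zone *z) { return z->present_pages != 0; }

/* Same shape as walk_zones_in_node() after the change: the caller decides
 * whether unpopulated entries are skipped or still handed to the callback. */
static void walk_zones(struct zone *zones, int nr, bool assert_populated,
                       void (*print)(const struct zone *z))
{
    for (int i = 0; i < nr; i++) {
        if (assert_populated && !populated(&zones[i]))
            continue;
        print(&zones[i]);
    }
}

static void show(const struct zone *z)
{
    printf("%-8s present=%lu\n", z->name, z->present_pages);
}

int main(void)
{
    struct zone zones[] = { {"DMA", 0}, {"Normal", 262144}, {"Movable", 0} };

    walk_zones(zones, 3, true, show);   /* fragmentation-style reports */
    walk_zones(zones, 3, false, show);  /* zoneinfo-style report: all zones */
    return 0;
}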
mm/workingset.c

@@ -269,7 +269,6 @@ bool workingset_refault(void *shadow)
 	lruvec = mem_cgroup_lruvec(pgdat, memcg);
 	refault = atomic_long_read(&lruvec->inactive_age);
 	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
-	rcu_read_unlock();

 	/*
 	 * The unsigned subtraction here gives an accurate distance

@@ -290,11 +289,15 @@ bool workingset_refault(void *shadow)
 	refault_distance = (refault - eviction) & EVICTION_MASK;

 	inc_node_state(pgdat, WORKINGSET_REFAULT);
+	inc_memcg_state(memcg, WORKINGSET_REFAULT);

 	if (refault_distance <= active_file) {
 		inc_node_state(pgdat, WORKINGSET_ACTIVATE);
+		inc_memcg_state(memcg, WORKINGSET_ACTIVATE);
+		rcu_read_unlock();
 		return true;
 	}
+	rcu_read_unlock();
 	return false;
 }

@@ -472,6 +475,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 	if (WARN_ON_ONCE(node->exceptional))
 		goto out_invalid;
 	inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
+	inc_memcg_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
 	__radix_tree_delete_node(&mapping->page_tree, node,
 				 workingset_update_node, mapping);
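In mm/workingset.c the refault, activate and nodereclaim events are now charged to the owning memory cgroup as well as to the node, with the RCU read section extended so the memcg is still valid when it is charged. A rough user-space sketch of that dual-accounting shape follows; the names are invented and C11 atomics stand in for the kernel's per-node and per-memcg counters.

#include <stdatomic.h>
#include <stdio.h>

/* Invented stand-ins: one global counter plus one per-group counter. */
struct group { atomic_long refault; };
static atomic_long node_refault;

/* Count the event against the node and, when known, the owning group,
 * mirroring the paired inc_node_state()/inc_memcg_state() calls above. */
static void count_refault(struct group *grp)
{
    atomic_fetch_add(&node_refault, 1);
    if (grp)
        atomic_fetch_add(&grp->refault, 1);
}

int main(void)
{
    struct group g = { 0 };

    count_refault(&g);
    count_refault(NULL);    /* event not attributable to a group */
    printf("node=%ld group=%ld\n",
           atomic_load(&node_refault), atomic_load(&g.refault));
    return 0;
}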
scripts/spelling.txt

@@ -46,6 +46,8 @@ ackowledge||acknowledge
 ackowledged||acknowledged
 acording||according
 activete||activate
+actived||activated
+actualy||actually
 acumulating||accumulating
 acumulator||accumulator
 adapater||adapter

@@ -76,6 +78,8 @@ algorritm||algorithm
 aligment||alignment
 alignement||alignment
 allign||align
+alligned||aligned
+allocatote||allocate
 allocatrd||allocated
 allocte||allocate
 allpication||application

@@ -141,6 +145,7 @@ asycronous||asynchronous
 asynchnous||asynchronous
 atomatically||automatically
 atomicly||atomically
+atempt||attempt
 attachement||attachment
 attched||attached
 attemps||attempts

@@ -270,6 +275,7 @@ comunication||communication
 conbination||combination
 conditionaly||conditionally
 conected||connected
+connecetd||connected
 configuartion||configuration
 configuratoin||configuration
 configuraton||configuration

@@ -291,11 +297,14 @@ continous||continuous
 continously||continuously
 continueing||continuing
 contraints||constraints
+contol||control
+contoller||controller
 controled||controlled
 controler||controller
 controll||control
 contruction||construction
 contry||country
+conuntry||country
 convertion||conversion
 convertor||converter
 convienient||convenient

@@ -310,6 +319,7 @@ coutner||counter
 cryptocraphic||cryptographic
 cunter||counter
 curently||currently
+cylic||cyclic
 dafault||default
 deafult||default
 deamon||daemon

@@ -398,6 +408,7 @@ efective||effective
 efficently||efficiently
 ehther||ether
 eigth||eight
+elementry||elementary
 eletronic||electronic
 embeded||embedded
 enabledi||enabled

@@ -443,6 +454,7 @@ extened||extended
 extensability||extensibility
 extention||extension
 extracter||extractor
+falied||failed
 faild||failed
 faill||fail
 failied||failed

@@ -492,6 +504,7 @@ futhermore||furthermore
 futrue||future
 gaurenteed||guaranteed
 generiously||generously
+genereate||generate
 genric||generic
 globel||global
 grabing||grabbing

@@ -513,8 +526,10 @@ hierachy||hierarchy
 hierarchie||hierarchy
 howver||however
 hsould||should
+hypervior||hypervisor
 hypter||hyper
 identidier||identifier
+iligal||illegal
 illigal||illegal
 imblance||imbalance
 immeadiately||immediately

@@ -600,6 +615,7 @@ intuative||intuitive
 invaid||invalid
 invalde||invalid
 invalide||invalid
+invalud||invalid
 invididual||individual
 invokation||invocation
 invokations||invocations

@@ -663,11 +679,14 @@ messsages||messages
 microprocesspr||microprocessor
 milliseonds||milliseconds
 minium||minimum
+minimam||minimum
 minumum||minimum
+misalinged||misaligned
 miscelleneous||miscellaneous
 misformed||malformed
 mispelled||misspelled
 mispelt||misspelt
+mising||missing
 miximum||maximum
 mmnemonic||mnemonic
 mnay||many

@@ -888,6 +907,7 @@ replys||replies
 reponse||response
 representaion||representation
 reqeust||request
+requestied||requested
 requiere||require
 requirment||requirement
 requred||required

@@ -981,6 +1001,7 @@ spinlcok||spinlock
 spinock||spinlock
 splitted||split
 spreaded||spread
+spurrious||spurious
 sructure||structure
 stablilization||stabilization
 staically||statically

@@ -1013,6 +1034,7 @@ superseeded||superseded
 suplied||supplied
 suported||supported
 suport||support
+supportet||supported
 suppored||supported
 supportin||supporting
 suppoted||supported

@@ -1056,6 +1078,7 @@ throught||through
 thses||these
 tiggered||triggered
 tipically||typically
+timout||timeout
 tmis||this
 torerable||tolerable
 tramsmitted||transmitted

@@ -1081,6 +1104,7 @@ unconditionaly||unconditionally
 underun||underrun
 unecessary||unnecessary
 unexecpted||unexpected
+unexepected||unexpected
 unexpcted||unexpected
 unexpectd||unexpected
 unexpeted||unexpected

@@ -1096,6 +1120,7 @@ unneded||unneeded
 unneedingly||unnecessarily
 unnsupported||unsupported
 unmached||unmatched
+unregester||unregister
 unresgister||unregister
 unrgesiter||unregister
 unsinged||unsigned
tools/testing/selftests/vm/Makefile

@@ -15,21 +15,14 @@ TEST_GEN_FILES += on-fault-limit
 TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += userfaultfd
-TEST_GEN_FILES += userfaultfd_hugetlb
-TEST_GEN_FILES += userfaultfd_shmem
 TEST_GEN_FILES += mlock-random-test

 TEST_PROGS := run_vmtests

 include ../lib.mk

-$(OUTPUT)/userfaultfd: LDLIBS += -lpthread ../../../../usr/include/linux/kernel.h
+$(OUTPUT)/userfaultfd: ../../../../usr/include/linux/kernel.h
+$(OUTPUT)/userfaultfd: LDLIBS += -lpthread

-$(OUTPUT)/userfaultfd_hugetlb: userfaultfd.c ../../../../usr/include/linux/kernel.h
-	$(CC) $(CFLAGS) -DHUGETLB_TEST -O2 -o $@ $< -lpthread
-
-$(OUTPUT)/userfaultfd_shmem: userfaultfd.c ../../../../usr/include/linux/kernel.h
-	$(CC) $(CFLAGS) -DSHMEM_TEST -O2 -o $@ $< -lpthread
-
 $(OUTPUT)/mlock-random-test: LDLIBS += -lcap
tools/testing/selftests/vm/run_vmtests

@@ -95,7 +95,7 @@ echo " hugetlb regression testing."
 echo "--------------------"
 echo "running userfaultfd"
 echo "--------------------"
-./userfaultfd 128 32
+./userfaultfd anon 128 32
 if [ $? -ne 0 ]; then
 	echo "[FAIL]"
 	exitcode=1

@@ -107,7 +107,7 @@ echo "----------------------------"
 echo "running userfaultfd_hugetlb"
 echo "----------------------------"
 # 258MB total huge pages == 128MB src and 128MB dst
-./userfaultfd_hugetlb 128 32 $mnt/ufd_test_file
+./userfaultfd hugetlb 128 32 $mnt/ufd_test_file
 if [ $? -ne 0 ]; then
 	echo "[FAIL]"
 	exitcode=1

@@ -119,7 +119,7 @@ rm -f $mnt/ufd_test_file
 echo "----------------------------"
 echo "running userfaultfd_shmem"
 echo "----------------------------"
-./userfaultfd_shmem 128 32
+./userfaultfd shmem 128 32
 if [ $? -ne 0 ]; then
 	echo "[FAIL]"
 	exitcode=1
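run_vmtests now drives a single userfaultfd binary and selects the backing type with the first argument (anon, hugetlb or shmem), matching the test_type/uffd_test_ops dispatch introduced in the userfaultfd.c hunks that follow. The sketch below shows one plausible way such an argv-based selection could look; the pick_ops() helper and the trimmed-down uffd_test_ops here are invented for illustration and are not the selftest's actual code.

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-ins for the selftest's ops tables; the real
 * selection logic is not part of the hunks shown on this page. */
struct uffd_test_ops { const char *label; };
static struct uffd_test_ops anon_ops = { "anon" },
                            shmem_ops = { "shmem" },
                            hugetlb_ops = { "hugetlb" };

static struct uffd_test_ops *pick_ops(const char *type)
{
    if (!strcmp(type, "anon"))
        return &anon_ops;
    if (!strcmp(type, "shmem"))
        return &shmem_ops;
    if (!strcmp(type, "hugetlb"))
        return &hugetlb_ops;
    return NULL;
}

int main(int argc, char **argv)
{
    struct uffd_test_ops *ops;

    if (argc < 2 || !(ops = pick_ops(argv[1]))) {
        fprintf(stderr, "usage: %s anon|shmem|hugetlb ...\n", argv[0]);
        return 1;
    }
    printf("running %s backend\n", ops->label);
    return 0;
}

The design choice is the usual one for collapsing compile-time #ifdef variants into one binary: a small ops table chosen at startup, so every call site goes through uffd_test_ops-> instead of a preprocessor branch.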
tools/testing/selftests/vm/userfaultfd.c

@@ -77,10 +77,13 @@ static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
 #define BOUNCE_POLL		(1<<3)
 static int bounces;

-#ifdef HUGETLB_TEST
+#define TEST_ANON	1
+#define TEST_HUGETLB	2
+#define TEST_SHMEM	3
+static int test_type;
+
 static int huge_fd;
 static char *huge_fd_off0;
-#endif
 static unsigned long long *count_verify;
 static int uffd, uffd_flags, finished, *pipefd;
 static char *area_src, *area_dst;

@@ -102,14 +105,7 @@ pthread_attr_t attr;
 				 ~(unsigned long)(sizeof(unsigned long long) \
 				 - 1)))

-#if !defined(HUGETLB_TEST) && !defined(SHMEM_TEST)
-
-/* Anonymous memory */
-#define EXPECTED_IOCTLS		((1 << _UFFDIO_WAKE) | \
-				 (1 << _UFFDIO_COPY) | \
-				 (1 << _UFFDIO_ZEROPAGE))
-
-static int release_pages(char *rel_area)
+static int anon_release_pages(char *rel_area)
 {
 	int ret = 0;

@@ -121,7 +117,7 @@ static int release_pages(char *rel_area)
 	return ret;
 }

-static void allocate_area(void **alloc_area)
+static void anon_allocate_area(void **alloc_area)
 {
 	if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) {
 		fprintf(stderr, "out of memory\n");

@@ -129,14 +125,9 @@ static void allocate_area(void **alloc_area)
 	}
 }

-#else /* HUGETLB_TEST or SHMEM_TEST */
-
-#define EXPECTED_IOCTLS		UFFD_API_RANGE_IOCTLS_BASIC
-
-#ifdef HUGETLB_TEST
-
 /* HugeTLB memory */
-static int release_pages(char *rel_area)
+static int hugetlb_release_pages(char *rel_area)
 {
 	int ret = 0;

@@ -152,7 +143,7 @@ static int release_pages(char *rel_area)
 }


-static void allocate_area(void **alloc_area)
+static void hugetlb_allocate_area(void **alloc_area)
 {
 	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
 			   MAP_PRIVATE | MAP_HUGETLB, huge_fd,

@@ -167,10 +158,8 @@ static void allocate_area(void **alloc_area)
 	huge_fd_off0 = *alloc_area;
 }

-#elif defined(SHMEM_TEST)
-
 /* Shared memory */
-static int release_pages(char *rel_area)
+static int shmem_release_pages(char *rel_area)
 {
 	int ret = 0;

@@ -182,7 +171,7 @@ static int release_pages(char *rel_area)
 	return ret;
 }

-static void allocate_area(void **alloc_area)
+static void shmem_allocate_area(void **alloc_area)
 {
 	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
 			   MAP_ANONYMOUS | MAP_SHARED, -1, 0);

@@ -192,11 +181,35 @@ static void allocate_area(void **alloc_area)
 	}
 }

-#else /* SHMEM_TEST */
-#error "Undefined test type"
-#endif /* HUGETLB_TEST */
+struct uffd_test_ops {
+	unsigned long expected_ioctls;
+	void (*allocate_area)(void **alloc_area);
+	int (*release_pages)(char *rel_area);
+};
+
+#define ANON_EXPECTED_IOCTLS		((1 << _UFFDIO_WAKE) | \
+					 (1 << _UFFDIO_COPY) | \
+					 (1 << _UFFDIO_ZEROPAGE))
+
+static struct uffd_test_ops anon_uffd_test_ops = {
+	.expected_ioctls = ANON_EXPECTED_IOCTLS,
+	.allocate_area = anon_allocate_area,
+	.release_pages = anon_release_pages,
+};
+
+static struct uffd_test_ops shmem_uffd_test_ops = {
+	.expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC,
+	.allocate_area = shmem_allocate_area,
+	.release_pages = shmem_release_pages,
+};
+
+static struct uffd_test_ops hugetlb_uffd_test_ops = {
+	.expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC,
+	.allocate_area = hugetlb_allocate_area,
+	.release_pages = hugetlb_release_pages,
+};
+
+static struct uffd_test_ops *uffd_test_ops;

-#endif /* !defined(HUGETLB_TEST) && !defined(SHMEM_TEST) */
 static int my_bcmp(char *str1, char *str2, size_t n)
 {

@@ -505,7 +518,7 @@ static int stress(unsigned long *userfaults)
 	 * UFFDIO_COPY without writing zero pages into area_dst
 	 * because the background threads already completed).
 	 */
-	if (release_pages(area_src))
+	if (uffd_test_ops->release_pages(area_src))
 		return 1;

 	for (cpu = 0; cpu < nr_cpus; cpu++) {

@@ -577,12 +590,12 @@ static int faulting_process(void)
 {
 	unsigned long nr;
 	unsigned long long count;
+	unsigned long split_nr_pages;

-#ifndef HUGETLB_TEST
-	unsigned long split_nr_pages = (nr_pages + 1) / 2;
-#else
-	unsigned long split_nr_pages = nr_pages;
-#endif
+	if (test_type != TEST_HUGETLB)
+		split_nr_pages = (nr_pages + 1) / 2;
+	else
+		split_nr_pages = nr_pages;

 	for (nr = 0; nr < split_nr_pages; nr++) {
 		count = *area_count(area_dst, nr);

@@ -594,7 +607,9 @@ static int faulting_process(void)
 		}
 	}

-#ifndef HUGETLB_TEST
+	if (test_type == TEST_HUGETLB)
+		return 0;
+
 	area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size,
 			  MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
 	if (area_dst == MAP_FAILED)

@@ -610,7 +625,7 @@ static int faulting_process(void)
 		}
 	}

-	if (release_pages(area_dst))
+	if (uffd_test_ops->release_pages(area_dst))
 		return 1;

 	for (nr = 0; nr < nr_pages; nr++) {

@@ -618,8 +633,6 @@ static int faulting_process(void)
 			fprintf(stderr, "nr %lu is not zero\n", nr), exit(1);
 	}

-#endif /* HUGETLB_TEST */
-
 	return 0;
 }

@@ -627,7 +640,9 @@ static int uffdio_zeropage(int ufd, unsigned long offset)
 {
 	struct uffdio_zeropage uffdio_zeropage;
 	int ret;
-	unsigned long has_zeropage = EXPECTED_IOCTLS & (1 << _UFFDIO_ZEROPAGE);
+	unsigned long has_zeropage;
+
+	has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE);

 	if (offset >= nr_pages * page_size)
 		fprintf(stderr, "unexpected offset %lu\n",

@@ -675,7 +690,7 @@ static int userfaultfd_zeropage_test(void)
 	printf("testing UFFDIO_ZEROPAGE: ");
 	fflush(stdout);

-	if (release_pages(area_dst))
+	if (uffd_test_ops->release_pages(area_dst))
 		return 1;

 	if (userfaultfd_open(0) < 0)

@@ -686,7 +701,7 @@ static int userfaultfd_zeropage_test(void)
 	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
 		fprintf(stderr, "register failure\n"), exit(1);

expected_ioctls = EXPECTED_IOCTLS;
|
expected_ioctls = uffd_test_ops->expected_ioctls;
|
||||||
if ((uffdio_register.ioctls & expected_ioctls) !=
|
if ((uffdio_register.ioctls & expected_ioctls) !=
|
||||||
expected_ioctls)
|
expected_ioctls)
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
|
@ -716,7 +731,7 @@ static int userfaultfd_events_test(void)
|
||||||
printf("testing events (fork, remap, remove): ");
|
printf("testing events (fork, remap, remove): ");
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
|
|
||||||
if (release_pages(area_dst))
|
if (uffd_test_ops->release_pages(area_dst))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
|
features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
|
||||||
|
@ -731,7 +746,7 @@ static int userfaultfd_events_test(void)
|
||||||
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
|
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
|
||||||
fprintf(stderr, "register failure\n"), exit(1);
|
fprintf(stderr, "register failure\n"), exit(1);
|
||||||
|
|
||||||
expected_ioctls = EXPECTED_IOCTLS;
|
expected_ioctls = uffd_test_ops->expected_ioctls;
|
||||||
if ((uffdio_register.ioctls & expected_ioctls) !=
|
if ((uffdio_register.ioctls & expected_ioctls) !=
|
||||||
expected_ioctls)
|
expected_ioctls)
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
|
@ -773,10 +788,10 @@ static int userfaultfd_stress(void)
|
||||||
int err;
|
int err;
|
||||||
unsigned long userfaults[nr_cpus];
|
unsigned long userfaults[nr_cpus];
|
||||||
|
|
||||||
allocate_area((void **)&area_src);
|
uffd_test_ops->allocate_area((void **)&area_src);
|
||||||
if (!area_src)
|
if (!area_src)
|
||||||
return 1;
|
return 1;
|
||||||
allocate_area((void **)&area_dst);
|
uffd_test_ops->allocate_area((void **)&area_dst);
|
||||||
if (!area_dst)
|
if (!area_dst)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
|
@ -856,7 +871,7 @@ static int userfaultfd_stress(void)
|
||||||
fprintf(stderr, "register failure\n");
|
fprintf(stderr, "register failure\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
expected_ioctls = EXPECTED_IOCTLS;
|
expected_ioctls = uffd_test_ops->expected_ioctls;
|
||||||
if ((uffdio_register.ioctls & expected_ioctls) !=
|
if ((uffdio_register.ioctls & expected_ioctls) !=
|
||||||
expected_ioctls) {
|
expected_ioctls) {
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
|
@ -888,7 +903,7 @@ static int userfaultfd_stress(void)
|
||||||
* MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
|
* MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
|
||||||
* required to MADV_DONTNEED here.
|
* required to MADV_DONTNEED here.
|
||||||
*/
|
*/
|
||||||
if (release_pages(area_dst))
|
if (uffd_test_ops->release_pages(area_dst))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
/* bounce pass */
|
/* bounce pass */
|
||||||
|
@ -934,36 +949,6 @@ static int userfaultfd_stress(void)
|
||||||
return userfaultfd_zeropage_test() || userfaultfd_events_test();
|
return userfaultfd_zeropage_test() || userfaultfd_events_test();
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef HUGETLB_TEST
|
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
|
||||||
{
|
|
||||||
if (argc < 3)
|
|
||||||
fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
|
|
||||||
nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
|
|
||||||
page_size = sysconf(_SC_PAGE_SIZE);
|
|
||||||
if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
|
|
||||||
> page_size)
|
|
||||||
fprintf(stderr, "Impossible to run this test\n"), exit(2);
|
|
||||||
nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
|
|
||||||
nr_cpus;
|
|
||||||
if (!nr_pages_per_cpu) {
|
|
||||||
fprintf(stderr, "invalid MiB\n");
|
|
||||||
fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
|
|
||||||
}
|
|
||||||
bounces = atoi(argv[2]);
|
|
||||||
if (bounces <= 0) {
|
|
||||||
fprintf(stderr, "invalid bounces\n");
|
|
||||||
fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
|
|
||||||
}
|
|
||||||
nr_pages = nr_pages_per_cpu * nr_cpus;
|
|
||||||
printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
|
|
||||||
nr_pages, nr_pages_per_cpu);
|
|
||||||
return userfaultfd_stress();
|
|
||||||
}
|
|
||||||
|
|
||||||
#else /* HUGETLB_TEST */
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copied from mlock2-tests.c
|
* Copied from mlock2-tests.c
|
||||||
*/
|
*/
|
||||||
|
@ -988,48 +973,78 @@ unsigned long default_huge_page_size(void)
|
||||||
return hps;
|
return hps;
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
static void set_test_type(const char *type)
|
||||||
{
|
{
|
||||||
if (argc < 4)
|
if (!strcmp(type, "anon")) {
|
||||||
fprintf(stderr, "Usage: <MiB> <bounces> <hugetlbfs_file>\n"),
|
test_type = TEST_ANON;
|
||||||
exit(1);
|
uffd_test_ops = &anon_uffd_test_ops;
|
||||||
nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
|
} else if (!strcmp(type, "hugetlb")) {
|
||||||
page_size = default_huge_page_size();
|
test_type = TEST_HUGETLB;
|
||||||
|
uffd_test_ops = &hugetlb_uffd_test_ops;
|
||||||
|
} else if (!strcmp(type, "shmem")) {
|
||||||
|
test_type = TEST_SHMEM;
|
||||||
|
uffd_test_ops = &shmem_uffd_test_ops;
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "Unknown test type: %s\n", type), exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (test_type == TEST_HUGETLB)
|
||||||
|
page_size = default_huge_page_size();
|
||||||
|
else
|
||||||
|
page_size = sysconf(_SC_PAGE_SIZE);
|
||||||
|
|
||||||
if (!page_size)
|
if (!page_size)
|
||||||
fprintf(stderr, "Unable to determine huge page size\n"),
|
fprintf(stderr, "Unable to determine page size\n"),
|
||||||
exit(2);
|
exit(2);
|
||||||
if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
|
if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
|
||||||
> page_size)
|
> page_size)
|
||||||
fprintf(stderr, "Impossible to run this test\n"), exit(2);
|
fprintf(stderr, "Impossible to run this test\n"), exit(2);
|
||||||
nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
|
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
if (argc < 4)
|
||||||
|
fprintf(stderr, "Usage: <test type> <MiB> <bounces> [hugetlbfs_file]\n"),
|
||||||
|
exit(1);
|
||||||
|
|
||||||
|
set_test_type(argv[1]);
|
||||||
|
|
||||||
|
nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
|
||||||
|
nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
|
||||||
nr_cpus;
|
nr_cpus;
|
||||||
if (!nr_pages_per_cpu) {
|
if (!nr_pages_per_cpu) {
|
||||||
fprintf(stderr, "invalid MiB\n");
|
fprintf(stderr, "invalid MiB\n");
|
||||||
fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
|
fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
|
||||||
}
|
}
|
||||||
bounces = atoi(argv[2]);
|
|
||||||
|
bounces = atoi(argv[3]);
|
||||||
if (bounces <= 0) {
|
if (bounces <= 0) {
|
||||||
fprintf(stderr, "invalid bounces\n");
|
fprintf(stderr, "invalid bounces\n");
|
||||||
fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
|
fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
|
||||||
}
|
}
|
||||||
nr_pages = nr_pages_per_cpu * nr_cpus;
|
nr_pages = nr_pages_per_cpu * nr_cpus;
|
||||||
huge_fd = open(argv[3], O_CREAT | O_RDWR, 0755);
|
|
||||||
if (huge_fd < 0) {
|
if (test_type == TEST_HUGETLB) {
|
||||||
fprintf(stderr, "Open of %s failed", argv[3]);
|
if (argc < 5)
|
||||||
perror("open");
|
fprintf(stderr, "Usage: hugetlb <MiB> <bounces> <hugetlbfs_file>\n"),
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
|
||||||
if (ftruncate(huge_fd, 0)) {
|
if (huge_fd < 0) {
|
||||||
fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]);
|
fprintf(stderr, "Open of %s failed", argv[3]);
|
||||||
perror("ftruncate");
|
perror("open");
|
||||||
exit(1);
|
exit(1);
|
||||||
|
}
|
||||||
|
if (ftruncate(huge_fd, 0)) {
|
||||||
|
fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]);
|
||||||
|
perror("ftruncate");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
|
printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
|
||||||
nr_pages, nr_pages_per_cpu);
|
nr_pages, nr_pages_per_cpu);
|
||||||
return userfaultfd_stress();
|
return userfaultfd_stress();
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
|
||||||
#else /* __NR_userfaultfd */
|
#else /* __NR_userfaultfd */
|
||||||
|
|
||||||
#warning "missing __NR_userfaultfd definition"
|
#warning "missing __NR_userfaultfd definition"
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue