Merge tag 'x86_mm_for_v6.0_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Borislav Petkov:

 - Rename a PKRU macro to make more sense when reading the code

 - Update pkeys documentation

 - Avoid reading a contended mm's TLB generation variable unless
   absolutely necessary, and fix a case where arch_tlbbatch_flush()
   doesn't adhere to the generation scheme and thus violates the
   conditions for the above avoidance.

* tag 'x86_mm_for_v6.0_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mm/tlb: Ignore f->new_tlb_gen when zero
  x86/pkeys: Clarify PKRU_AD_KEY macro
  Documentation/protection-keys: Clean up documentation for User Space pkeys
  x86/mm/tlb: Avoid reading mm_tlb_gen when possible
commit 92598ae22f
Linus Torvalds, 2022-08-01 09:34:39 -07:00

4 changed files with 56 additions and 29 deletions
Documentation/core-api/protection-keys.rst

@@ -4,31 +4,29 @@
 Memory Protection Keys
 ======================

-Memory Protection Keys for Userspace (PKU aka PKEYs) is a feature
-which is found on Intel's Skylake (and later) "Scalable Processor"
-Server CPUs. It will be available in future non-server Intel parts
-and future AMD processors.
+Memory Protection Keys provide a mechanism for enforcing page-based
+protections, but without requiring modification of the page tables when an
+application changes protection domains.

-For anyone wishing to test or use this feature, it is available in
-Amazon's EC2 C5 instances and is known to work there using an Ubuntu
-17.04 image.
+Pkeys Userspace (PKU) is a feature which can be found on:
+        * Intel server CPUs, Skylake and later
+        * Intel client CPUs, Tiger Lake (11th Gen Core) and later
+        * Future AMD CPUs

-Memory Protection Keys provides a mechanism for enforcing page-based
-protections, but without requiring modification of the page tables
-when an application changes protection domains. It works by
-dedicating 4 previously ignored bits in each page table entry to a
-"protection key", giving 16 possible keys.
+Pkeys work by dedicating 4 previously Reserved bits in each page table entry to
+a "protection key", giving 16 possible keys.

-There is also a new user-accessible register (PKRU) with two separate
-bits (Access Disable and Write Disable) for each key. Being a CPU
-register, PKRU is inherently thread-local, potentially giving each
+Protections for each key are defined with a per-CPU user-accessible register
+(PKRU). Each of these is a 32-bit register storing two bits (Access Disable
+and Write Disable) for each of 16 keys.
+
+Being a CPU register, PKRU is inherently thread-local, potentially giving each
 thread a different set of protections from every other thread.

-There are two new instructions (RDPKRU/WRPKRU) for reading and writing
-to the new register. The feature is only available in 64-bit mode,
-even though there is theoretically space in the PAE PTEs. These
-permissions are enforced on data access only and have no effect on
-instruction fetches.
+There are two instructions (RDPKRU/WRPKRU) for reading and writing to the
+register. The feature is only available in 64-bit mode, even though there is
+theoretically space in the PAE PTEs. These permissions are enforced on data
+access only and have no effect on instruction fetches.

 Syscalls
 ========
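For reference, the user-space side of the interface documented above can be
exercised through the glibc pkey_* wrappers. A minimal sketch, assuming a
pkeys-capable CPU and a glibc that provides pkey_alloc()/pkey_mprotect()/
pkey_set(); error handling is mostly elided:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/* Ask the kernel for one of the 16 keys (key 0 is the default). */
	int pkey = pkey_alloc(0, 0);
	if (pkey < 0) {
		perror("pkey_alloc");		/* unsupported CPU or kernel */
		return 1;
	}

	/* Tag a page with the key; the usual PROT_* bits still apply. */
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	pkey_mprotect(p, 4096, PROT_READ | PROT_WRITE, pkey);

	p[0] = 1;				/* write is allowed */

	/* Set Write Disable in this thread's PKRU (a WRPKRU under the hood). */
	pkey_set(pkey, PKEY_DISABLE_WRITE);

	printf("%d\n", p[0]);			/* reads still work */
	/* p[0] = 2; would now fault with si_code SEGV_PKUERR */

	pkey_free(pkey);
	return 0;
}

Because PKRU is a per-thread register, the pkey_set() above only affects the
calling thread; other threads keep their own view of the same key.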

arch/x86/include/asm/tlbflush.h

@@ -16,6 +16,7 @@
 void __flush_tlb_all(void);

 #define TLB_FLUSH_ALL			-1UL
+#define TLB_GENERATION_INVALID		0

 void cr4_update_irqsoff(unsigned long set, unsigned long clear);
 unsigned long cr4_read_shadow(void);

arch/x86/mm/pkeys.c

@@ -110,7 +110,7 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
 	return vma_pkey(vma);
 }

-#define PKRU_AD_KEY(pkey)	(PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))
+#define PKRU_AD_MASK(pkey)	(PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))

 /*
  * Make the default PKRU value (at execve() time) as restrictive
@@ -118,11 +118,14 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
  * in the process's lifetime will not accidentally get access
  * to data which is pkey-protected later on.
  */
-u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
-		      PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) |
-		      PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) |
-		      PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) |
-		      PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15);
+u32 init_pkru_value = PKRU_AD_MASK( 1) | PKRU_AD_MASK( 2) |
+		      PKRU_AD_MASK( 3) | PKRU_AD_MASK( 4) |
+		      PKRU_AD_MASK( 5) | PKRU_AD_MASK( 6) |
+		      PKRU_AD_MASK( 7) | PKRU_AD_MASK( 8) |
+		      PKRU_AD_MASK( 9) | PKRU_AD_MASK(10) |
+		      PKRU_AD_MASK(11) | PKRU_AD_MASK(12) |
+		      PKRU_AD_MASK(13) | PKRU_AD_MASK(14) |
+		      PKRU_AD_MASK(15);

 static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
 				   size_t count, loff_t *ppos)
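The rename is purely mechanical, so the default value does not change.
Assuming the existing definitions PKRU_AD_BIT == 0x1 and
PKRU_BITS_PER_PKEY == 2 from the x86 headers, the expression still
access-disables keys 1-15 and leaves key 0 usable. A stand-alone sanity
check (ordinary user-space C, not kernel code):

#include <stdio.h>

/* Local copies for illustration; the kernel's values are assumed here. */
#define PKRU_AD_BIT		0x1u
#define PKRU_BITS_PER_PKEY	2
#define PKRU_AD_MASK(pkey)	(PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))

int main(void)
{
	unsigned int init_pkru = 0;

	/* Access-disable every key except key 0, as init_pkru_value does. */
	for (int pkey = 1; pkey <= 15; pkey++)
		init_pkru |= PKRU_AD_MASK(pkey);

	printf("0x%08x\n", init_pkru);	/* prints 0x55555554 */
	return 0;
}

Each key owns two adjacent PKRU bits (Access Disable, then Write Disable),
which is why the access-disable bits land on every even bit position from 2
upward.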

arch/x86/mm/tlb.c

@@ -734,10 +734,10 @@ static void flush_tlb_func(void *info)
 	const struct flush_tlb_info *f = info;
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
-	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
 	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
 	bool local = smp_processor_id() == f->initiating_cpu;
 	unsigned long nr_invalidate = 0;
+	u64 mm_tlb_gen;

 	/* This code cannot presently handle being reentered. */
 	VM_WARN_ON(!irqs_disabled());
@@ -771,6 +771,23 @@ static void flush_tlb_func(void *info)
 		return;
 	}

+	if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
+		     f->new_tlb_gen <= local_tlb_gen)) {
+		/*
+		 * The TLB is already up to date in respect to f->new_tlb_gen.
+		 * While the core might be still behind mm_tlb_gen, checking
+		 * mm_tlb_gen unnecessarily would have negative caching effects
+		 * so avoid it.
+		 */
+		return;
+	}
+
+	/*
+	 * Defer mm_tlb_gen reading as long as possible to avoid cache
+	 * contention.
+	 */
+	mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
+
 	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
 		/*
 		 * There's nothing to do: we're already up to date. This can
@@ -827,6 +844,12 @@ static void flush_tlb_func(void *info)
 		/* Partial flush */
 		unsigned long addr = f->start;

+		/* Partial flush cannot have invalid generations */
+		VM_WARN_ON(f->new_tlb_gen == TLB_GENERATION_INVALID);
+
+		/* Partial flush must have valid mm */
+		VM_WARN_ON(f->mm == NULL);
+
 		nr_invalidate = (f->end - f->start) >> f->stride_shift;

 		while (addr < f->end) {
@@ -1029,7 +1052,8 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 		struct flush_tlb_info *info;

 		preempt_disable();
-		info = get_flush_tlb_info(NULL, start, end, 0, false, 0);
+		info = get_flush_tlb_info(NULL, start, end, 0, false,
+					  TLB_GENERATION_INVALID);

 		on_each_cpu(do_kernel_range_flush, info, 1);
@@ -1198,7 +1222,8 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 	int cpu = get_cpu();

-	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, 0);
+	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
+				  TLB_GENERATION_INVALID);
 	/*
 	 * flush_tlb_multi() is not optimized for the common case in which only
 	 * a local TLB flush is needed. Optimize this use-case by calling
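Taken together, the tlb.c hunks reorder flush_tlb_func() so that the shared,
potentially cache-contended mm->context.tlb_gen is read only after the cheap
per-CPU checks have failed, and they give the two full-flush callers an
explicitly invalid generation so the new early-out cannot misfire. A condensed
user-space model of that ordering (an illustration with made-up names, not the
kernel code):

#include <stdint.h>
#include <stdio.h>

#define TLB_GENERATION_INVALID	0

/* Per-request data: the generation the initiator asked for, or
 * TLB_GENERATION_INVALID when no meaningful generation exists
 * (e.g. kernel-range or batched full flushes). */
struct flush_info {
	uint64_t new_tlb_gen;
};

/* Returns 1 if a flush is needed; touches the shared (contended)
 * counter only when the cheaper local checks cannot decide. */
static int needs_flush(const struct flush_info *f,
		       uint64_t local_tlb_gen,
		       const uint64_t *mm_tlb_gen /* shared counter */)
{
	/* Early out: this CPU already caught up to the requested
	 * generation, so reading *mm_tlb_gen would only bounce its
	 * cache line for nothing. */
	if (f->new_tlb_gen != TLB_GENERATION_INVALID &&
	    f->new_tlb_gen <= local_tlb_gen)
		return 0;

	/* Defer the contended read until it is really required. */
	if (*mm_tlb_gen == local_tlb_gen)
		return 0;	/* nothing new to flush after all */

	return 1;
}

int main(void)
{
	struct flush_info f = { .new_tlb_gen = 3 };
	uint64_t shared_gen = 7;

	printf("%d\n", needs_flush(&f, 5, &shared_gen));	/* 0: early out  */
	f.new_tlb_gen = TLB_GENERATION_INVALID;
	printf("%d\n", needs_flush(&f, 7, &shared_gen));	/* 0: up to date */
	printf("%d\n", needs_flush(&f, 5, &shared_gen));	/* 1: must flush */
	return 0;
}

In the kernel, the equivalent early return happens before
atomic64_read(&loaded_mm->context.tlb_gen), which is why that read is moved
out of the declaration block in the first hunk above.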