[RFC PATCH v2 01/20] mm: Provide pagesize to pmd_populate()
From: Christophe Leroy
Date: Fri May 17 2024 - 15:00:34 EST
Unlike many architectures, powerpc 8xx hardware tablewalk requires
a two level process for all page sizes, allthough second level only
has one entry when pagesize is 8M.
To fit with Linux page table topology and without requiring special
page directory layout like hugepd, the page entry will be replicated
1024 times in the standard page table. However for large pages it is
necessary to set bits in the level-1 (PMD) entry. At the time being,
for 512k pages the flag is kept in the PTE and inserted in the PMD
entry at TLB miss exception, that is necessary because we can have
pages of different sizes in a page table. However the 12 PTE bits are
fully used and there is no room for an additional bit for page size.
For 8M pages, there will be only one page per PMD entry, it is
therefore possible to flag the pagesize in the PMD entry, with the
advantage that the information will already be at the right place for
the hardware.
To do so, add a new helper called pmd_populate_size() which takes the
page size as an additional argument, and modify __pte_alloc() to also
take that argument. pte_alloc() is left unmodified in order to
reduce churn on callers, and a pte_alloc_size() is added for use by
pte_alloc_huge().
When an architecture doesn't provide pmd_populate_size(),
pmd_populate() is used as a fallback.
Signed-off-by: Christophe Leroy <christophe.leroy@xxxxxxxxxx>
---
include/linux/mm.h | 12 +++++++-----
mm/filemap.c | 2 +-
mm/internal.h | 2 +-
mm/memory.c | 19 ++++++++++++-------
mm/pgalloc-track.h | 2 +-
mm/userfaultfd.c | 4 ++--
6 files changed, 24 insertions(+), 17 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b6bdaa18b9e9..158cb87bc604 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2803,8 +2803,8 @@ static inline void mm_inc_nr_ptes(struct mm_struct *mm) {}
static inline void mm_dec_nr_ptes(struct mm_struct *mm) {}
#endif
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd);
-int __pte_alloc_kernel(pmd_t *pmd);
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long sz);
+int __pte_alloc_kernel(pmd_t *pmd, unsigned long sz);
#if defined(CONFIG_MMU)
@@ -2989,7 +2989,8 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
pte_unmap(pte); \
} while (0)
-#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))
+#define pte_alloc_size(mm, pmd, sz) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd, sz))
+#define pte_alloc(mm, pmd) pte_alloc_size(mm, pmd, PAGE_SIZE)
#define pte_alloc_map(mm, pmd, address) \
(pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address))
@@ -2998,9 +2999,10 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
(pte_alloc(mm, pmd) ? \
NULL : pte_offset_map_lock(mm, pmd, address, ptlp))
-#define pte_alloc_kernel(pmd, address) \
- ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
+#define pte_alloc_kernel_size(pmd, address, sz) \
+ ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, sz))? \
NULL: pte_offset_kernel(pmd, address))
+#define pte_alloc_kernel(pmd, address) pte_alloc_kernel_size(pmd, address, PAGE_SIZE)
#if USE_SPLIT_PMD_PTLOCKS
diff --git a/mm/filemap.c b/mm/filemap.c
index 30de18c4fd28..5a783063d1f6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3428,7 +3428,7 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
}
if (pmd_none(*vmf->pmd) && vmf->prealloc_pte)
- pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
+ pmd_install(mm, vmf->pmd, &vmf->prealloc_pte, PAGE_SIZE);
return false;
}
diff --git a/mm/internal.h b/mm/internal.h
index 07ad2675a88b..4a01bbf55264 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -206,7 +206,7 @@ void folio_activate(struct folio *folio);
void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *start_vma, unsigned long floor,
unsigned long ceiling, bool mm_wr_locked);
-void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
+void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte, unsigned long sz);
struct zap_details;
void unmap_page_range(struct mmu_gather *tlb,
diff --git a/mm/memory.c b/mm/memory.c
index d2155ced45f8..2a9eba13a95f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -409,7 +409,12 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
} while (vma);
}
-void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
+#ifndef pmd_populate_size
+#define pmd_populate_size(mm, pmdp, pte, sz) pmd_populate(mm, pmdp, pte)
+#define pmd_populate_kernel_size(mm, pmdp, pte, sz) pmd_populate_kernel(mm, pmdp, pte)
+#endif
+
+void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte, unsigned long sz)
{
spinlock_t *ptl = pmd_lock(mm, pmd);
@@ -429,25 +434,25 @@ void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
* smp_rmb() barriers in page table walking code.
*/
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
- pmd_populate(mm, pmd, *pte);
+ pmd_populate_size(mm, pmd, *pte, sz);
*pte = NULL;
}
spin_unlock(ptl);
}
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long sz)
{
pgtable_t new = pte_alloc_one(mm);
if (!new)
return -ENOMEM;
- pmd_install(mm, pmd, &new);
+ pmd_install(mm, pmd, &new, sz);
if (new)
pte_free(mm, new);
return 0;
}
-int __pte_alloc_kernel(pmd_t *pmd)
+int __pte_alloc_kernel(pmd_t *pmd, unsigned long sz)
{
pte_t *new = pte_alloc_one_kernel(&init_mm);
if (!new)
@@ -456,7 +461,7 @@ int __pte_alloc_kernel(pmd_t *pmd)
spin_lock(&init_mm.page_table_lock);
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
smp_wmb(); /* See comment in pmd_install() */
- pmd_populate_kernel(&init_mm, pmd, new);
+ pmd_populate_kernel_size(&init_mm, pmd, new, sz);
new = NULL;
}
spin_unlock(&init_mm.page_table_lock);
@@ -4740,7 +4745,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
}
if (vmf->prealloc_pte)
- pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
+ pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte, PAGE_SIZE);
else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
return VM_FAULT_OOM;
}
diff --git a/mm/pgalloc-track.h b/mm/pgalloc-track.h
index e9e879de8649..90e37de7ab77 100644
--- a/mm/pgalloc-track.h
+++ b/mm/pgalloc-track.h
@@ -45,7 +45,7 @@ static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
#define pte_alloc_kernel_track(pmd, address, mask) \
((unlikely(pmd_none(*(pmd))) && \
- (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
+ (__pte_alloc_kernel(pmd, PAGE_SIZE) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
NULL: pte_offset_kernel(pmd, address))
#endif /* _LINUX_PGALLOC_TRACK_H */
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 3c3539c573e7..0f129d5c5aa2 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -764,7 +764,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
break;
}
if (unlikely(pmd_none(dst_pmdval)) &&
- unlikely(__pte_alloc(dst_mm, dst_pmd))) {
+ unlikely(__pte_alloc(dst_mm, dst_pmd, PAGE_SIZE))) {
err = -ENOMEM;
break;
}
@@ -1687,7 +1687,7 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
err = -ENOENT;
break;
}
- if (unlikely(__pte_alloc(mm, src_pmd))) {
+ if (unlikely(__pte_alloc(mm, src_pmd, PAGE_SIZE))) {
err = -ENOMEM;
break;
}
--
2.44.0