On 23.07.2019 0:32, Joel Fernandes (Google) wrote:
The page_idle tracking feature currently requires looking up the pagemap
for a process followed by interacting with /sys/kernel/mm/page_idle.
This is quite cumbersome and can be error-prone too. If something changes
with the page between accessing the per-PID pagemap and the global
page_idle bitmap, then the information is not accurate.
Moreover, looking up the PFN from pagemap on Android devices is not
supported for unprivileged processes: it requires SYS_ADMIN, and otherwise
gives 0 for the PFN.
This patch adds support to directly interact with page_idle tracking at
the PID level by introducing a /proc/<pid>/page_idle file. This
eliminates the need for userspace to calculate the mapping of the page.
It follows the exact same semantics as the global
/sys/kernel/mm/page_idle, however it is easier to use for some usecases
where looking up PFN is not needed and also does not require SYS_ADMIN.
It ended up simplifying userspace code, solving the security issue
mentioned and works quite well. SELinux does not need to be turned off
since no pagemap look up is needed.
In Android, we are using this for the heap profiler (heapprofd), which
profiles and pinpoints code paths which allocate memory and leave it
idle for long periods of time.
Documentation material:
The idle page tracking API for virtual address indexing using virtual page
frame numbers (VFN) is located at /proc/<pid>/page_idle. It is a bitmap
that follows the same semantics as /sys/kernel/mm/page_idle/bitmap
except that it uses virtual instead of physical frame numbers.
This idle page tracking API can be simpler to use than physical address
indexing, since the pagemap for a process does not need to be looked up
to mark or read a page's idle bit. It is also more accurate than
physical address indexing since in physical address indexing, address
space changes can occur between reading the pagemap and reading the
bitmap. In virtual address indexing, the process's mmap_sem is held for
the duration of the access.
Maybe integrate this into the existing interfaces, /proc/pid/clear_refs and
/proc/pid/pagemap?
I.e. echo X > /proc/pid/clear_refs clears the reference bits in the ptes and
marks pages idle, only for pages mapped in this process.
An idle bit in /proc/pid/pagemap then tells that the page is still idle in this process.
This is faster - we don't need to walk the whole rmap for that.
Cc: vdavydov.dev@xxxxxxxxx
Cc: Brendan Gregg <bgregg@xxxxxxxxxxx>
Cc: kernel-team@xxxxxxxxxxx
Signed-off-by: Joel Fernandes (Google) <joel@xxxxxxxxxxxxxxxxx>
---
Internal review -> v1:
Fixes from Suren.
Corrections to change log, docs (Florian, Sandeep)
 fs/proc/base.c | 3 +
 fs/proc/internal.h | 1 +
 fs/proc/task_mmu.c | 57 +++++++
 include/linux/page_idle.h | 4 +
 mm/page_idle.c | 305 +++++++++++++++++++++++++++++++++-----
 5 files changed, 330 insertions(+), 40 deletions(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 77eb628ecc7f..a58dd74606e9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3021,6 +3021,9 @@ static const struct pid_entry tgid_base_stuff[] = {
ÂÂÂÂÂ REG("smaps",ÂÂÂÂÂ S_IRUGO, proc_pid_smaps_operations),
ÂÂÂÂÂ REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
ÂÂÂÂÂ REG("pagemap",ÂÂÂ S_IRUSR, proc_pagemap_operations),
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+ÂÂÂ REG("page_idle", S_IRUSR|S_IWUSR, proc_page_idle_operations),
+#endif
 #endif
 #ifdef CONFIG_SECURITY
ÂÂÂÂÂ DIR("attr",ÂÂÂÂÂÂ S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index cd0c8d5ce9a1..bc9371880c63 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -293,6 +293,7 @@ extern const struct file_operations proc_pid_smaps_operations;
 extern const struct file_operations proc_pid_smaps_rollup_operations;
 extern const struct file_operations proc_clear_refs_operations;
 extern const struct file_operations proc_pagemap_operations;
+extern const struct file_operations proc_page_idle_operations;
 extern unsigned long task_vsize(struct mm_struct *);
 extern unsigned long task_statm(struct mm_struct *,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4d2b860dbc3f..11ccc53da38e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1642,6 +1642,63 @@ const struct file_operations proc_pagemap_operations = {
ÂÂÂÂÂ .openÂÂÂÂÂÂÂ = pagemap_open,
ÂÂÂÂÂ .releaseÂÂÂ = pagemap_release,
 };
+
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+/*
+ * read() handler for /proc/<pid>/page_idle.
+ *
+ * Looks up the task behind the proc inode and forwards the request to
+ * page_idle_proc_read(); the task reference is dropped before returning.
+ *
+ * Returns the value of page_idle_proc_read(), or -ESRCH if the task can
+ * no longer be found.
+ */
+static ssize_t proc_page_idle_read(struct file *file, char __user *buf,
+				   size_t count, loff_t *ppos)
+{
+	/* ssize_t rather than int: matches the helper and this handler. */
+	ssize_t ret;
+	struct task_struct *tsk = get_proc_task(file_inode(file));
+
+	if (!tsk)
+		return -ESRCH;
+	ret = page_idle_proc_read(file, buf, count, ppos, tsk);
+	put_task_struct(tsk);
+	return ret;
+}
+
+/*
+ * write() handler for /proc/<pid>/page_idle.
+ *
+ * Looks up the task behind the proc inode and forwards the request to
+ * page_idle_proc_write(); the task reference is dropped before returning.
+ *
+ * Returns the value of page_idle_proc_write(), or -ESRCH if the task can
+ * no longer be found.
+ */
+static ssize_t proc_page_idle_write(struct file *file, const char __user *buf,
+				    size_t count, loff_t *ppos)
+{
+	/* ssize_t rather than int: matches the helper and this handler. */
+	ssize_t ret;
+	struct task_struct *tsk = get_proc_task(file_inode(file));
+
+	if (!tsk)
+		return -ESRCH;
+	/* The helper's prototype takes a non-const user pointer, hence the cast. */
+	ret = page_idle_proc_write(file, (char __user *)buf, count, ppos, tsk);
+	put_task_struct(tsk);
+	return ret;
+}
+
+/*
+ * open() handler for /proc/<pid>/page_idle.
+ *
+ * Resolves the target's mm through proc_mem_open() with PTRACE_MODE_READ
+ * and stores it in file->private_data for the read/write handlers.  An
+ * error pointer from proc_mem_open() is converted to its errno and
+ * returned.  The stored pointer may be NULL; the release handler and
+ * page_idle_proc_generic() both check for that.
+ */
+static int proc_page_idle_open(struct inode *inode, struct file *file)
+{
+	struct mm_struct *mm = proc_mem_open(inode, PTRACE_MODE_READ);
+
+	if (IS_ERR(mm))
+		return PTR_ERR(mm);
+	file->private_data = mm;
+	return 0;
+}
+
+/*
+ * release() handler for /proc/<pid>/page_idle.
+ *
+ * Drops the mm that the open handler stored in file->private_data, if
+ * any.  Presumably this pairs with a reference taken by proc_mem_open()
+ * -- TODO confirm.  Always returns 0.
+ */
+static int proc_page_idle_release(struct inode *inode, struct file *file)
+{
+	struct mm_struct *mm = file->private_data;
+
+	if (mm)
+		mmdrop(mm);
+	return 0;
+}
+
+/* File operations backing /proc/<pid>/page_idle. */
+const struct file_operations proc_page_idle_operations = {
+	.llseek		= mem_lseek, /* borrow this */
+	.read		= proc_page_idle_read,
+	.write		= proc_page_idle_write,
+	.open		= proc_page_idle_open,
+	.release	= proc_page_idle_release,
+};
+#endif /* CONFIG_IDLE_PAGE_TRACKING */
+
 #endif /* CONFIG_PROC_PAGE_MONITOR */
 #ifdef CONFIG_NUMA
diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index 1e894d34bdce..f1bc2640d85e 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -106,6 +106,10 @@ static inline void clear_page_idle(struct page *page)
 }
 #endif /* CONFIG_64BIT */
+ssize_t page_idle_proc_write(struct file *file,
+ÂÂÂ char __user *buf, size_t count, loff_t *ppos, struct task_struct *tsk);
+ssize_t page_idle_proc_read(struct file *file,
+ÂÂÂ char __user *buf, size_t count, loff_t *ppos, struct task_struct *tsk);
 #else /* !CONFIG_IDLE_PAGE_TRACKING */
 static inline bool page_is_young(struct page *page)
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 295512465065..874a60c41fef 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -11,6 +11,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/page_ext.h>
 #include <linux/page_idle.h>
+#include <linux/sched/mm.h>
 #define BITMAP_CHUNK_SIZE sizeof(u64)
 #define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
@@ -28,15 +29,12 @@
ÂÂ *
ÂÂ * This function tries to get a user memory page by pfn as described above.
ÂÂ */
-static struct page *page_idle_get_page(unsigned long pfn)
+static struct page *page_idle_get_page(struct page *page_in)
 {
ÂÂÂÂÂ struct page *page;
ÂÂÂÂÂ pg_data_t *pgdat;
-ÂÂÂ if (!pfn_valid(pfn))
-ÂÂÂÂÂÂÂ return NULL;
-
-ÂÂÂ page = pfn_to_page(pfn);
+ÂÂÂ page = page_in;
ÂÂÂÂÂ if (!page || !PageLRU(page) ||
ÂÂÂÂÂÂÂÂÂ !get_page_unless_zero(page))
ÂÂÂÂÂÂÂÂÂ return NULL;
@@ -51,6 +49,15 @@ static struct page *page_idle_get_page(unsigned long pfn)
ÂÂÂÂÂ return page;
 }
+/*
+ * PFN-based wrapper around page_idle_get_page().
+ *
+ * Returns NULL when @pfn is not valid; otherwise returns whatever
+ * page_idle_get_page() returns for the corresponding struct page.
+ */
+static struct page *page_idle_get_page_pfn(unsigned long pfn)
+{
+	if (!pfn_valid(pfn))
+		return NULL;
+
+	return page_idle_get_page(pfn_to_page(pfn));
+}
+
 static bool page_idle_clear_pte_refs_one(struct page *page,
ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ struct vm_area_struct *vma,
ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ unsigned long addr, void *arg)
@@ -118,6 +125,47 @@ static void page_idle_clear_pte_refs(struct page *page)
ÂÂÂÂÂÂÂÂÂ unlock_page(page);
 }
+/*
+ * Helper to get the start and end frame given a pos and count.
+ *
+ * @pos and @count are byte offsets/lengths into the idle bitmap, where
+ * each byte covers BITS_PER_BYTE frames.  Both must be multiples of
+ * BITMAP_CHUNK_SIZE.  When @mm is given the frames are virtual and are
+ * bounded by mm->task_size; when @mm is NULL they are physical and are
+ * bounded by max_pfn.
+ *
+ * On success stores the first frame in *@start and the (exclusive,
+ * clamped) last frame in *@end and returns 0.  Returns -EINVAL for an
+ * unaligned @pos or @count and -ENXIO when the start frame lies at or
+ * beyond the limit.
+ */
+static int page_idle_get_frames(loff_t pos, size_t count, struct mm_struct *mm,
+				unsigned long *start, unsigned long *end)
+{
+	unsigned long max_frame;
+
+	/* If an mm is not given, assume we want physical frames */
+	max_frame = mm ? (mm->task_size >> PAGE_SHIFT) : max_pfn;
+
+	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+		return -EINVAL;
+
+	*start = pos * BITS_PER_BYTE;
+	if (*start >= max_frame)
+		return -ENXIO;
+
+	/* Clamp rather than fail when the request runs past the limit. */
+	*end = *start + count * BITS_PER_BYTE;
+	if (*end > max_frame)
+		*end = max_frame;
+	return 0;
+}
+
+/*
+ * Report whether @page is idle after accounting for pte references.
+ *
+ * Returns false for a NULL page or a page whose idle flag is not set.
+ * If the flag is set, the pte references are cleared via
+ * page_idle_clear_pte_refs() and the flag is checked again; the page is
+ * idle only if the flag is still set afterwards.
+ */
+static bool page_really_idle(struct page *page)
+{
+	if (!page)
+		return false;
+
+	if (page_is_idle(page)) {
+		/*
+		 * The page might have been referenced via a
+		 * pte, in which case it is not idle. Clear
+		 * refs and recheck.
+		 */
+		page_idle_clear_pte_refs(page);
+		if (page_is_idle(page))
+			return true;
+	}
+
+	return false;
+}
+
 static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ struct bin_attribute *attr, char *buf,
ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ loff_t pos, size_t count)
@@ -125,35 +173,21 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
ÂÂÂÂÂ u64 *out = (u64 *)buf;
ÂÂÂÂÂ struct page *page;
ÂÂÂÂÂ unsigned long pfn, end_pfn;
-ÂÂÂ int bit;
-
-ÂÂÂ if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
-ÂÂÂÂÂÂÂ return -EINVAL;
-
-ÂÂÂ pfn = pos * BITS_PER_BYTE;
-ÂÂÂ if (pfn >= max_pfn)
-ÂÂÂÂÂÂÂ return 0;
+ÂÂÂ int bit, ret;
-ÂÂÂ end_pfn = pfn + count * BITS_PER_BYTE;
-ÂÂÂ if (end_pfn > max_pfn)
-ÂÂÂÂÂÂÂ end_pfn = max_pfn;
+ÂÂÂ ret = page_idle_get_frames(pos, count, NULL, &pfn, &end_pfn);
+ÂÂÂ if (ret == -ENXIO)
+ÂÂÂÂÂÂÂ return 0;Â /* Reads beyond max_pfn do nothing */
+ÂÂÂ else if (ret)
+ÂÂÂÂÂÂÂ return ret;
ÂÂÂÂÂ for (; pfn < end_pfn; pfn++) {
ÂÂÂÂÂÂÂÂÂ bit = pfn % BITMAP_CHUNK_BITS;
ÂÂÂÂÂÂÂÂÂ if (!bit)
ÂÂÂÂÂÂÂÂÂÂÂÂÂ *out = 0ULL;
-ÂÂÂÂÂÂÂ page = page_idle_get_page(pfn);
-ÂÂÂÂÂÂÂ if (page) {
-ÂÂÂÂÂÂÂÂÂÂÂ if (page_is_idle(page)) {
-ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ /*
-ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ * The page might have been referenced via a
-ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ * pte, in which case it is not idle. Clear
-ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ * refs and recheck.
-ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ */
-ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ page_idle_clear_pte_refs(page);
-ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ if (page_is_idle(page))
-ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ *out |= 1ULL << bit;
-ÂÂÂÂÂÂÂÂÂÂÂ }
+ÂÂÂÂÂÂÂ page = page_idle_get_page_pfn(pfn);
+ÂÂÂÂÂÂÂ if (page && page_really_idle(page)) {
+ÂÂÂÂÂÂÂÂÂÂÂ *out |= 1ULL << bit;
ÂÂÂÂÂÂÂÂÂÂÂÂÂ put_page(page);
ÂÂÂÂÂÂÂÂÂ }
ÂÂÂÂÂÂÂÂÂ if (bit == BITMAP_CHUNK_BITS - 1)
@@ -170,23 +204,16 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
ÂÂÂÂÂ const u64 *in = (u64 *)buf;
ÂÂÂÂÂ struct page *page;
ÂÂÂÂÂ unsigned long pfn, end_pfn;
-ÂÂÂ int bit;
+ÂÂÂ int bit, ret;
-ÂÂÂ if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
-ÂÂÂÂÂÂÂ return -EINVAL;
-
-ÂÂÂ pfn = pos * BITS_PER_BYTE;
-ÂÂÂ if (pfn >= max_pfn)
-ÂÂÂÂÂÂÂ return -ENXIO;
-
-ÂÂÂ end_pfn = pfn + count * BITS_PER_BYTE;
-ÂÂÂ if (end_pfn > max_pfn)
-ÂÂÂÂÂÂÂ end_pfn = max_pfn;
+ÂÂÂ ret = page_idle_get_frames(pos, count, NULL, &pfn, &end_pfn);
+ÂÂÂ if (ret)
+ÂÂÂÂÂÂÂ return ret;
ÂÂÂÂÂ for (; pfn < end_pfn; pfn++) {
ÂÂÂÂÂÂÂÂÂ bit = pfn % BITMAP_CHUNK_BITS;
ÂÂÂÂÂÂÂÂÂ if ((*in >> bit) & 1) {
-ÂÂÂÂÂÂÂÂÂÂÂ page = page_idle_get_page(pfn);
+ÂÂÂÂÂÂÂÂÂÂÂ page = page_idle_get_page_pfn(pfn);
ÂÂÂÂÂÂÂÂÂÂÂÂÂ if (page) {
ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ page_idle_clear_pte_refs(page);
ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ set_page_idle(page);
@@ -224,10 +251,208 @@ struct page_ext_operations page_idle_ops = {
 };
 #endif
+/* page_idle tracking for /proc/<pid>/page_idle */
+
+/*
+ * Pages collected by the page-table walk for /proc/<pid>/page_idle, and
+ * the lock protecting that list.  Both are private to this file.  The
+ * list head is initialised statically, so it is valid before
+ * page_idle_init() runs; the INIT_LIST_HEAD() call there is then
+ * redundant but harmless.
+ */
+static DEFINE_SPINLOCK(idle_page_list_lock);
+static LIST_HEAD(idle_page_list);
+
+/* One entry on idle_page_list: a page found by the page-table walk. */
+struct page_node {
+	struct page *page;	/* page returned by page_idle_get_page() */
+	unsigned long addr;	/* virtual address the page was found at */
+	struct list_head list;	/* linkage on idle_page_list */
+};
+
+/* Per-request state handed to the page-table walk via mm_walk.private. */
+struct page_idle_proc_priv {
+	unsigned long start_addr;	/* virtual address matching bit 0 of buffer */
+	char *buffer;			/* kernel copy of the idle bitmap */
+	int write;			/* non-zero for a write (mark idle) request */
+};
+
+/*
+ * Queue @page, mapped at @addr, on idle_page_list for later processing.
+ *
+ * For a write request the page is queued only if its bit is set in the
+ * bitmap supplied by the user (priv->buffer); for a read request every
+ * page is queued.  A reference is taken through page_idle_get_page();
+ * whoever drains the list is responsible for dropping it.  Pages that
+ * page_idle_get_page() rejects, or for which no list node can be
+ * allocated, are silently skipped.
+ */
+static void add_page_idle_list(struct page *page,
+			       unsigned long addr, struct mm_walk *walk)
+{
+	struct page *page_get;
+	struct page_node *pn;
+	struct page_idle_proc_priv *priv = walk->private;
+
+	if (priv->write) {
+		/* Find whether this page was asked to be marked */
+		const u64 *chunks = (const u64 *)priv->buffer;
+		unsigned long frames = (addr - priv->start_addr) >> PAGE_SHIFT;
+		int bit = frames % BITMAP_CHUNK_BITS;
+
+		if (((chunks[frames / BITMAP_CHUNK_BITS] >> bit) & 1) == 0)
+			return;
+	}
+
+	page_get = page_idle_get_page(page);
+	if (!page_get)
+		return;
+
+	/* Atomic allocation: the caller holds page-table and list spinlocks. */
+	pn = kmalloc(sizeof(*pn), GFP_ATOMIC);
+	if (!pn) {
+		/* Drop the reference taken above, otherwise the page leaks. */
+		put_page(page_get);
+		return;
+	}
+
+	pn->page = page_get;
+	pn->addr = addr;
+	list_add(&pn->list, &idle_page_list);
+}
+
+/*
+ * pmd_entry callback for the /proc/<pid>/page_idle page-table walk.
+ *
+ * Queues every page mapped in [@addr, @end) through
+ * add_page_idle_list().  A transparent huge pmd contributes at most one
+ * entry, recorded at @addr; otherwise each present pte that
+ * vm_normal_page() resolves to a page contributes one entry.  Always
+ * returns 0 so the walk continues.
+ */
+static int pte_page_idle_proc_range(pmd_t *pmd, unsigned long addr,
+				    unsigned long end,
+				    struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+	pte_t *pte;
+	spinlock_t *ptl;
+	struct page *page;
+
+	ptl = pmd_trans_huge_lock(pmd, vma);
+	if (ptl) {
+		if (pmd_present(*pmd)) {
+			/*
+			 * NOTE(review): FOLL_WRITE presumably causes huge
+			 * mappings that are not writable to be skipped --
+			 * confirm this is intended.
+			 */
+			page = follow_trans_huge_pmd(vma, addr, pmd,
+						     FOLL_DUMP|FOLL_WRITE);
+			if (!IS_ERR_OR_NULL(page))
+				add_page_idle_list(page, addr, walk);
+		}
+		spin_unlock(ptl);
+		return 0;
+	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	for (; addr != end; pte++, addr += PAGE_SIZE) {
+		if (!pte_present(*pte))
+			continue;
+
+		page = vm_normal_page(vma, addr, *pte);
+		if (page)
+			add_page_idle_list(page, addr, walk);
+	}
+
+	/* pte was advanced one past the last entry by the loop. */
+	pte_unmap_unlock(pte - 1, ptl);
+	return 0;
+}
+
+/*
+ * Common implementation of read and write on /proc/<pid>/page_idle.
+ *
+ * @file:  the open proc file; file->private_data holds the target mm
+ * @ubuff: user buffer holding (write) or receiving (read) the bitmap
+ * @count: size of the request in bytes, clamped to PAGE_SIZE
+ * @pos:   byte offset into the virtual-frame bitmap; advanced on success
+ * @tsk:   target task (not used here)
+ * @write: non-zero to mark pages idle, zero to report idle pages
+ *
+ * The page tables are walked with mmap_sem held for reading and the
+ * pages of interest are first collected on idle_page_list, because
+ * page_idle_clear_pte_refs() cannot be called while the walk holds the
+ * page-table lock.  The collected pages are then either marked idle
+ * (write) or tested and recorded in the output bitmap (read).
+ *
+ * Returns the number of bytes processed, 0 for a read that starts
+ * beyond the end of the address space (matching the sysfs bitmap), or a
+ * negative errno: -EINVAL (no mm, or unaligned @pos/@count), -ENOMEM,
+ * -EFAULT, -ENXIO (write beyond the end), or an error from the walk.
+ *
+ * NOTE(review): page_idle_clear_pte_refs() is called with
+ * idle_page_list_lock (a spinlock) held -- confirm it cannot sleep.
+ */
+ssize_t page_idle_proc_generic(struct file *file, char __user *ubuff,
+			       size_t count, loff_t *pos,
+			       struct task_struct *tsk, int write)
+{
+	ssize_t ret;
+	char *buffer;
+	u64 *out;
+	unsigned long start_addr, end_addr, start_frame, end_frame;
+	struct mm_struct *mm = file->private_data;
+	struct mm_walk walk = { .pmd_entry = pte_page_idle_proc_range, };
+	struct page_node *cur, *next;
+	struct page_idle_proc_priv priv;
+
+	if (!mm || !mmget_not_zero(mm))
+		return -EINVAL;
+
+	if (count > PAGE_SIZE)
+		count = PAGE_SIZE;
+
+	buffer = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buffer) {
+		ret = -ENOMEM;
+		goto out_mmput;
+	}
+	out = (u64 *)buffer;
+
+	if (write && copy_from_user(buffer, ubuff, count)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = page_idle_get_frames(*pos, count, mm, &start_frame, &end_frame);
+	if (ret) {
+		/* Like the sysfs bitmap, reads beyond the end return 0. */
+		if (ret == -ENXIO && !write)
+			ret = 0;
+		goto out;
+	}
+
+	start_addr = (start_frame << PAGE_SHIFT);
+	end_addr = (end_frame << PAGE_SHIFT);
+	priv.buffer = buffer;
+	priv.start_addr = start_addr;
+	priv.write = write;
+	walk.private = &priv;
+	walk.mm = mm;
+
+	down_read(&mm->mmap_sem);
+
+	/*
+	 * Protects the idle_page_list which is needed because
+	 * walk_page_vma() holds ptlock which deadlocks with
+	 * page_idle_clear_pte_refs(). So we have to collect all
+	 * pages first, and then call page_idle_clear_pte_refs().
+	 */
+	spin_lock(&idle_page_list_lock);
+	ret = walk_page_range(start_addr, end_addr, &walk);
+
+	list_for_each_entry_safe(cur, next, &idle_page_list, list) {
+		struct page *page = cur->page;
+
+		/* After a failed walk only release what was collected. */
+		if (likely(!ret)) {
+			if (write) {
+				page_idle_clear_pte_refs(page);
+				set_page_idle(page);
+			} else if (page_really_idle(page)) {
+				unsigned long off;
+
+				off = (cur->addr >> PAGE_SHIFT) - start_frame;
+				out[off / BITMAP_CHUNK_BITS] |=
+					1ULL << (off % BITMAP_CHUNK_BITS);
+			}
+		}
+
+		put_page(page);
+		list_del(&cur->list);
+		kfree(cur);
+	}
+	spin_unlock(&idle_page_list_lock);
+	up_read(&mm->mmap_sem);
+
+	if (ret)
+		goto out;
+
+	/*
+	 * Copy out only after dropping mmap_sem: the copy may fault on the
+	 * user buffer.  copy_to_user() returns the number of bytes left
+	 * uncopied, which must be turned into -EFAULT rather than returned.
+	 */
+	if (!write && copy_to_user(ubuff, buffer, count)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* Advance the file position so sequential read()/write() progress. */
+	*pos += count;
+	ret = count;
+out:
+	kfree(buffer);
+out_mmput:
+	mmput(mm);
+	return ret;
+}
+
+/*
+ * Read the virtual-frame idle bitmap of a process: thin wrapper that
+ * calls page_idle_proc_generic() with write == 0 and returns its result.
+ */
+ssize_t page_idle_proc_read(struct file *file, char __user *ubuff,
+			    size_t count, loff_t *pos, struct task_struct *tsk)
+{
+	return page_idle_proc_generic(file, ubuff, count, pos, tsk, 0);
+}
+
+/*
+ * Mark pages of a process idle from a user-supplied bitmap: thin wrapper
+ * that calls page_idle_proc_generic() with write == 1 and returns its
+ * result.
+ */
+ssize_t page_idle_proc_write(struct file *file, char __user *ubuff,
+			     size_t count, loff_t *pos, struct task_struct *tsk)
+{
+	return page_idle_proc_generic(file, ubuff, count, pos, tsk, 1);
+}
+
 static int __init page_idle_init(void)
 {
ÂÂÂÂÂ int err;
+ÂÂÂ INIT_LIST_HEAD(&idle_page_list);
+
ÂÂÂÂÂ err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
ÂÂÂÂÂ if (err) {
ÂÂÂÂÂÂÂÂÂ pr_err("page_idle: register sysfs failed\n");