[Kernelfeatures-devel] [rfc] supermincore

William Lee Irwin III wli at holomorphy.com
Sat Nov 20 16:51:41 CST 2004


supermincore() is a new system call most directly intended to enable
the determination of a process' user memory footprint. Analogous to
mincore(2), supermincore(2) returns an array with one entry for each
page. It allows one process to carry out this operation on another
process. The amount of information provided is greater than mincore(2)'s, so
that it may be used to determine such things as node locality, the
file offset arrangements used with remap_file_pages(2), and the
protections used with extensions to remap_file_pages(2).

The patch below is of prototype quality and has not been tested.
For "protection" of potential users who may have missed this part of
the post, this patch doesn't link the system call into syscall tables.
This post is largely meant to elicit feedback regarding the feature set
and access methods.

The kernel internal function prototype is as follows:
asmlinkage long sys_supermincore(long pid, unsigned long * __user buf,
			unsigned long start, unsigned long end);

The entries are structured as follows:

struct {
	unsigned long	pfn,
			index,
			count,
			prot;
};

->pfn is typically the physical page frame number, or on unusual
architectures (iSeries) the kernel's "virtualized" notion of the same.
->index is the offset into the backing file as measured in PAGE_SIZE
units. ->count is the number of processes mapping the page. Finally,
->prot is the set of protections used to map the page in the process
address space, using the same convention as mmap(2) with PROT_READ etc.

The specific method intended to be used for instrumenting processes'
memory footprint is "weighted RSS". That statistic weights each page by
the reciprocal of the number of processes mapping it. It has the
property that a collection of processes sharing no user memory with
other processes has the sum of its weighted RSS's equal to the total
amount of memory used across all processes in the collection, not
including sharing. Aggregate statistics are best collected via nr_mapped
in /proc/vmstat.

Userspace instrumentation tools, testing results, and corrections of
the attached implementation will follow.


-- wli
-------------- next part --------------
Index: mm2-2.6.10-rc2/mm/supermincore.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ mm2-2.6.10-rc2/mm/supermincore.c	2004-11-20 14:04:22.001409371 -0800
@@ -0,0 +1,159 @@
+/*
+ * mm/supermincore.c
+ * (C) Nov 2004 William Irwin, Oracle
+ */
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/mman.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+#include <asm/uaccess.h>
+
+/*
+ * flush_buffer() - copy the staged entries in @bufpage out to user space.
+ * @buf:     in/out pointer to the user destination; advanced past the
+ *           copied data on success so successive flushes append rather
+ *           than overwrite (the prototype never advanced it — bug fix).
+ * @bufpage: kernel scratch page holding *offset longs of entries.
+ * @offset:  number of longs currently staged in @bufpage.
+ *
+ * Fast path: fault the destination in and try an atomic kmap + copy.
+ * Slow path: a sleeping kmap()/__copy_to_user() if the atomic copy
+ * could not complete.  Returns 0 on success, -EFAULT on failure.
+ */
+static long flush_buffer(unsigned long * __user * buf,
+				struct page *bufpage, unsigned long *offset)
+{
+	long left, len = *offset*sizeof(long);
+	unsigned long *kbuf;
+
+	if (!fault_in_pages_writeable((char *)*buf, len)) {
+		kbuf = (unsigned long *)kmap_atomic(bufpage, KM_USER0);
+		left = __copy_to_user_inatomic(*buf, kbuf, len);
+		kunmap_atomic((char *)kbuf, KM_USER0);
+		if (!left)
+			goto success;
+	}
+	/* atomic copy failed or faulted; retry with a sleeping copy */
+	kbuf = kmap(bufpage);
+	left = __copy_to_user(*buf, kbuf, len);
+	kunmap(bufpage);
+	if (left)
+		return -EFAULT;
+success:
+	/* advance past the flushed entries so the next flush appends */
+	*buf += *offset;
+	return 0;
+}
+
+/*
+ * __supermincore() - stage one 4-long entry (pfn, index, count, prot)
+ * describing @vaddr in @mm into the scratch page @bufpage, flushing the
+ * scratch page to the user buffer *buf when it fills.
+ *
+ * Called with mm->page_table_lock held; the lock is dropped and
+ * reacquired around flush_buffer(), which may fault and sleep.
+ * NOTE(review): dropping the lock mid-walk allows the page tables to
+ * change between pages — confirm this is acceptable for this interface.
+ *
+ * Returns 0 on success, -EFAULT if a flush to user space fails.
+ */
+static long __supermincore(struct mm_struct *mm, unsigned long * __user * buf,
+				struct page *bufpage, unsigned long vaddr,
+				unsigned long *offset)
+{
+	unsigned long pfn, idx, count, prot, *kbuf;
+	pml4_t *pml4;
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	/* an all-zero entry is emitted for absent/unmapped pages */
+	pfn = idx = count = prot = 0;
+	pml4 = pml4_offset(mm, vaddr);
+	if (pml4_none(*pml4))
+		goto output;
+	pgd = pml4_pgd_offset(pml4, vaddr);
+	if (!pgd || pgd_none(*pgd))
+		goto output;
+	pmd = pmd_offset(pgd, vaddr);
+	if (!pmd || pmd_none(*pmd))
+		goto output;
+	pte = pte_offset_map(pmd, vaddr);
+	if (!pte)
+		goto output;
+	else if (pte_none(*pte)) {
+		pte_unmap(pte);
+		goto output;
+	} else {
+		/* translate the pte's permission bits to mmap(2) PROT_* */
+		if (pte_read(*pte))
+			prot |= PROT_READ;
+		if (pte_exec(*pte))
+			prot |= PROT_EXEC;
+		if (pte_write(*pte))
+			prot |= PROT_WRITE;
+		if (pte_present(*pte)) {
+			/* resident page: report frame, file index, mapcount */
+			pfn = pte_pfn(*pte);
+			if (pfn_valid(pfn)) {
+				struct page *page = pfn_to_page(pfn);
+
+				idx = page->index;
+				/*
+				 * NOTE(review): _mapcount is -1-based;
+				 * page_mapcount() would give the true number
+				 * of mappers — confirm intended semantics.
+				 */
+				count = atomic_read(&page->_mapcount);
+			}
+		} else if (pte_file(*pte)) {
+			/*
+			 * Nonlinear file pte (remap_file_pages): the file
+			 * offset is encoded in the pte itself; protections
+			 * fall back to the vma's default page protections.
+			 */
+			struct vm_area_struct *vma;
+			pte_t alt_pte;
+
+			if (!(vma = find_vma(mm, vaddr))
+						|| vaddr < vma->vm_start) {
+				pte_unmap(pte);
+				goto output;
+			}
+			alt_pte = pfn_pte(0, vma->vm_page_prot);
+			idx = pte_to_pgoff(*pte);
+			if (pte_read(alt_pte))
+				prot |= PROT_READ;
+			if (pte_write(alt_pte))
+				prot |= PROT_WRITE;
+			if (pte_exec(alt_pte))
+				prot |= PROT_EXEC;
+		}
+	}
+	pte_unmap(pte);
+output:
+	/* scratch page full: drop the lock (copy may sleep) and flush */
+	if (unlikely(*offset >= PAGE_SIZE/sizeof(long))) {
+		long ret = 0;
+
+		spin_unlock(&mm->page_table_lock);
+		if (flush_buffer(buf, bufpage, offset))
+			ret = -EFAULT;
+		spin_lock(&mm->page_table_lock);
+		if (ret)
+			return ret;
+		*offset = 0;
+	}
+	/* append the 4-long entry to the scratch page */
+	kbuf = (unsigned long *)kmap_atomic(bufpage, KM_USER0);
+	kbuf[*offset] = pfn;
+	kbuf[*offset+1] = idx;
+	kbuf[*offset+2] = count;
+	kbuf[*offset+3] = prot;
+	kunmap_atomic((char *)kbuf, KM_USER0);
+	*offset += 4;
+	return 0;
+}
+
+/*
+ * supermincore() - walk [start, end) of @mm one page at a time, staging
+ * one entry per page into a scratch page and flushing it to the user
+ * buffer @buf as it fills.
+ *
+ * Returns 0 on success, -ENOMEM if no scratch page could be allocated,
+ * or -EFAULT if copying to user space fails.
+ */
+static long supermincore(struct mm_struct *mm, unsigned long * __user buf,
+				unsigned long start, unsigned long end)
+{
+	unsigned long vaddr, offset = 0;
+	struct page *page;
+	long ret = 0;
+
+	if (!(page = alloc_page(GFP_HIGHUSER)))
+		return -ENOMEM;
+
+	spin_lock(&mm->page_table_lock);
+	for (vaddr = start; vaddr < end; vaddr += PAGE_SIZE) {
+		if ((ret = __supermincore(mm, &buf, page, vaddr, &offset)))
+			break;
+	}
+	spin_unlock(&mm->page_table_lock);
+	/*
+	 * Flush any entries still staged.  The prototype ignored this
+	 * return value, silently dropping a final -EFAULT — bug fix.
+	 */
+	if (!ret && flush_buffer(&buf, page, &offset))
+		ret = -EFAULT;
+	__free_page(page);
+	return ret;
+}
+
+/*
+ * sys_supermincore() - report per-page information (pfn, file index,
+ * mapcount, protections) for the range [start, end) of the process
+ * named by @pid, writing 4-long entries to @buf.
+ *
+ * Returns 0 on success, -ENOENT if the task or its mm cannot be found,
+ * or a negative errno from the walk / user copy.
+ */
+asmlinkage long sys_supermincore(long pid, unsigned long * __user buf,
+				unsigned long start, unsigned long end)
+{
+	task_t *task;
+	struct mm_struct *mm;
+	long ret = -ENOENT;
+
+	read_lock(&tasklist_lock);
+	if ((task = find_task_by_pid(pid)))
+		get_task_struct(task);
+	read_unlock(&tasklist_lock);
+	if (task) {
+		/* get_task_mm() fails for kernel threads and exiting tasks */
+		if ((mm = get_task_mm(task))) {
+			ret = supermincore(mm, buf, start, end);
+			mmput(mm);
+		}
+		put_task_struct(task);
+	}
+	/* bug fix: the prototype returned -ENOENT here unconditionally */
+	return ret;
+}
Index: mm2-2.6.10-rc2/mm/Makefile
===================================================================
--- mm2-2.6.10-rc2.orig/mm/Makefile	2004-11-20 00:57:51.000000000 -0800
+++ mm2-2.6.10-rc2/mm/Makefile	2004-11-20 13:36:40.583983211 -0800
@@ -5,7 +5,7 @@
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
-			   vmalloc.o prio_tree.o
+			   vmalloc.o prio_tree.o supermincore.o
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   page_alloc.o page-writeback.o pdflush.o \


More information about the Kernelfeatures-devel mailing list