[Tmem-devel] [RFC PATCH 3/4] tmem: preswap implementation (layered	on tmem)
    Dan Magenheimer 
    dan.magenheimer at oracle.com
       
    Fri Jun 19 18:36:05 PDT 2009
    
    
  
--- linux-2.6.30/mm/page_io.c	2009-06-09 21:05:27.000000000 -0600
+++ linux-2.6.30-tmem/mm/page_io.c	2009-06-19 09:33:59.000000000 -0600
@@ -102,6 +102,12 @@
 		unlock_page(page);
 		goto out;
 	}
+	if (preswap_put(page) == 1) {
+		set_page_writeback(page);
+		unlock_page(page);
+		end_page_writeback(page);
+		goto out;
+	}
 	bio = get_swap_bio(GFP_NOIO, page_private(page), page,
 				end_swap_bio_write);
 	if (bio == NULL) {
@@ -134,6 +140,12 @@
 		ret = -ENOMEM;
 		goto out;
 	}
+	if (preswap_get(page) == 1) {
+		SetPageUptodate(page);
+		unlock_page(page);
+		bio_put(bio);
+		goto out;
+	}
 	count_vm_event(PSWPIN);
 	submit_bio(READ, bio);
 out:
--- linux-2.6.30/mm/swapfile.c	2009-06-09 21:05:27.000000000 -0600
+++ linux-2.6.30-tmem/mm/swapfile.c	2009-06-19 16:20:14.000000000 -0600
@@ -35,7 +35,7 @@
 #include <linux/swapops.h>
 #include <linux/page_cgroup.h>
 
-static DEFINE_SPINLOCK(swap_lock);
+DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
 long nr_swap_pages;
 long total_swap_pages;
@@ -47,7 +47,7 @@
 static const char Bad_offset[] = "Bad swap offset entry ";
 static const char Unused_offset[] = "Unused swap offset entry ";
 
-static struct swap_list_t swap_list = {-1, -1};
+struct swap_list_t swap_list = {-1, -1};
 
 static struct swap_info_struct swap_info[MAX_SWAPFILES];
 
@@ -488,6 +488,7 @@
 				swap_list.next = p - swap_info;
 			nr_swap_pages++;
 			p->inuse_pages--;
+			preswap_flush(p - swap_info, offset);
 			mem_cgroup_uncharge_swap(ent);
 		}
 	}
@@ -864,7 +865,7 @@
  * Recycle to start on reaching the end, returning 0 when empty.
  */
 static unsigned int find_next_to_unuse(struct swap_info_struct *si,
-					unsigned int prev)
+				unsigned int prev, unsigned int preswap)
 {
 	unsigned int max = si->max;
 	unsigned int i = prev;
@@ -890,6 +891,12 @@
 			prev = 0;
 			i = 1;
 		}
+		if (preswap) {
+			if (preswap_test(si, i))
+				break;
+			else
+				continue;
+		}
 		count = si->swap_map[i];
 		if (count && count != SWAP_MAP_BAD)
 			break;
@@ -901,8 +908,12 @@
  * We completely avoid races by reading each swap page in advance,
  * and then search for the process using it.  All the necessary
  * page table adjustments can then be made atomically.
+ *
+ * if the boolean preswap is true, only unuse pages_to_unuse pages;
+ * pages_to_unuse==0 means all pages
  */
-static int try_to_unuse(unsigned int type)
+int try_to_unuse(unsigned int type, unsigned int preswap,
+		unsigned long pages_to_unuse)
 {
 	struct swap_info_struct * si = &swap_info[type];
 	struct mm_struct *start_mm;
@@ -938,7 +949,7 @@
 	 * one pass through swap_map is enough, but not necessarily:
 	 * there are races when an instance of an entry might be missed.
 	 */
-	while ((i = find_next_to_unuse(si, i)) != 0) {
+	while ((i = find_next_to_unuse(si, i, preswap)) != 0) {
 		if (signal_pending(current)) {
 			retval = -EINTR;
 			break;
@@ -1124,6 +1135,8 @@
 		 * interactive performance.
 		 */
 		cond_resched();
+		if (preswap && pages_to_unuse && !--pages_to_unuse)
+			break;
 	}
 
 	mmput(start_mm);
@@ -1448,7 +1461,7 @@
 	spin_unlock(&swap_lock);
 
 	current->flags |= PF_SWAPOFF;
-	err = try_to_unuse(type);
+	err = try_to_unuse(type, 0, 0);
 	current->flags &= ~PF_SWAPOFF;
 
 	if (err) {
@@ -1497,9 +1510,14 @@
 	swap_map = p->swap_map;
 	p->swap_map = NULL;
 	p->flags = 0;
+	preswap_flush_area(p - swap_info);
 	spin_unlock(&swap_lock);
 	mutex_unlock(&swapon_mutex);
 	vfree(swap_map);
+#ifdef CONFIG_PRESWAP
+	if (p->preswap_map)
+		vfree(p->preswap_map);
+#endif
 	/* Destroy swap account informatin */
 	swap_cgroup_swapoff(type);
 
@@ -1812,6 +1830,11 @@
 	}
 
 	memset(swap_map, 0, maxpages * sizeof(short));
+#ifdef CONFIG_PRESWAP
+	p->preswap_map = vmalloc(maxpages / sizeof(long));
+	if (p->preswap_map)
+		memset(p->preswap_map, 0, maxpages / sizeof(long));
+#endif
 	for (i = 0; i < swap_header->info.nr_badpages; i++) {
 		int page_nr = swap_header->info.badpages[i];
 		if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
@@ -1886,6 +1909,7 @@
 	} else {
 		swap_info[prev].next = p - swap_info;
 	}
+	preswap_init(p - swap_info);
 	spin_unlock(&swap_lock);
 	mutex_unlock(&swapon_mutex);
 	error = 0;
@@ -2002,6 +2026,8 @@
 
 	si = &swap_info[swp_type(entry)];
 	target = swp_offset(entry);
+	if (preswap_test(si, target))
+		return 0;
 	base = (target >> our_page_cluster) << our_page_cluster;
 	end = base + (1 << our_page_cluster);
 	if (!base)		/* first page is swap header */
@@ -2018,6 +2044,9 @@
 			break;
 		if (si->swap_map[toff] == SWAP_MAP_BAD)
 			break;
+		/* Don't read in preswap pages */
+		if (preswap_test(si, toff))
+			break;
 	}
 	/* Count contiguous allocated slots below our target */
 	for (toff = target; --toff >= base; nr_pages++) {
--- linux-2.6.30/include/linux/swap.h	2009-06-09 21:05:27.000000000 -0600
+++ linux-2.6.30-tmem/include/linux/swap.h	2009-06-19 12:51:55.000000000 -0600
@@ -8,6 +8,7 @@
 #include <linux/memcontrol.h>
 #include <linux/sched.h>
 #include <linux/node.h>
+#include <linux/vmalloc.h>
 
 #include <asm/atomic.h>
 #include <asm/page.h>
@@ -154,8 +155,62 @@
 	unsigned int max;
 	unsigned int inuse_pages;
 	unsigned int old_block_size;
+#ifdef CONFIG_PRESWAP
+	unsigned long *preswap_map;
+	unsigned int preswap_pages;
+#endif
 };
 
+#ifdef CONFIG_PRESWAP
+
+#include <linux/sysctl.h>
+extern int preswap_sysctl_handler(struct ctl_table *, int, struct file *,
+	void __user *, size_t *, loff_t *);
+extern const unsigned long preswap_zero, preswap_infinity;
+
+extern void preswap_shrink(unsigned long);
+extern int preswap_test(struct swap_info_struct *, unsigned long);
+extern void preswap_init(unsigned);
+extern int preswap_put(struct page *);
+extern int preswap_get(struct page *);
+extern void preswap_flush(unsigned, unsigned long);
+extern void preswap_flush_area(unsigned);
+/* in swapfile.c */
+extern int try_to_unuse(unsigned int, unsigned int, unsigned long);
+#else
+static inline void preswap_shrink(unsigned long target_pages)
+{
+}
+
+static inline int preswap_test(struct swap_info_struct *sis,
+	unsigned long offset)
+{
+	return 0;
+}
+
+static inline void preswap_init(unsigned type)
+{
+}
+
+static inline int preswap_put(struct page *page)
+{
+	return 0;
+}
+
+static inline int preswap_get(struct page *page)
+{
+	return 0;
+}
+
+static inline void preswap_flush(unsigned type, unsigned long offset)
+{
+}
+
+static inline void preswap_flush_area(unsigned type)
+{
+}
+#endif /* CONFIG_PRESWAP */
+
 struct swap_list_t {
 	int head;	/* head of priority-ordered swapfile list */
 	int next;	/* swapfile to be used next */
@@ -312,6 +367,8 @@
 extern int reuse_swap_page(struct page *);
 extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
+extern struct swap_list_t swap_list;
+extern spinlock_t swap_lock;
 
 /* linux/mm/thrash.c */
 extern struct mm_struct * swap_token_mm;
--- linux-2.6.30/mm/preswap.c	1969-12-31 17:00:00.000000000 -0700
+++ linux-2.6.30-tmem/mm/preswap.c	2009-06-19 14:55:16.000000000 -0600
@@ -0,0 +1,274 @@
+/*
+ * linux/mm/preswap.c
+ *
+ * Implements a fast "preswap" on top of the transcendent memory ("tmem") API.
+ * When a swapdisk is enabled (with swapon), a "private persistent tmem pool"
+ * is created along with a bit-per-page preswap_map.  When swapping occurs
+ * and a page is about to be written to disk, a "put" into the pool may first
+ * be attempted by passing the pageframe to be swapped, along with a "handle"
+ * consisting of a pool_id, an object id, and an index.  Since the pool is of
+ * indeterminate size, the "put" may be rejected, in which case the page
+ * is swapped to disk as normal.  If the "put" is successful, the page is
+ * copied to tmem and the preswap_map records the success.  Later, when
+ * the page needs to be swapped in, the preswap_map is checked and, if set,
+ * the page may be obtained with a "get" operation.  Note that the swap
+ * subsystem is responsible for: maintaining coherency between the swapcache,
+ * preswap, and the swapdisk; for evicting stale pages from preswap; and for
+ * emptying preswap when swapoff is performed. The "flush page" and "flush
+ * object" actions are provided for this.
+ *
+ * Note that if a "duplicate put" is performed to overwrite a page and
+ * the "put" operation fails, the page (and old data) is flushed and lost.
+ * Also note that multiple accesses to a tmem pool may be concurrent and
+ * any ordering must be guaranteed by the caller.
+ *
+ * Copyright (C) 2008,2009 Dan Magenheimer, Oracle Corp.
+ */
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sysctl.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/capability.h>
+#include <linux/uaccess.h>
+#include <linux/tmem.h>
+
+static u32 preswap_poolid = -1; /* if negative, preswap will never call tmem */
+
+const unsigned long preswap_zero = 0, preswap_infinity = ~0UL; /* for sysctl */
+
+/*
+ * Swizzling increases objects per swaptype, increasing tmem concurrency
+ * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
+ */
+#define SWIZ_BITS		4
+#define SWIZ_MASK		((1 << SWIZ_BITS) - 1)
+#define oswiz(_type, _ind)	((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
+#define iswiz(_ind)		(_ind >> SWIZ_BITS)
+
+/*
+ * preswap_map test/set/clear operations (must be atomic)
+ */
+
+int preswap_test(struct swap_info_struct *sis, unsigned long offset)
+{
+	if (!sis->preswap_map)
+		return 0;
+	return test_bit(offset % BITS_PER_LONG,
+		&sis->preswap_map[offset/BITS_PER_LONG]);
+}
+
+static inline void preswap_set(struct swap_info_struct *sis,
+				unsigned long offset)
+{
+	if (!sis->preswap_map)
+		return;
+	set_bit(offset % BITS_PER_LONG,
+		&sis->preswap_map[offset/BITS_PER_LONG]);
+}
+
+static inline void preswap_clear(struct swap_info_struct *sis,
+				unsigned long offset)
+{
+	if (!sis->preswap_map)
+		return;
+	clear_bit(offset % BITS_PER_LONG,
+		&sis->preswap_map[offset/BITS_PER_LONG]);
+}
+
+/*
+ * preswap tmem operations
+ */
+
+/* returns 1 if the page was successfully put into preswap, 0 if the page
+ * was declined, and -ERRNO for a specific error */
+int preswap_put(struct page *page)
+{
+	swp_entry_t entry = { .val = page_private(page), };
+	unsigned type = swp_type(entry);
+	pgoff_t offset = swp_offset(entry);
+	u64 ind64 = (u64)offset;
+	u32 ind = (u32)offset;
+	unsigned long pfn = page_to_pfn(page);
+	struct swap_info_struct *sis = get_swap_info_struct(type);
+	int dup = 0, ret;
+
+	if ((s32)preswap_poolid < 0)
+		return 0;
+	if (ind64 != ind)
+		return 0;
+	if (preswap_test(sis, offset))
+		dup = 1;
+	mb(); /* ensure page is quiescent; tmem may address it with an alias */
+	ret = (*tmem_ops->put_page)(preswap_poolid, oswiz(type, ind),
+		iswiz(ind), pfn);
+	if (ret == 1) {
+		preswap_set(sis, offset);
+		if (!dup)
+			sis->preswap_pages++;
+	} else if (dup) {
+		/* failed dup put always results in an automatic flush of
+		 * the (older) page from preswap */
+		preswap_clear(sis, offset);
+		sis->preswap_pages--;
+	}
+	return ret;
+}
+
+/* returns 1 if the page was successfully gotten from preswap, 0 if the page
+ * was not present (should never happen!), and -ERRNO for a specific error */
+int preswap_get(struct page *page)
+{
+	swp_entry_t entry = { .val = page_private(page), };
+	unsigned type = swp_type(entry);
+	pgoff_t offset = swp_offset(entry);
+	u64 ind64 = (u64)offset;
+	u32 ind = (u32)offset;
+	unsigned long pfn = page_to_pfn(page);
+	struct swap_info_struct *sis = get_swap_info_struct(type);
+	int ret;
+
+	if ((s32)preswap_poolid < 0)
+		return 0;
+	if (ind64 != ind)
+		return 0;
+	if (!preswap_test(sis, offset))
+		return 0;
+	ret = (*tmem_ops->get_page)(preswap_poolid, oswiz(type, ind),
+		iswiz(ind), pfn);
+	return ret;
+}
+
+/* flush a single page from preswap */
+void preswap_flush(unsigned type, unsigned long offset)
+{
+	u64 ind64 = (u64)offset;
+	u32 ind = (u32)offset;
+	struct swap_info_struct *sis = get_swap_info_struct(type);
+	int ret = 1;
+
+	if ((s32)preswap_poolid < 0)
+		return;
+	if (ind64 != ind)
+		return;
+	if (preswap_test(sis, offset)) {
+		ret = (*tmem_ops->flush_page)(preswap_poolid,
+					oswiz(type, ind), iswiz(ind));
+		sis->preswap_pages--;
+		preswap_clear(sis, offset);
+	}
+}
+
+/* flush all pages from the passed swaptype */
+void preswap_flush_area(unsigned type)
+{
+	struct swap_info_struct *sis = get_swap_info_struct(type);
+	int ind;
+
+	if ((s32)preswap_poolid < 0)
+		return;
+	for (ind = SWIZ_MASK; ind >= 0; ind--)
+		(void)(*tmem_ops->flush_object)(preswap_poolid,
+			oswiz(type, ind));
+	sis->preswap_pages = 0;
+}
+
+void preswap_init(unsigned type)
+{
+	/* only need one tmem pool for all swap types */
+	if ((s32)preswap_poolid >= 0)
+		return;
+	if (tmem_ops == NULL)
+		return;
+	preswap_poolid = (*tmem_ops->new_pool)(0, 0, TMEM_POOL_PERSIST);
+}
+
+/*
+ * preswap infrastructure functions
+ */
+
+/* code structure leveraged from sys_swapoff */
+void preswap_shrink(unsigned long target_pages)
+{
+	struct swap_info_struct *si = NULL;
+	unsigned long total_pages = 0, total_pages_to_unuse;
+	unsigned long pages = 0, unuse_pages = 0;
+	int type;
+	int wrapped = 0;
+
+	do {
+		/*
+		 * we don't want to hold swap_lock while doing a very
+		 * lengthy try_to_unuse, but swap_list may change
+		 * so restart scan from swap_list.head each time
+		 */
+		spin_lock(&swap_lock);
+		total_pages = 0;
+		for (type = swap_list.head; type >= 0; type = si->next) {
+			si = get_swap_info_struct(type);
+			total_pages += si->preswap_pages;
+		}
+		if (total_pages <= target_pages) {
+			spin_unlock(&swap_lock);
+			return;
+		}
+		total_pages_to_unuse = total_pages - target_pages;
+		for (type = swap_list.head; type >= 0; type = si->next) {
+			si = get_swap_info_struct(type);
+			if (total_pages_to_unuse < si->preswap_pages)
+				pages = unuse_pages = total_pages_to_unuse;
+			else {
+				pages = si->preswap_pages;
+				unuse_pages = 0; /* unuse all */
+			}
+			if (security_vm_enough_memory(pages))
+				continue;
+			vm_unacct_memory(pages);
+			break;
+		}
+		spin_unlock(&swap_lock);
+		if (type < 0)
+			return;
+		current->flags |= PF_SWAPOFF;
+		(void)try_to_unuse(type, 1, unuse_pages);
+		current->flags &= ~PF_SWAPOFF;
+		wrapped++;
+	} while (wrapped <= 3);
+}
+
+
+#ifdef CONFIG_SYSCTL
+/* cat /sys/proc/vm/preswap provides total number of pages in preswap
+ * across all swaptypes.  echo N > /sys/proc/vm/preswap attempts to shrink
+ * preswap page usage to N (usually 0) */
+int preswap_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	unsigned long npages;
+	int type;
+	unsigned long totalpages = 0;
+	struct swap_info_struct *si = NULL;
+
+	/* modeled after hugetlb_sysctl_handler in mm/hugetlb.c */
+	if (!write) {
+		spin_lock(&swap_lock);
+		for (type = swap_list.head; type >= 0; type = si->next) {
+			si = get_swap_info_struct(type);
+			totalpages += si->preswap_pages;
+		}
+		spin_unlock(&swap_lock);
+		npages = totalpages;
+	}
+	table->data = &npages;
+	table->maxlen = sizeof(unsigned long);
+	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+
+	if (write)
+		preswap_shrink(npages);
+
+	return 0;
+}
+#endif
--- linux-2.6.30/include/linux/sysctl.h	2009-06-09 21:05:27.000000000 -0600
+++ linux-2.6.30-tmem/include/linux/sysctl.h	2009-06-19 09:33:59.000000000 -0600
@@ -205,6 +205,7 @@
 	VM_PANIC_ON_OOM=33,	/* panic at out-of-memory */
 	VM_VDSO_ENABLED=34,	/* map VDSO into new processes? */
 	VM_MIN_SLAB=35,		 /* Percent pages ignored by zone reclaim */
+	VM_PRESWAP_PAGES=36,	/* pages/target_pages in preswap */
 };
 
 
--- linux-2.6.30/kernel/sysctl.c	2009-06-09 21:05:27.000000000 -0600
+++ linux-2.6.30-tmem/kernel/sysctl.c	2009-06-19 09:33:59.000000000 -0600
@@ -1282,6 +1282,18 @@
 		.proc_handler	= &scan_unevictable_handler,
 	},
 #endif
+#ifdef CONFIG_PRESWAP
+	{
+		.ctl_name	= VM_PRESWAP_PAGES,
+		.procname	= "preswap",
+		.data		= NULL,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &preswap_sysctl_handler,
+		.extra1		= (void *)&preswap_zero,
+		.extra2		= (void *)&preswap_infinity,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
    
    
More information about the Tmem-devel
mailing list