[Tmem-devel] [PATCH] [post-4.0] tmem: add page deduplication
    Dan Magenheimer 
    dan.magenheimer at oracle.com
       
    Tue Mar  9 15:20:29 PST 2010
    
    
  
(This is for post-4.0, but I'm posting now for feedback.)
Add "page deduplication" capability to Xen-side of tmem.
(Transparent to tmem-enabled guests.)  Ephemeral pages
that have the exact same content are "combined" so that only
one page frame is needed.  Since ephemeral pages are essentially
read-only, no C-O-W (and thus no equivalent of swapping) is
necessary.
Anybody know of any good fast (SSE2?) assembly memory compare
routines?  (See tmh_page_cmp() in the patch.)
Points of interest:
- Modifications to LRU eviction algorithm to accommodate
  dedup'ed pages
- New data structures to allow lookup of matching pages
  and track references. (Algorithm used is similar to
  that used by KSM in KVM/Linux: No hashing required.)
- Lock (and rbtree) chosen by first byte of data to
  allow reasonably high concurrency without greatly
  complicating lock management.
- Statistics added so "dedup ratio" can be monitored.
- Dedup is disabled/enabled by Xen command line option.
I'm seeing 1.08-1.52 dedup ratio for two self-ballooned guests
simultaneously/continuously building linux; that's up to a
34% reduction in physical ephemeral pages used by tmem.  Clearly
this is very workload-dependent. YMMV.  To obtain this savings,
approx double the time is spent in tmem (increasing from
roughly 0.1% to roughly 0.2%).  This compares favorably to
compression which costs approximately 10x for an approximate
savings of 50%.
Signed-off-by: Dan Magenheimer <dan.magenheimer at oracle.com>
diff -r b8d2a4134a68 tools/misc/xen-tmem-list-parse.c
--- a/tools/misc/xen-tmem-list-parse.c	Wed Mar 03 17:41:58 2010 +0000
+++ b/tools/misc/xen-tmem-list-parse.c	Tue Mar 09 14:58:37 2010 -0700
@@ -110,13 +110,20 @@ void parse_global(char *s)
     unsigned long long rtree_node_max = parse(s,"Nm");
     unsigned long long pgp_count = parse(s,"Pc");
     unsigned long long pgp_max = parse(s,"Pm");
+    unsigned long long page_count = parse(s,"Fc");
+    unsigned long long max_page_count = parse(s,"Fm");
+    unsigned long long pcd_count = parse(s,"Sc");
+    unsigned long long max_pcd_count = parse(s,"Sm");
 
     printf("total tmem ops=%llu (errors=%llu) -- tmem pages avail=%llu\n",
            total_ops, errored_ops, avail_pages);
     printf("datastructs: objs=%llu (max=%llu) pgps=%llu (max=%llu) "
-           "nodes=%llu (max=%llu)\n",
+           "nodes=%llu (max=%llu) pages=%llu (max=%llu) pcds=%llu (max=%llu) "
+           "dedup ratio=%f\n",
            obj_count, obj_max, pgp_count, pgp_max,
-           rtree_node_count, rtree_node_max);
+           rtree_node_count, rtree_node_max,
+           page_count,max_page_count,pcd_count,max_pcd_count,
+           (pcd_count==0)?0.0:(global_eph_count*1.0)/pcd_count);
     printf("misc: failed_copies=%llu alloc_failed=%llu alloc_page_failed=%llu "
            "low_mem=%llu evicted=%llu/%llu relinq=%llu/%llu, "
            "max_evicts_per_relinq=%llu, flush_pools=%llu, "
diff -r b8d2a4134a68 xen/common/tmem.c
--- a/xen/common/tmem.c	Wed Mar 03 17:41:58 2010 +0000
+++ b/xen/common/tmem.c	Tue Mar 09 14:58:37 2010 -0700
@@ -79,6 +79,7 @@ static unsigned long low_on_memory = 0;
 static unsigned long low_on_memory = 0;
 static int global_obj_count_max = 0;
 static int global_pgp_count_max = 0;
+static int global_pcd_count_max = 0;
 static int global_page_count_max = 0;
 static int global_rtree_node_count_max = 0;
 static long global_eph_count_max = 0;
@@ -108,6 +109,7 @@ DECL_CYC_COUNTER(decompress);
 
 struct tm_pool;
 struct tmem_page_descriptor;
+struct tmem_page_content_descriptor;
 struct client {
     struct list_head client_list;
     struct tm_pool *pools[MAX_POOLS_PER_DOMAIN];
@@ -219,12 +221,17 @@ struct tmem_page_descriptor {
         obj_t *obj;
         uint64_t inv_oid;  /* used for invalid list only */
     };
-    uint32_t index;
     size_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid,
                     else compressed data (cdata) */
+    uint32_t index;
+    /* must hold pcd_tree_rwlocks[firstbyte] to use pcd pointer/siblings */
+    uint16_t firstbyte; /* NON_SHAREABLE->pfp  otherwise->pcd */
+    bool_t eviction_attempted;  /* CHANGE TO lifetimes? (settable) */
+    struct list_head pcd_siblings;
     union {
         pfp_t *pfp;  /* page frame pointer */
         char *cdata; /* compressed data */
+        struct tmem_page_content_descriptor *pcd; /* page dedup */
     };
     union {
         uint64_t timestamp;
@@ -233,6 +240,16 @@ struct tmem_page_descriptor {
     DECL_SENTINEL
 };
 typedef struct tmem_page_descriptor pgp_t;
+
+struct tmem_page_content_descriptor {
+    pfp_t *pfp;
+    struct list_head pgp_list;
+    struct rb_node pcd_rb_tree_node;
+    uint32_t count;
+};
+typedef struct tmem_page_content_descriptor pcd_t;
+struct rb_root pcd_tree_roots[256]; /* choose based on first byte of page */
+rwlock_t pcd_tree_rwlocks[256]; /* poor man's concurrency for now */
 
 static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools */
 
@@ -267,6 +284,7 @@ static long global_eph_count = 0; /* ato
 static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */
 static atomic_t global_obj_count = ATOMIC_INIT(0);
 static atomic_t global_pgp_count = ATOMIC_INIT(0);
+static atomic_t global_pcd_count = ATOMIC_INIT(0);
 static atomic_t global_page_count = ATOMIC_INIT(0);
 static atomic_t global_rtree_node_count = ATOMIC_INIT(0);
 
@@ -336,6 +354,103 @@ static NOINLINE void tmem_page_free(pool
     atomic_dec_and_assert(global_page_count);
 }
 
+/************ PAGE CONTENT DESCRIPTOR MANIPULATION ROUTINES ***********/
+
+#define NOT_SHAREABLE ((uint16_t)-1UL)
+
+/* ensure pgp no longer points to pcd, nor vice-versa */
+/* take pcd rwlock unless have_pcd_rwlock is set, always unlock when done */
+static NOINLINE void pcd_disassociate(pgp_t *pgp, pool_t *pool, bool_t have_pcd_rwlock)
+{
+    pcd_t *pcd = pgp->pcd;
+    pfp_t *pfp = pgp->pcd->pfp;
+    uint16_t firstbyte = pgp->firstbyte;
+
+    ASSERT(tmh_dedup_enabled());
+    ASSERT(firstbyte != NOT_SHAREABLE);
+    ASSERT(firstbyte < 256);
+    ASSERT(!pgp->size);
+    
+    if ( have_pcd_rwlock )
+        ASSERT_WRITELOCK(&pcd_tree_rwlocks[firstbyte]);
+    else
+        tmem_write_lock(&pcd_tree_rwlocks[firstbyte]);
+    list_del_init(&pgp->pcd_siblings);
+    pgp->pcd = NULL;
+    pgp->firstbyte = NOT_SHAREABLE;
+    pgp->size = -1;
+    if ( --pcd->count )
+    {
+        tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+        return;
+    }
+
+    /* no more references to this pcd, recycle it and the physical page */
+    ASSERT(list_empty(&pcd->pgp_list));
+    pcd->pfp = NULL;
+    /* remove pcd from rbtree */
+    rb_erase(&pcd->pcd_rb_tree_node,&pcd_tree_roots[firstbyte]);
+    /* reinit the struct for safety for now */
+    RB_CLEAR_NODE(&pcd->pcd_rb_tree_node);
+    /* now free up the pcd memory */
+    tmem_free(pcd,sizeof(pcd_t),NULL);
+    atomic_dec_and_assert(global_pcd_count);
+    tmem_page_free(pool,pfp);
+    tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+}
+
+static NOINLINE int pcd_associate(pgp_t *pgp, uint8_t firstbyte)
+{
+    struct rb_node **new, *parent = NULL;
+    struct rb_root *root;
+    pcd_t *pcd;
+    int cmp;
+
+    if ( !tmh_dedup_enabled() )
+        return 0;
+    ASSERT(pgp->pfp != NULL);
+    ASSERT(pgp->obj != NULL);
+    ASSERT(pgp->obj->pool != NULL);
+    ASSERT(!pgp->obj->pool->persistent);
+    tmem_write_lock(&pcd_tree_rwlocks[firstbyte]);
+    /* look for page match */
+    root = &pcd_tree_roots[firstbyte];
+    new = &(root->rb_node);
+    while ( *new )
+    {
+        pcd = container_of(*new, pcd_t, pcd_rb_tree_node);
+        parent = *new;
+        if ( (cmp = tmh_page_cmp(pgp->pfp,pcd->pfp)) < 0 )
+            new = &((*new)->rb_left);
+        else if ( cmp > 0 )
+            new = &((*new)->rb_right);
+        else
+        {
+            /* match! free the no-longer-needed page frame from the pgp */
+            tmem_page_free(pgp->obj->pool,pgp->pfp);
+            goto match;
+        }
+    }
+    /* no match, alloc a pcd and put it in the tree */
+    if ( (pcd = tmem_malloc(pcd_t, NULL)) == NULL )
+        return -ENOMEM;
+    atomic_inc_and_max(global_pcd_count);
+    RB_CLEAR_NODE(&pcd->pcd_rb_tree_node);  /* is this necessary */
+    INIT_LIST_HEAD(&pcd->pgp_list);  /* is this necessary */
+    pcd->count = 0;
+    pcd->pfp = pgp->pfp;
+    rb_link_node(&pcd->pcd_rb_tree_node, parent, new);
+    rb_insert_color(&pcd->pcd_rb_tree_node, root);
+match:
+    pcd->count++;
+    list_add(&pgp->pcd_siblings,&pcd->pgp_list);
+    pgp->firstbyte = firstbyte;
+    pgp->eviction_attempted = 0;
+    pgp->pcd = pcd;
+    tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+    return 0;
+}
+
 /************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/
 
 /* allocate a pgp_t and associate it with an object */
@@ -353,6 +468,12 @@ static NOINLINE pgp_t *pgp_alloc(obj_t *
     INIT_LIST_HEAD(&pgp->global_eph_pages);
     INIT_LIST_HEAD(&pgp->client_eph_pages);
     pgp->pfp = NULL;
+    if ( tmh_dedup_enabled() )
+    {
+        pgp->firstbyte = NOT_SHAREABLE;
+        pgp->eviction_attempted = 0;
+        INIT_LIST_HEAD(&pgp->pcd_siblings);
+    }
     pgp->size = -1;
     pgp->index = -1;
     pgp->timestamp = get_cycles();
@@ -376,7 +497,9 @@ static NOINLINE void pgp_free_data(pgp_t
 {
     if ( pgp->pfp == NULL )
         return;
-    if ( !pgp->size )
+    if ( tmh_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE )
+        pcd_disassociate(pgp,pool,0);
+    else if ( !pgp->size )
         tmem_page_free(pgp->obj->pool,pgp->pfp);
     else
     {
@@ -987,10 +1110,56 @@ static void client_freeze(client_t *clie
 
 /************ MEMORY REVOCATION ROUTINES *******************************/
 
+static bool_t tmem_try_to_evict_pgp(pgp_t *pgp, bool_t *hold_pool_rwlock)
+{
+    obj_t *obj = pgp->obj;
+    pool_t *pool = obj->pool;
+    client_t *client = pool->client;
+    uint16_t firstbyte = pgp->firstbyte;
+
+    if ( pool->is_dying )
+        return 0;
+    if ( tmh_lock_all && !obj->no_evict )
+       return 1; 
+    if ( tmem_spin_trylock(&obj->obj_spinlock) )
+    {
+        if ( tmh_dedup_enabled() )
+        {
+            firstbyte = pgp->firstbyte;
+            if ( firstbyte ==  NOT_SHAREABLE )
+                goto obj_unlock;
+            ASSERT(firstbyte < 256);
+            if ( !tmem_write_trylock(&pcd_tree_rwlocks[firstbyte]) )
+                goto obj_unlock;
+            if ( pgp->pcd->count > 1 && !pgp->eviction_attempted )
+            {
+                pgp->eviction_attempted++;
+                list_del(&pgp->global_eph_pages);
+                list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
+                list_del(&pgp->client_eph_pages);
+                list_add_tail(&pgp->client_eph_pages,&client->ephemeral_page_list);
+                goto pcd_unlock;
+            }
+        }
+        if ( obj->pgp_count > 1 )
+            return 1;
+        if ( tmem_write_trylock(&pool->pool_rwlock) )
+        {
+            *hold_pool_rwlock = 1;
+            return 1;
+        }
+pcd_unlock:
+        tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+obj_unlock:
+        tmem_spin_unlock(&obj->obj_spinlock);
+    }
+    return 0;
+}
+
 static int tmem_evict(void)
 {
     client_t *client = tmh_client_from_current();
-    pgp_t *pgp = NULL, *pgp_del;
+    pgp_t *pgp = NULL, *pgp2, *pgp_del;
     obj_t *obj;
     pool_t *pool;
     int ret = 0;
@@ -1001,49 +1170,15 @@ static int tmem_evict(void)
     if ( (client != NULL) && client_over_quota(client) &&
          !list_empty(&client->ephemeral_page_list) )
     {
-        list_for_each_entry(pgp,&client->ephemeral_page_list,client_eph_pages)
-        {
-            obj = pgp->obj;
-            pool = obj->pool;
-            if ( pool->is_dying )
-                continue;
-            if ( tmh_lock_all && !obj->no_evict )
+        list_for_each_entry_safe(pgp,pgp2,&client->ephemeral_page_list,client_eph_pages)
+            if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) )
                 goto found;
-            if ( tmem_spin_trylock(&obj->obj_spinlock) )
-            {
-                if ( obj->pgp_count > 1 )
-                    goto found;
-                if ( tmem_write_trylock(&pool->pool_rwlock) )
-                {
-                    hold_pool_rwlock = 1;
-                    goto found;
-                }
-                tmem_spin_unlock(&obj->obj_spinlock);
-            }
-        }
     } else if ( list_empty(&global_ephemeral_page_list) ) {
         goto out;
     } else {
-        list_for_each_entry(pgp,&global_ephemeral_page_list,global_eph_pages)
-        {
-            obj = pgp->obj;
-            pool = obj->pool;
-            if ( pool->is_dying )
-                continue;
-            if ( tmh_lock_all && !obj->no_evict )
+        list_for_each_entry_safe(pgp,pgp2,&global_ephemeral_page_list,global_eph_pages)
+            if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) )
                 goto found;
-            if ( tmem_spin_trylock(&obj->obj_spinlock) )
-            {
-                if ( obj->pgp_count > 1 )
-                    goto found;
-                if ( tmem_write_trylock(&pool->pool_rwlock) )
-                {
-                    hold_pool_rwlock = 1;
-                    goto found;
-                }
-                tmem_spin_unlock(&obj->obj_spinlock);
-            }
-        }
     }
 
     ret = 0;
@@ -1057,10 +1192,16 @@ found:
     ASSERT(obj->no_evict == 0);
     ASSERT(obj->pool != NULL);
     ASSERT_SENTINEL(obj,OBJ);
+    pool = obj->pool;
 
     ASSERT_SPINLOCK(&obj->obj_spinlock);
     pgp_del = pgp_delete_from_obj(obj, pgp->index);
     ASSERT(pgp_del == pgp);
+    if ( tmh_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE )
+    {
+        ASSERT(pgp->pcd->count == 1 || pgp->eviction_attempted);
+        pcd_disassociate(pgp,pool,1);
+    }
     pgp_delete(pgp,1);
     if ( obj->pgp_count == 0 )
     {
@@ -1197,6 +1338,11 @@ copy_uncompressed:
     ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,0);
     if ( ret == -EFAULT )
         goto bad_copy;
+    if ( tmh_dedup_enabled() && !is_persistent(pool) )
+    {
+        if ( pcd_associate(pgp,tmh_get_first_byte(pgp->pfp)) == -ENOMEM )
+            goto failed_dup;
+    }
     pgp->size = 0;
 
 done:
@@ -1308,13 +1454,16 @@ copy_uncompressed:
 copy_uncompressed:
     if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
     {
-        ret == -ENOMEM;
+        ret = -ENOMEM;
         goto delete_and_free;
     }
     /* tmh_copy_from_client properly handles len==0 (TMEM_NEW_PAGE) */
     ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,cva);
     if ( ret == -EFAULT )
         goto bad_copy;
+    if ( tmh_dedup_enabled() && !is_persistent(pool) )
+        if ( pcd_associate(pgp,tmh_get_first_byte(pgp->pfp)) == -ENOMEM )
+            goto delete_and_free;
     pgp->size = 0;
 
 insert_page:
@@ -1411,6 +1560,19 @@ static NOINLINE int do_tmem_get(pool_t *
                                       pgp->size, cva) == -EFAULT )
             goto bad_copy;
         END_CYC_COUNTER(decompress);
+    }
+    else if ( tmh_dedup_enabled() && !is_persistent(pool) &&
+              pgp->firstbyte != NOT_SHAREABLE )
+    {
+        uint16_t firstbyte = pgp->firstbyte;
+        int ret;
+
+        tmem_read_lock(&pcd_tree_rwlocks[firstbyte]);
+        ret = tmh_copy_to_client(cmfn, pgp->pcd->pfp, tmem_offset,
+                                 pfn_offset, len, cva);
+        tmem_read_unlock(&pcd_tree_rwlocks[firstbyte]);
+        if ( ret == -EFAULT )
+            goto bad_copy;
     }
     else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
                                  pfn_offset, len, cva) == -EFAULT)
@@ -1855,11 +2017,14 @@ static int tmemc_list_global(tmem_cli_va
       total_flush_pool, use_long ? ',' : '\n');
     if (use_long)
         n += scnprintf(info+n,BSIZE-n,
-          "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d\n",
+          "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d,"
+          "Fc:%d,Fm:%d,Sc:%d,Sm:%d\n",
           global_eph_count, global_eph_count_max,
           _atomic_read(global_obj_count), global_obj_count_max,
           _atomic_read(global_rtree_node_count), global_rtree_node_count_max,
-          _atomic_read(global_pgp_count), global_pgp_count_max);
+          _atomic_read(global_pgp_count), global_pgp_count_max,
+          _atomic_read(global_page_count), global_page_count_max,
+          _atomic_read(global_pcd_count), global_pcd_count_max);
     if ( sum + n >= len )
         return sum;
     tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
@@ -2569,10 +2734,18 @@ EXPORT void *tmem_relinquish_pages(unsig
 /* called at hypervisor startup */
 EXPORT void init_tmem(void)
 {
+    int i;
     if ( !tmh_enabled() )
         return;
 
     radix_tree_init();
+    if ( tmh_dedup_enabled() )
+        for (i = 0; i < 256; i++ )
+        {
+            pcd_tree_roots[i] = RB_ROOT;
+            rwlock_init(&pcd_tree_rwlocks[i]);
+        }
+
     if ( tmh_init() )
     {
         printk("tmem: initialized comp=%d global-lock=%d\n",
diff -r b8d2a4134a68 xen/common/tmem_xen.c
--- a/xen/common/tmem_xen.c	Wed Mar 03 17:41:58 2010 +0000
+++ b/xen/common/tmem_xen.c	Tue Mar 09 14:58:37 2010 -0700
@@ -19,6 +19,9 @@ boolean_param("tmem", opt_tmem);
 
 EXPORT int opt_tmem_compress = 0;
 boolean_param("tmem_compress", opt_tmem_compress);
+
+EXPORT int opt_tmem_dedup = 1;
+boolean_param("tmem_dedup", opt_tmem_dedup);
 
 EXPORT int opt_tmem_shared_auth = 0;
 boolean_param("tmem_shared_auth", opt_tmem_shared_auth);
diff -r b8d2a4134a68 xen/include/xen/tmem_xen.h
--- a/xen/include/xen/tmem_xen.h	Wed Mar 03 17:41:58 2010 +0000
+++ b/xen/include/xen/tmem_xen.h	Tue Mar 09 14:58:37 2010 -0700
@@ -52,6 +52,12 @@ static inline int tmh_compression_enable
 static inline int tmh_compression_enabled(void)
 {
     return opt_tmem_compress;
+}
+
+extern int opt_tmem_dedup;
+static inline int tmh_dedup_enabled(void)
+{
+    return opt_tmem_dedup;
 }
 
 extern int opt_tmem_shared_auth;
@@ -326,6 +332,28 @@ static inline bool_t tmh_current_is_priv
     return IS_PRIV(current->domain);
 }
 
+static inline uint8_t tmh_get_first_byte(pfp_t *pfp)
+{
+    void *p = __map_domain_page(pfp);
+
+    return (uint8_t)(*(char *)p);
+}
+
+static inline int tmh_page_cmp(pfp_t *pfp1, pfp_t *pfp2)
+{
+    const uint64_t *p1 = (uint64_t *)__map_domain_page(pfp1);
+    const uint64_t *p2 = (uint64_t *)__map_domain_page(pfp2);
+    int i;
+
+    // FIXME: code in assembly?
+    for ( i = PAGE_SIZE/sizeof(uint64_t); *p1 == *p2; i--, *p1++, *p2++ );
+    if ( !i )
+        return 0;
+    if ( *p1 < *p2 )
+        return -1;
+    return 1;
+}
+
 /* these typedefs are in the public/tmem.h interface
 typedef XEN_GUEST_HANDLE(void) cli_mfn_t;
 typedef XEN_GUEST_HANDLE(char) cli_va_t;
-------------- next part --------------
A non-text attachment was scrubbed...
Name: tmem-dedup.patch
Type: application/octet-stream
Size: 17452 bytes
Desc: not available
Url : http://oss.oracle.com/pipermail/tmem-devel/attachments/20100309/07695b15/attachment-0001.obj 
    
    
More information about the Tmem-devel
mailing list