diff options
Diffstat (limited to 'drivers/staging/zcache/tmem.c')
-rw-r--r-- | drivers/staging/zcache/tmem.c | 327 |
1 files changed, 224 insertions, 103 deletions
diff --git a/drivers/staging/zcache/tmem.c b/drivers/staging/zcache/tmem.c index 56c8e606ad1c..a2b7e03b6062 100644 --- a/drivers/staging/zcache/tmem.c +++ b/drivers/staging/zcache/tmem.c @@ -1,32 +1,43 @@ /* * In-kernel transcendent memory (generic implementation) * - * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. + * Copyright (c) 2009-2012, Dan Magenheimer, Oracle Corp. * - * The primary purpose of Transcendent Memory ("tmem") is to map object-oriented + * The primary purpose of Transcedent Memory ("tmem") is to map object-oriented * "handles" (triples containing a pool id, and object id, and an index), to * pages in a page-accessible memory (PAM). Tmem references the PAM pages via * an abstract "pampd" (PAM page-descriptor), which can be operated on by a * set of functions (pamops). Each pampd contains some representation of - * PAGE_SIZE bytes worth of data. Tmem must support potentially millions of - * pages and must be able to insert, find, and delete these pages at a - * potential frequency of thousands per second concurrently across many CPUs, - * (and, if used with KVM, across many vcpus across many guests). - * Tmem is tracked with a hierarchy of data structures, organized by - * the elements in a handle-tuple: pool_id, object_id, and page index. - * One or more "clients" (e.g. guests) each provide one or more tmem_pools. - * Each pool, contains a hash table of rb_trees of tmem_objs. Each - * tmem_obj contains a radix-tree-like tree of pointers, with intermediate - * nodes called tmem_objnodes. Each leaf pointer in this tree points to - * a pampd, which is accessible only through a small set of callbacks - * registered by the PAM implementation (see tmem_register_pamops). Tmem - * does all memory allocation via a set of callbacks registered by the tmem - * host implementation (e.g. see tmem_register_hostops). + * PAGE_SIZE bytes worth of data. For those familiar with key-value stores, + * the tmem handle is a three-level hierarchical key, and the value is always + * reconstituted (but not necessarily stored) as PAGE_SIZE bytes and is + * referenced in the datastore by the pampd. The hierarchy is required + * to ensure that certain invalidation functions can be performed efficiently + * (i.e. flush all indexes associated with this object_id, or + * flush all objects associated with this pool). + * + * Tmem must support potentially millions of pages and must be able to insert, + * find, and delete these pages at a potential frequency of thousands per + * second concurrently across many CPUs, (and, if used with KVM, across many + * vcpus across many guests). Tmem is tracked with a hierarchy of data + * structures, organized by the elements in the handle-tuple: pool_id, + * object_id, and page index. One or more "clients" (e.g. guests) each + * provide one or more tmem_pools. Each pool, contains a hash table of + * rb_trees of tmem_objs. Each tmem_obj contains a radix-tree-like tree + * of pointers, with intermediate nodes called tmem_objnodes. Each leaf + * pointer in this tree points to a pampd, which is accessible only through + * a small set of callbacks registered by the PAM implementation (see + * tmem_register_pamops). Tmem only needs to memory allocation for objs + * and objnodes and this is done via a set of callbacks that must be + * registered by the tmem host implementation (e.g. see tmem_register_hostops). */ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/atomic.h> +#ifdef CONFIG_RAMSTER +#include <linux/delay.h> +#endif #include "tmem.h" @@ -51,7 +62,7 @@ void tmem_register_hostops(struct tmem_hostops *m) /* * A tmem host implementation must use this function to register - * callbacks for a page-accessible memory (PAM) implementation + * callbacks for a page-accessible memory (PAM) implementation. */ static struct tmem_pamops tmem_pamops; @@ -66,15 +77,22 @@ void tmem_register_pamops(struct tmem_pamops *m) * So an rb_tree is an ideal data structure to manage tmem_objs. But because * of the potentially huge number of tmem_objs, each pool manages a hashtable * of rb_trees to reduce search, insert, delete, and rebalancing time. - * Each hashbucket also has a lock to manage concurrent access. + * Each hashbucket also has a lock to manage concurrent access and no + * searches, inserts, or deletions can be performed unless the lock is held. + * As a result, care must be taken to ensure tmem routines are not called + * recursively; the vast majority of the time, a recursive call may work + * but a deadlock will occur a small fraction of the time due to the + * hashbucket lock. * - * The following routines manage tmem_objs. When any tmem_obj is accessed, - * the hashbucket lock must be held. + * The following routines manage tmem_objs. In all of these routines, + * the hashbucket lock is already held. */ -static struct tmem_obj -*__tmem_obj_find(struct tmem_hashbucket*hb, struct tmem_oid *oidp, - struct rb_node **parent, struct rb_node ***link) +/* Search for object==oid in pool, returns object if found. */ +static struct tmem_obj *__tmem_obj_find(struct tmem_hashbucket *hb, + struct tmem_oid *oidp, + struct rb_node **parent, + struct rb_node ***link) { struct rb_node *_parent = NULL, **rbnode; struct tmem_obj *obj = NULL; @@ -101,23 +119,20 @@ static struct tmem_obj *parent = _parent; if (link) *link = rbnode; - obj = NULL; out: return obj; } - -/* searches for object==oid in pool, returns locked object if found */ static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb, struct tmem_oid *oidp) { return __tmem_obj_find(hb, oidp, NULL, NULL); } -static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *); +static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *, bool); -/* free an object that has no more pampds in it */ +/* Free an object that has no more pampds in it. */ static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb) { struct tmem_pool *pool; @@ -128,7 +143,7 @@ static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb) pool = obj->pool; BUG_ON(pool == NULL); if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */ - tmem_pampd_destroy_all_in_obj(obj); + tmem_pampd_destroy_all_in_obj(obj, false); BUG_ON(obj->objnode_tree_root != NULL); BUG_ON((long)obj->objnode_count != 0); atomic_dec(&pool->obj_count); @@ -140,7 +155,7 @@ static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb) } /* - * initialize, and insert an tmem_object_root (called only if find failed) + * Initialize, and insert an tmem_object_root (called only if find failed). */ static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb, struct tmem_pool *pool, @@ -157,7 +172,10 @@ static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb, obj->oid = *oidp; obj->objnode_count = 0; obj->pampd_count = 0; - (*tmem_pamops.new_obj)(obj); +#ifdef CONFIG_RAMSTER + if (tmem_pamops.new_obj != NULL) + (*tmem_pamops.new_obj)(obj); +#endif SET_SENTINEL(obj, OBJ); if (__tmem_obj_find(hb, oidp, &parent, &new)) @@ -172,7 +190,7 @@ static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb, * "ephemeral" vs "persistent". These attributes apply to all tmem_objs * and all pampds that belong to a tmem_pool. A tmem_pool is created * or deleted relatively rarely (for example, when a filesystem is - * mounted or unmounted. + * mounted or unmounted). */ /* flush all data from a pool and, optionally, free it */ @@ -190,7 +208,7 @@ static void tmem_pool_flush(struct tmem_pool *pool, bool destroy) while (rbnode != NULL) { obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node); rbnode = rb_next(rbnode); - tmem_pampd_destroy_all_in_obj(obj); + tmem_pampd_destroy_all_in_obj(obj, true); tmem_obj_free(obj, hb); (*tmem_hostops.obj_free)(obj, pool); } @@ -276,7 +294,7 @@ static void tmem_objnode_free(struct tmem_objnode *objnode) } /* - * lookup index in object and return associated pampd (or NULL if not found) + * Lookup index in object and return associated pampd (or NULL if not found). */ static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index) { @@ -318,8 +336,9 @@ static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index) return slot != NULL ? *slot : NULL; } +#ifdef CONFIG_RAMSTER static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index, - void *new_pampd) + void *new_pampd, bool no_free) { struct tmem_objnode **slot; void *ret = NULL; @@ -328,11 +347,14 @@ static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index, if ((slot != NULL) && (*slot != NULL)) { void *old_pampd = *(void **)slot; *(void **)slot = new_pampd; - (*tmem_pamops.free)(old_pampd, obj->pool, NULL, 0); + if (!no_free) + (*tmem_pamops.free)(old_pampd, obj->pool, + NULL, 0, false); ret = new_pampd; } return ret; } +#endif static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index, void *pampd) @@ -470,7 +492,7 @@ out: return slot; } -/* recursively walk the objnode_tree destroying pampds and objnodes */ +/* Recursively walk the objnode_tree destroying pampds and objnodes. */ static void tmem_objnode_node_destroy(struct tmem_obj *obj, struct tmem_objnode *objnode, unsigned int ht) @@ -484,7 +506,7 @@ static void tmem_objnode_node_destroy(struct tmem_obj *obj, if (ht == 1) { obj->pampd_count--; (*tmem_pamops.free)(objnode->slots[i], - obj->pool, NULL, 0); + obj->pool, NULL, 0, true); objnode->slots[i] = NULL; continue; } @@ -495,13 +517,15 @@ static void tmem_objnode_node_destroy(struct tmem_obj *obj, } } -static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj) +static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj, + bool pool_destroy) { if (obj->objnode_tree_root == NULL) return; if (obj->objnode_tree_height == 0) { obj->pampd_count--; - (*tmem_pamops.free)(obj->objnode_tree_root, obj->pool, NULL, 0); + (*tmem_pamops.free)(obj->objnode_tree_root, + obj->pool, NULL, 0, true); } else { tmem_objnode_node_destroy(obj, obj->objnode_tree_root, obj->objnode_tree_height); @@ -509,7 +533,10 @@ static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj) obj->objnode_tree_height = 0; } obj->objnode_tree_root = NULL; - (*tmem_pamops.free_obj)(obj->pool, obj); +#ifdef CONFIG_RAMSTER + if (tmem_pamops.free_obj != NULL) + (*tmem_pamops.free_obj)(obj->pool, obj, pool_destroy); +#endif } /* @@ -522,17 +549,16 @@ static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj) */ /* - * "Put" a page, e.g. copy a page from the kernel into newly allocated - * PAM space (if such space is available). Tmem_put is complicated by - * a corner case: What if a page with matching handle already exists in - * tmem? To guarantee coherency, one of two actions is necessary: Either - * the data for the page must be overwritten, or the page must be - * "flushed" so that the data is not accessible to a subsequent "get". - * Since these "duplicate puts" are relatively rare, this implementation - * always flushes for simplicity. + * "Put" a page, e.g. associate the passed pampd with the passed handle. + * Tmem_put is complicated by a corner case: What if a page with matching + * handle already exists in tmem? To guarantee coherency, one of two + * actions is necessary: Either the data for the page must be overwritten, + * or the page must be "flushed" so that the data is not accessible to a + * subsequent "get". Since these "duplicate puts" are relatively rare, + * this implementation always flushes for simplicity. */ int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, - char *data, size_t size, bool raw, bool ephemeral) + bool raw, void *pampd_to_use) { struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL; void *pampd = NULL, *pampd_del = NULL; @@ -548,7 +574,7 @@ int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, /* if found, is a dup put, flush the old one */ pampd_del = tmem_pampd_delete_from_obj(obj, index); BUG_ON(pampd_del != pampd); - (*tmem_pamops.free)(pampd, pool, oidp, index); + (*tmem_pamops.free)(pampd, pool, oidp, index, true); if (obj->pampd_count == 0) { objnew = obj; objfound = NULL; @@ -565,21 +591,19 @@ int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, } BUG_ON(obj == NULL); BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound)); - pampd = (*tmem_pamops.create)(data, size, raw, ephemeral, - obj->pool, &obj->oid, index); - if (unlikely(pampd == NULL)) - goto free; + pampd = pampd_to_use; + BUG_ON(pampd_to_use == NULL); ret = tmem_pampd_add_to_obj(obj, index, pampd); if (unlikely(ret == -ENOMEM)) /* may have partially built objnode tree ("stump") */ goto delete_and_free; + (*tmem_pamops.create_finish)(pampd, is_ephemeral(pool)); goto out; delete_and_free: (void)tmem_pampd_delete_from_obj(obj, index); -free: if (pampd) - (*tmem_pamops.free)(pampd, pool, NULL, 0); + (*tmem_pamops.free)(pampd, pool, NULL, 0, true); if (objnew) { tmem_obj_free(objnew, hb); (*tmem_hostops.obj_free)(objnew, pool); @@ -589,35 +613,160 @@ out: return ret; } +#ifdef CONFIG_RAMSTER +/* + * For ramster only: The following routines provide a two-step sequence + * to allow the caller to replace a pampd in the tmem data structures with + * another pampd. Here, we lookup the passed handle and, if found, return the + * associated pampd and object, leaving the hashbucket locked and returning + * a reference to it. The caller is expected to immediately call the + * matching tmem_localify_finish routine which will handles the replacement + * and unlocks the hashbucket. + */ +void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp, + uint32_t index, struct tmem_obj **ret_obj, + void **saved_hb) +{ + struct tmem_hashbucket *hb; + struct tmem_obj *obj = NULL; + void *pampd = NULL; + + hb = &pool->hashbucket[tmem_oid_hash(oidp)]; + spin_lock(&hb->lock); + obj = tmem_obj_find(hb, oidp); + if (likely(obj != NULL)) + pampd = tmem_pampd_lookup_in_obj(obj, index); + *ret_obj = obj; + *saved_hb = (void *)hb; + /* note, hashbucket remains locked */ + return pampd; +} + +void tmem_localify_finish(struct tmem_obj *obj, uint32_t index, + void *pampd, void *saved_hb, bool delete) +{ + struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb; + + BUG_ON(!spin_is_locked(&hb->lock)); + if (pampd != NULL) { + BUG_ON(obj == NULL); + (void)tmem_pampd_replace_in_obj(obj, index, pampd, 1); + (*tmem_pamops.create_finish)(pampd, is_ephemeral(obj->pool)); + } else if (delete) { + BUG_ON(obj == NULL); + (void)tmem_pampd_delete_from_obj(obj, index); + } + spin_unlock(&hb->lock); +} + +/* + * For ramster only. Helper function to support asynchronous tmem_get. + */ +static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb, + struct tmem_pool *pool, struct tmem_oid *oidp, + uint32_t index, bool free, char *data) +{ + void *old_pampd = *ppampd, *new_pampd = NULL; + bool intransit = false; + int ret = 0; + + if (!is_ephemeral(pool)) + new_pampd = (*tmem_pamops.repatriate_preload)( + old_pampd, pool, oidp, index, &intransit); + if (intransit) + ret = -EAGAIN; + else if (new_pampd != NULL) + *ppampd = new_pampd; + /* must release the hb->lock else repatriate can't sleep */ + spin_unlock(&hb->lock); + if (!intransit) + ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool, + oidp, index, free, data); + if (ret == -EAGAIN) { + /* rare I think, but should cond_resched()??? */ + usleep_range(10, 1000); + } else if (ret == -ENOTCONN || ret == -EHOSTDOWN) { + ret = -1; + } else if (ret != 0 && ret != -ENOENT) { + ret = -1; + } + /* note hb->lock has now been unlocked */ + return ret; +} + /* - * "Get" a page, e.g. if one can be found, copy the tmem page with the - * matching handle from PAM space to the kernel. By tmem definition, - * when a "get" is successful on an ephemeral page, the page is "flushed", - * and when a "get" is successful on a persistent page, the page is retained - * in tmem. Note that to preserve + * For ramster only. If a page in tmem matches the handle, replace the + * page so that any subsequent "get" gets the new page. Returns 0 if + * there was a page to replace, else returns -1. + */ +int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp, + uint32_t index, void *new_pampd) +{ + struct tmem_obj *obj; + int ret = -1; + struct tmem_hashbucket *hb; + + hb = &pool->hashbucket[tmem_oid_hash(oidp)]; + spin_lock(&hb->lock); + obj = tmem_obj_find(hb, oidp); + if (obj == NULL) + goto out; + new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0); + /* if we bug here, pamops wasn't properly set up for ramster */ + BUG_ON(tmem_pamops.replace_in_obj == NULL); + ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj); +out: + spin_unlock(&hb->lock); + return ret; +} +#endif + +/* + * "Get" a page, e.g. if a pampd can be found matching the passed handle, + * use a pamops callback to recreated the page from the pampd with the + * matching handle. By tmem definition, when a "get" is successful on + * an ephemeral page, the page is "flushed", and when a "get" is successful + * on a persistent page, the page is retained in tmem. Note that to preserve * coherency, "get" can never be skipped if tmem contains the data. * That is, if a get is done with a certain handle and fails, any * subsequent "get" must also fail (unless of course there is a * "put" done with the same handle). - */ int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, - char *data, size_t *size, bool raw, int get_and_free) + char *data, size_t *sizep, bool raw, int get_and_free) { struct tmem_obj *obj; - void *pampd; + void *pampd = NULL; bool ephemeral = is_ephemeral(pool); int ret = -1; struct tmem_hashbucket *hb; bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral); bool lock_held = false; + void **ppampd; - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - lock_held = true; - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; + do { + hb = &pool->hashbucket[tmem_oid_hash(oidp)]; + spin_lock(&hb->lock); + lock_held = true; + obj = tmem_obj_find(hb, oidp); + if (obj == NULL) + goto out; + ppampd = __tmem_pampd_lookup_in_obj(obj, index); + if (ppampd == NULL) + goto out; +#ifdef CONFIG_RAMSTER + if ((tmem_pamops.is_remote != NULL) && + tmem_pamops.is_remote(*ppampd)) { + ret = tmem_repatriate(ppampd, hb, pool, oidp, + index, free, data); + /* tmem_repatriate releases hb->lock */ + lock_held = false; + *sizep = PAGE_SIZE; + if (ret != -EAGAIN) + goto out; + } +#endif + } while (ret == -EAGAIN); if (free) pampd = tmem_pampd_delete_from_obj(obj, index); else @@ -631,16 +780,12 @@ int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, obj = NULL; } } - if (tmem_pamops.is_remote(pampd)) { - lock_held = false; - spin_unlock(&hb->lock); - } if (free) ret = (*tmem_pamops.get_data_and_free)( - data, size, raw, pampd, pool, oidp, index); + data, sizep, raw, pampd, pool, oidp, index); else ret = (*tmem_pamops.get_data)( - data, size, raw, pampd, pool, oidp, index); + data, sizep, raw, pampd, pool, oidp, index); if (ret < 0) goto out; ret = 0; @@ -671,7 +816,7 @@ int tmem_flush_page(struct tmem_pool *pool, pampd = tmem_pampd_delete_from_obj(obj, index); if (pampd == NULL) goto out; - (*tmem_pamops.free)(pampd, pool, oidp, index); + (*tmem_pamops.free)(pampd, pool, oidp, index, true); if (obj->pampd_count == 0) { tmem_obj_free(obj, hb); (*tmem_hostops.obj_free)(obj, pool); @@ -684,30 +829,6 @@ out: } /* - * If a page in tmem matches the handle, replace the page so that any - * subsequent "get" gets the new page. Returns 0 if - * there was a page to replace, else returns -1. - */ -int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp, - uint32_t index, void *new_pampd) -{ - struct tmem_obj *obj; - int ret = -1; - struct tmem_hashbucket *hb; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd); - ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj); -out: - spin_unlock(&hb->lock); - return ret; -} - -/* * "Flush" all pages in tmem matching this oid. */ int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp) @@ -721,7 +842,7 @@ int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp) obj = tmem_obj_find(hb, oidp); if (obj == NULL) goto out; - tmem_pampd_destroy_all_in_obj(obj); + tmem_pampd_destroy_all_in_obj(obj, false); tmem_obj_free(obj, hb); (*tmem_hostops.obj_free)(obj, pool); ret = 0; |