/*
 * In-kernel transcendent memory (generic implementation)
 *
 * Copyright (c) 2009-2012, Dan Magenheimer, Oracle Corp.
 *
 * The primary purpose of Transcendent Memory ("tmem") is to map
 * object-oriented "handles" (triples containing a pool id, an object id,
 * and an index) to pages in a page-accessible memory (PAM).  Tmem
 * references the PAM pages via an abstract "pampd" (PAM page-descriptor),
 * which can be operated on by a set of functions (pamops).  Each pampd
 * contains some representation of PAGE_SIZE bytes worth of data.  For those
 * familiar with key-value stores, the tmem handle is a three-level
 * hierarchical key, and the value is always reconstituted (but not
 * necessarily stored) as PAGE_SIZE bytes and is referenced in the datastore
 * by the pampd.  The hierarchy is required to ensure that certain
 * invalidation functions can be performed efficiently (i.e. flush all
 * indexes associated with this object_id, or flush all objects associated
 * with this pool).
 *
 * Tmem must support potentially millions of pages and must be able to
 * insert, find, and delete these pages at a potential frequency of
 * thousands per second concurrently across many CPUs (and, if used with
 * KVM, across many vcpus across many guests).  Tmem is tracked with a
 * hierarchy of data structures, organized by the elements in the
 * handle-tuple: pool_id, object_id, and page index.  One or more "clients"
 * (e.g. guests) each provide one or more tmem_pools.  Each pool contains a
 * hash table of rb_trees of tmem_objs.  Each tmem_obj contains a
 * radix-tree-like tree of pointers, with intermediate nodes called
 * tmem_objnodes.  Each leaf pointer in this tree points to a pampd, which
 * is accessible only through a small set of callbacks registered by the PAM
 * implementation (see tmem_register_pamops).  Tmem only needs memory
 * allocation for objs and objnodes, and this is done via a set of callbacks
 * that must be registered by the tmem host implementation (e.g. see
 * tmem_register_hostops).
 */
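
/*
 * The three levels of the handle map directly onto the invalidation
 * entry points implemented below:
 *
 *   (pool_id, object_id, index)  ->  one page    ->  tmem_flush_page()
 *   (pool_id, object_id)         ->  one object  ->  tmem_flush_object()
 *   (pool_id)                    ->  whole pool  ->  tmem_destroy_pool()
 */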

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/export.h>
#if defined(CONFIG_RAMSTER) || defined(CONFIG_RAMSTER_MODULE)
#include <linux/delay.h>
#endif

#include "tmem.h"

/* data structure sentinels used for debugging... see tmem.h */
#define POOL_SENTINEL 0x87658765
#define OBJ_SENTINEL 0x12345678
#define OBJNODE_SENTINEL 0xfedcba09
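
/*
 * Usage pattern in this file: SET_SENTINEL() stamps the magic value when a
 * structure is initialized, ASSERT_SENTINEL() checks it on every use, and
 * INVERT_SENTINEL() spoils it on teardown so that use-after-free is caught.
 * The macros themselves live in tmem.h and may be compiled out there.
 */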

/*
 * A tmem host implementation must use this function to register callbacks
 * for memory allocation.
 */
static struct tmem_hostops tmem_hostops;

static void tmem_objnode_tree_init(void);

void tmem_register_hostops(struct tmem_hostops *m)
{
        tmem_objnode_tree_init();
        tmem_hostops = *m;
}

/*
 * A tmem host implementation must use this function to register
 * callbacks for a page-accessible memory (PAM) implementation.
 */
static struct tmem_pamops tmem_pamops;

void tmem_register_pamops(struct tmem_pamops *m)
{
        tmem_pamops = *m;
}

/*
 * Oids are potentially very sparse and tmem_objs may have an indeterminately
 * short life, being added and deleted at a relatively high frequency.
 * So an rb_tree is an ideal data structure to manage tmem_objs.  But because
 * of the potentially huge number of tmem_objs, each pool manages a hashtable
 * of rb_trees to reduce search, insert, delete, and rebalancing time.
 * Each hashbucket also has a lock to manage concurrent access and no
 * searches, inserts, or deletions can be performed unless the lock is held.
 * As a result, care must be taken to ensure tmem routines are not called
 * recursively; the vast majority of the time, a recursive call may work
 * but a deadlock will occur a small fraction of the time due to the
 * hashbucket lock.
 *
 * The following routines manage tmem_objs.  In all of these routines,
 * the hashbucket lock is already held.
 */
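
/*
 * Typical caller pattern for the tmem core operations further below
 * (put/get/flush): hash the oid to pick a bucket, take that bucket's
 * lock, then find and operate on the object:
 *
 *      hb = &pool->hashbucket[tmem_oid_hash(oidp)];
 *      spin_lock(&hb->lock);
 *      obj = tmem_obj_find(hb, oidp);
 *      ...
 *      spin_unlock(&hb->lock);
 */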

/* Search for object==oid in pool; returns the object if found. */
static struct tmem_obj *__tmem_obj_find(struct tmem_hashbucket *hb,
                                        struct tmem_oid *oidp,
                                        struct rb_node **parent,
                                        struct rb_node ***link)
{
        struct rb_node *_parent = NULL, **rbnode;
        struct tmem_obj *obj = NULL;

        rbnode = &hb->obj_rb_root.rb_node;
        while (*rbnode) {
                BUG_ON(RB_EMPTY_NODE(*rbnode));
                _parent = *rbnode;
                obj = rb_entry(*rbnode, struct tmem_obj,
                               rb_tree_node);
                switch (tmem_oid_compare(oidp, &obj->oid)) {
                case 0: /* equal */
                        goto out;
                case -1:
                        rbnode = &(*rbnode)->rb_left;
                        break;
                case 1:
                        rbnode = &(*rbnode)->rb_right;
                        break;
                }
        }

        if (parent)
                *parent = _parent;
        if (link)
                *link = rbnode;
        obj = NULL;
out:
        return obj;
}

static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
                                        struct tmem_oid *oidp)
{
        return __tmem_obj_find(hb, oidp, NULL, NULL);
}

static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *, bool);

/* Free an object that has no more pampds in it. */
static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
{
        struct tmem_pool *pool;

        BUG_ON(obj == NULL);
        ASSERT_SENTINEL(obj, OBJ);
        BUG_ON(obj->pampd_count > 0);
        pool = obj->pool;
        BUG_ON(pool == NULL);
        if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
                tmem_pampd_destroy_all_in_obj(obj, false);
        BUG_ON(obj->objnode_tree_root != NULL);
        BUG_ON((long)obj->objnode_count != 0);
        atomic_dec(&pool->obj_count);
        BUG_ON(atomic_read(&pool->obj_count) < 0);
        INVERT_SENTINEL(obj, OBJ);
        obj->pool = NULL;
        tmem_oid_set_invalid(&obj->oid);
        rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
}

/*
 * Initialize and insert a tmem_object_root (called only if find failed).
 */
static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
                                        struct tmem_pool *pool,
                                        struct tmem_oid *oidp)
{
        struct rb_root *root = &hb->obj_rb_root;
        struct rb_node **new = NULL, *parent = NULL;

        BUG_ON(pool == NULL);
        atomic_inc(&pool->obj_count);
        obj->objnode_tree_height = 0;
        obj->objnode_tree_root = NULL;
        obj->pool = pool;
        obj->oid = *oidp;
        obj->objnode_count = 0;
        obj->pampd_count = 0;
#ifdef CONFIG_RAMSTER
        if (tmem_pamops.new_obj != NULL)
                (*tmem_pamops.new_obj)(obj);
#endif
        SET_SENTINEL(obj, OBJ);

        if (__tmem_obj_find(hb, oidp, &parent, &new))
                BUG();

        rb_link_node(&obj->rb_tree_node, parent, new);
        rb_insert_color(&obj->rb_tree_node, root);
}

/*
 * Tmem is managed as a set of tmem_pools with certain attributes, such as
 * "ephemeral" vs "persistent".  These attributes apply to all tmem_objs
 * and all pampds that belong to a tmem_pool.  A tmem_pool is created
 * or deleted relatively rarely (for example, when a filesystem is
 * mounted or unmounted).
 */

/* flush all data from a pool and, optionally, free it */
static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
{
        struct rb_node *rbnode;
        struct tmem_obj *obj;
        struct tmem_hashbucket *hb = &pool->hashbucket[0];
        int i;

        BUG_ON(pool == NULL);
        for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
                spin_lock(&hb->lock);
                rbnode = rb_first(&hb->obj_rb_root);
                while (rbnode != NULL) {
                        obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
                        rbnode = rb_next(rbnode);
                        tmem_pampd_destroy_all_in_obj(obj, true);
                        tmem_obj_free(obj, hb);
                        (*tmem_hostops.obj_free)(obj, pool);
                }
                spin_unlock(&hb->lock);
        }
        if (destroy)
                list_del(&pool->pool_list);
}

/*
 * A tmem_obj contains a radix-tree-like tree in which the intermediate
 * nodes are called tmem_objnodes.  (The kernel lib/radix-tree.c implementation
 * is very specialized and tuned for specific uses and is not particularly
 * suited for use from this code, though some code from the core algorithms has
 * been reused, thus the copyright notices below).  Each tmem_objnode contains
 * a set of pointers which point to either a set of intermediate tmem_objnodes
 * or a set of pampds.
 *
 * Portions Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
 */

struct tmem_objnode_tree_path {
        struct tmem_objnode *objnode;
        int offset;
};

/* objnode height_to_maxindex translation */
static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];

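/*
 * tmem_objnode_tree_h2max[ht] is the largest index reachable by a tree of
 * height ht: 0 for a height-0 tree (the root slot holds a single pampd),
 * then 2^(ht * OBJNODE_TREE_MAP_SHIFT) - 1, saturating at ~0UL once the
 * shifted width reaches OBJNODE_TREE_INDEX_BITS.  For example, with a map
 * shift of 6 this gives 0, 63, 4095, 262143, ...
 */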
static void tmem_objnode_tree_init(void)
{
        unsigned int ht, tmp;

        for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
                tmp = ht * OBJNODE_TREE_MAP_SHIFT;
                if (tmp >= OBJNODE_TREE_INDEX_BITS)
                        tmem_objnode_tree_h2max[ht] = ~0UL;
                else
                        tmem_objnode_tree_h2max[ht] =
                            (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
        }
}

static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
{
        struct tmem_objnode *objnode;

        ASSERT_SENTINEL(obj, OBJ);
        BUG_ON(obj->pool == NULL);
        ASSERT_SENTINEL(obj->pool, POOL);
        objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
        if (unlikely(objnode == NULL))
                goto out;
        objnode->obj = obj;
        SET_SENTINEL(objnode, OBJNODE);
        memset(&objnode->slots, 0, sizeof(objnode->slots));
        objnode->slots_in_use = 0;
        obj->objnode_count++;
out:
        return objnode;
}

static void tmem_objnode_free(struct tmem_objnode *objnode)
{
        struct tmem_pool *pool;
        int i;

        BUG_ON(objnode == NULL);
        for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
                BUG_ON(objnode->slots[i] != NULL);
        ASSERT_SENTINEL(objnode, OBJNODE);
        INVERT_SENTINEL(objnode, OBJNODE);
        BUG_ON(objnode->obj == NULL);
        ASSERT_SENTINEL(objnode->obj, OBJ);
        pool = objnode->obj->pool;
        BUG_ON(pool == NULL);
        ASSERT_SENTINEL(pool, POOL);
        objnode->obj->objnode_count--;
        objnode->obj = NULL;
        (*tmem_hostops.objnode_free)(objnode, pool);
}

/*
 * Lookup index in object and return associated pampd (or NULL if not found).
 */
static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
        unsigned int height, shift;
        struct tmem_objnode **slot = NULL;

        BUG_ON(obj == NULL);
        ASSERT_SENTINEL(obj, OBJ);
        BUG_ON(obj->pool == NULL);
        ASSERT_SENTINEL(obj->pool, POOL);

        height = obj->objnode_tree_height;
        if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
                goto out;
        if (height == 0 && obj->objnode_tree_root) {
                slot = &obj->objnode_tree_root;
                goto out;
        }
        shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
        slot = &obj->objnode_tree_root;
        while (height > 0) {
                if (*slot == NULL)
                        goto out;
                slot = (struct tmem_objnode **)
                        ((*slot)->slots +
                         ((index >> shift) & OBJNODE_TREE_MAP_MASK));
                shift -= OBJNODE_TREE_MAP_SHIFT;
                height--;
        }
out:
        return slot != NULL ? (void **)slot : NULL;
}

static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
        struct tmem_objnode **slot;

        slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
        return slot != NULL ? *slot : NULL;
}

#ifdef CONFIG_RAMSTER
static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
                                        void *new_pampd, bool no_free)
{
        struct tmem_objnode **slot;
        void *ret = NULL;

        slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
        if ((slot != NULL) && (*slot != NULL)) {
                void *old_pampd = *(void **)slot;
                *(void **)slot = new_pampd;
                if (!no_free)
                        (*tmem_pamops.free)(old_pampd, obj->pool,
                                                NULL, 0, false);
                ret = new_pampd;
        }
        return ret;
}
#endif

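/*
 * Insert "pampd" at "index" in the object's objnode tree.  Works in two
 * phases: first, if the index lies beyond what the current tree height can
 * address, grow the tree by pushing the old root down under new root
 * objnodes; then walk from the root toward the leaf level, allocating any
 * missing intermediate objnodes, and store the pampd in the leaf slot.
 * Returns 0 on success or -ENOMEM if an objnode allocation fails (possibly
 * leaving a partially built "stump" that the caller must clean up).
 */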
static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
                                        void *pampd)
{
        int ret = 0;
        struct tmem_objnode *objnode = NULL, *newnode, *slot;
        unsigned int height, shift;
        int offset = 0;

        /* if necessary, extend the tree to be higher */
        if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
                height = obj->objnode_tree_height + 1;
                if (index > tmem_objnode_tree_h2max[height])
                        while (index > tmem_objnode_tree_h2max[height])
                                height++;
                if (obj->objnode_tree_root == NULL) {
                        obj->objnode_tree_height = height;
                        goto insert;
                }
                do {
                        newnode = tmem_objnode_alloc(obj);
                        if (!newnode) {
                                ret = -ENOMEM;
                                goto out;
                        }
                        newnode->slots[0] = obj->objnode_tree_root;
                        newnode->slots_in_use = 1;
                        obj->objnode_tree_root = newnode;
                        obj->objnode_tree_height++;
                } while (height > obj->objnode_tree_height);
        }
insert:
        slot = obj->objnode_tree_root;
        height = obj->objnode_tree_height;
        shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
        while (height > 0) {
                if (slot == NULL) {
                        /* add a child objnode */
                        slot = tmem_objnode_alloc(obj);
                        if (!slot) {
                                ret = -ENOMEM;
                                goto out;
                        }
                        if (objnode) {
                                objnode->slots[offset] = slot;
                                objnode->slots_in_use++;
                        } else
                                obj->objnode_tree_root = slot;
                }
                /* go down a level */
                offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
                objnode = slot;
                slot = objnode->slots[offset];
                shift -= OBJNODE_TREE_MAP_SHIFT;
                height--;
        }
        BUG_ON(slot != NULL);
        if (objnode) {
                objnode->slots_in_use++;
                objnode->slots[offset] = pampd;
        } else
                obj->objnode_tree_root = pampd;
        obj->pampd_count++;
out:
        return ret;
}

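/*
 * Remove and return the pampd at "index", if any.  The walk down records
 * each objnode visited in "path"; after clearing the leaf slot, the path is
 * walked back up so that objnodes left with no used slots are freed, and
 * the tree height is reduced while the root has only its leftmost slot in
 * use.
 */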
static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
{
        struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
        struct tmem_objnode_tree_path *pathp = path;
        struct tmem_objnode *slot = NULL;
        unsigned int height, shift;
        int offset;

        BUG_ON(obj == NULL);
        ASSERT_SENTINEL(obj, OBJ);
        BUG_ON(obj->pool == NULL);
        ASSERT_SENTINEL(obj->pool, POOL);
        height = obj->objnode_tree_height;
        if (index > tmem_objnode_tree_h2max[height])
                goto out;
        slot = obj->objnode_tree_root;
        if (height == 0 && obj->objnode_tree_root) {
                obj->objnode_tree_root = NULL;
                goto out;
        }
        shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
        pathp->objnode = NULL;
        do {
                if (slot == NULL)
                        goto out;
                pathp++;
                offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
                pathp->offset = offset;
                pathp->objnode = slot;
                slot = slot->slots[offset];
                shift -= OBJNODE_TREE_MAP_SHIFT;
                height--;
        } while (height > 0);
        if (slot == NULL)
                goto out;
        while (pathp->objnode) {
                pathp->objnode->slots[pathp->offset] = NULL;
                pathp->objnode->slots_in_use--;
                if (pathp->objnode->slots_in_use) {
                        if (pathp->objnode == obj->objnode_tree_root) {
                                while (obj->objnode_tree_height > 0 &&
                                  obj->objnode_tree_root->slots_in_use == 1 &&
                                  obj->objnode_tree_root->slots[0]) {
                                        struct tmem_objnode *to_free =
                                                obj->objnode_tree_root;

                                        obj->objnode_tree_root =
                                                        to_free->slots[0];
                                        obj->objnode_tree_height--;
                                        to_free->slots[0] = NULL;
                                        to_free->slots_in_use = 0;
                                        tmem_objnode_free(to_free);
                                }
                        }
                        goto out;
                }
                tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
                pathp--;
        }
        obj->objnode_tree_height = 0;
        obj->objnode_tree_root = NULL;

out:
        if (slot != NULL)
                obj->pampd_count--;
        BUG_ON(obj->pampd_count < 0);
        return slot;
}

/* Recursively walk the objnode_tree destroying pampds and objnodes. */
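/*
 * "ht" counts the levels at and below "objnode": at ht == 1 the slots hold
 * pampds (leaves) and are handed to pamops.free; at greater heights they
 * hold child objnodes, which are destroyed recursively and then freed.
 */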
static void tmem_objnode_node_destroy(struct tmem_obj *obj,
                                        struct tmem_objnode *objnode,
                                        unsigned int ht)
{
        int i;

        if (ht == 0)
                return;
        for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
                if (objnode->slots[i]) {
                        if (ht == 1) {
                                obj->pampd_count--;
                                (*tmem_pamops.free)(objnode->slots[i],
                                                obj->pool, NULL, 0, true);
                                objnode->slots[i] = NULL;
                                continue;
                        }
                        tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1);
                        tmem_objnode_free(objnode->slots[i]);
                        objnode->slots[i] = NULL;
                }
        }
}

static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj,
                                                bool pool_destroy)
{
        if (obj->objnode_tree_root == NULL)
                return;
        if (obj->objnode_tree_height == 0) {
                obj->pampd_count--;
                (*tmem_pamops.free)(obj->objnode_tree_root,
                                        obj->pool, NULL, 0, true);
        } else {
                tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
                                        obj->objnode_tree_height);
                tmem_objnode_free(obj->objnode_tree_root);
                obj->objnode_tree_height = 0;
        }
        obj->objnode_tree_root = NULL;
#ifdef CONFIG_RAMSTER
        if (tmem_pamops.free_obj != NULL)
                (*tmem_pamops.free_obj)(obj->pool, obj, pool_destroy);
#endif
}

/*
 * Tmem is operated on by a set of well-defined actions:
 * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
 * (The tmem ABI allows for subpages and exchanges but these operations
 * are not included in this implementation.)
 *
 * These "tmem core" operations are implemented in the following functions.
 */
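
/*
 * Illustrative call sequence for a single page, as a tmem host might issue
 * it (sketch only; "pampd" here stands for whatever the registered pamops
 * produced for the page and is an assumption, not something defined here):
 *
 *      tmem_put(pool, &oid, index, false, pampd);
 *      tmem_get(pool, &oid, index, data, &size, false, 0);
 *      tmem_flush_page(pool, &oid, index);
 */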

/*
 * "Put" a page, i.e. associate the passed pampd with the passed handle.
 * Tmem_put is complicated by a corner case: What if a page with matching
 * handle already exists in tmem?  To guarantee coherency, one of two
 * actions is necessary: Either the data for the page must be overwritten,
 * or the page must be "flushed" so that the data is not accessible to a
 * subsequent "get".  Since these "duplicate puts" are relatively rare,
 * this implementation always flushes for simplicity.
 */
int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
                bool raw, void *pampd_to_use)
{
        struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
        void *pampd = NULL, *pampd_del = NULL;
        int ret = -ENOMEM;
        struct tmem_hashbucket *hb;

        hb = &pool->hashbucket[tmem_oid_hash(oidp)];
        spin_lock(&hb->lock);
        obj = objfound = tmem_obj_find(hb, oidp);
        if (obj != NULL) {
                pampd = tmem_pampd_lookup_in_obj(objfound, index);
                if (pampd != NULL) {
                        /* if found, this is a dup put; flush the old one */
                        pampd_del = tmem_pampd_delete_from_obj(obj, index);
                        BUG_ON(pampd_del != pampd);
                        (*tmem_pamops.free)(pampd, pool, oidp, index, true);
                        if (obj->pampd_count == 0) {
                                objnew = obj;
                                objfound = NULL;
                        }
                        pampd = NULL;
                }
        } else {
                obj = objnew = (*tmem_hostops.obj_alloc)(pool);
                if (unlikely(obj == NULL)) {
                        ret = -ENOMEM;
                        goto out;
                }
                tmem_obj_init(obj, hb, pool, oidp);
        }
        BUG_ON(obj == NULL);
        BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
        pampd = pampd_to_use;
        BUG_ON(pampd_to_use == NULL);
        ret = tmem_pampd_add_to_obj(obj, index, pampd);
        if (unlikely(ret == -ENOMEM))
                /* may have partially built objnode tree ("stump") */
                goto delete_and_free;
        (*tmem_pamops.create_finish)(pampd, is_ephemeral(pool));
        goto out;

delete_and_free:
        (void)tmem_pampd_delete_from_obj(obj, index);
        if (pampd)
                (*tmem_pamops.free)(pampd, pool, NULL, 0, true);
        if (objnew) {
                tmem_obj_free(objnew, hb);
                (*tmem_hostops.obj_free)(objnew, pool);
        }
out:
        spin_unlock(&hb->lock);
        return ret;
}

#ifdef CONFIG_RAMSTER
/*
 * For ramster only:  The following routines provide a two-step sequence
 * to allow the caller to replace a pampd in the tmem data structures with
 * another pampd.  Here, we lookup the passed handle and, if found, return the
 * associated pampd and object, leaving the hashbucket locked and returning
 * a reference to it.  The caller is expected to immediately call the
 * matching tmem_localify_finish routine, which handles the replacement
 * and unlocks the hashbucket.
 */
void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp,
                                uint32_t index, struct tmem_obj **ret_obj,
                                void **saved_hb)
{
        struct tmem_hashbucket *hb;
        struct tmem_obj *obj = NULL;
        void *pampd = NULL;

        hb = &pool->hashbucket[tmem_oid_hash(oidp)];
        spin_lock(&hb->lock);
        obj = tmem_obj_find(hb, oidp);
        if (likely(obj != NULL))
                pampd = tmem_pampd_lookup_in_obj(obj, index);
        *ret_obj = obj;
        *saved_hb = (void *)hb;
        /* note, hashbucket remains locked */
        return pampd;
}
EXPORT_SYMBOL_GPL(tmem_localify_get_pampd);

void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,
                          void *pampd, void *saved_hb, bool delete)
{
        struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb;

        BUG_ON(!spin_is_locked(&hb->lock));
        if (pampd != NULL) {
                BUG_ON(obj == NULL);
                (void)tmem_pampd_replace_in_obj(obj, index, pampd, 1);
                (*tmem_pamops.create_finish)(pampd, is_ephemeral(obj->pool));
        } else if (delete) {
                BUG_ON(obj == NULL);
                (void)tmem_pampd_delete_from_obj(obj, index);
        }
        spin_unlock(&hb->lock);
}
EXPORT_SYMBOL_GPL(tmem_localify_finish);
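
/*
 * Expected caller sequence for the two-step localify protocol above
 * (sketch): the hashbucket saved via "saved_hb" remains locked between
 * the two calls, so the second call must follow promptly.
 *
 *      pampd = tmem_localify_get_pampd(pool, &oid, index, &obj, &saved_hb);
 *      ... use or replace the pampd ...
 *      tmem_localify_finish(obj, index, new_pampd, saved_hb, delete);
 */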

/*
 * For ramster only.  Helper function to support asynchronous tmem_get.
 */
static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
                                struct tmem_pool *pool, struct tmem_oid *oidp,
                                uint32_t index, bool free, char *data)
{
        void *old_pampd = *ppampd, *new_pampd = NULL;
        bool intransit = false;
        int ret = 0;

        if (!is_ephemeral(pool))
                new_pampd = (*tmem_pamops.repatriate_preload)(
                                old_pampd, pool, oidp, index, &intransit);
        if (intransit)
                ret = -EAGAIN;
        else if (new_pampd != NULL)
                *ppampd = new_pampd;
        /* must release the hb->lock else repatriate can't sleep */
        spin_unlock(&hb->lock);
        if (!intransit)
                ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool,
                                                oidp, index, free, data);
        if (ret == -EAGAIN) {
                /* rare I think, but should cond_resched()??? */
                usleep_range(10, 1000);
        } else if (ret == -ENOTCONN || ret == -EHOSTDOWN) {
                ret = -1;
        } else if (ret != 0 && ret != -ENOENT) {
                ret = -1;
        }
        /* note hb->lock has now been unlocked */
        return ret;
}

/*
 * For ramster only.  If a page in tmem matches the handle, replace the
 * page so that any subsequent "get" gets the new page.  Returns 0 if
 * there was a page to replace, else returns -1.
 */
int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
                        uint32_t index, void *new_pampd)
{
        struct tmem_obj *obj;
        int ret = -1;
        struct tmem_hashbucket *hb;

        hb = &pool->hashbucket[tmem_oid_hash(oidp)];
        spin_lock(&hb->lock);
        obj = tmem_obj_find(hb, oidp);
        if (obj == NULL)
                goto out;
        new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0);
        /* if we bug here, pamops wasn't properly set up for ramster */
        BUG_ON(tmem_pamops.replace_in_obj == NULL);
        ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
out:
        spin_unlock(&hb->lock);
        return ret;
}
EXPORT_SYMBOL_GPL(tmem_replace);
#endif

/*
 * "Get" a page, i.e. if a pampd can be found matching the passed handle,
 * use a pamops callback to recreate the page from the pampd with the
 * matching handle.  By tmem definition, when a "get" is successful on
 * an ephemeral page, the page is "flushed", and when a "get" is successful
 * on a persistent page, the page is retained in tmem.  Note that to preserve
 * coherency, "get" can never be skipped if tmem contains the data.
 * That is, if a get is done with a certain handle and fails, any
 * subsequent "get" must also fail (unless of course there is a
 * "put" done with the same handle).
 */
int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
                char *data, size_t *sizep, bool raw, int get_and_free)
{
        struct tmem_obj *obj;
        void *pampd = NULL;
        bool ephemeral = is_ephemeral(pool);
        int ret = -1;
        struct tmem_hashbucket *hb;
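        /*
         * Free the pampd as part of this get when the caller asks for that
         * explicitly (get_and_free == 1), or when the caller leaves the
         * choice to the pool (get_and_free == 0) and the pool is ephemeral,
         * where a successful get implies a flush.
         */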
        bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
        bool lock_held = false;
        void **ppampd;

        do {
                hb = &pool->hashbucket[tmem_oid_hash(oidp)];
                spin_lock(&hb->lock);
                lock_held = true;
                obj = tmem_obj_find(hb, oidp);
                if (obj == NULL)
                        goto out;
                ppampd = __tmem_pampd_lookup_in_obj(obj, index);
                if (ppampd == NULL)
                        goto out;
#ifdef CONFIG_RAMSTER
                if ((tmem_pamops.is_remote != NULL) &&
                     tmem_pamops.is_remote(*ppampd)) {
                        ret = tmem_repatriate(ppampd, hb, pool, oidp,
                                                index, free, data);
                        /* tmem_repatriate releases hb->lock */
                        lock_held = false;
                        *sizep = PAGE_SIZE;
                        if (ret != -EAGAIN)
                                goto out;
                }
#endif
        } while (ret == -EAGAIN);
        if (free)
                pampd = tmem_pampd_delete_from_obj(obj, index);
        else
                pampd = tmem_pampd_lookup_in_obj(obj, index);
        if (pampd == NULL)
                goto out;
        if (free) {
                if (obj->pampd_count == 0) {
                        tmem_obj_free(obj, hb);
                        (*tmem_hostops.obj_free)(obj, pool);
                        obj = NULL;
                }
        }
        if (free)
                ret = (*tmem_pamops.get_data_and_free)(
                                data, sizep, raw, pampd, pool, oidp, index);
        else
                ret = (*tmem_pamops.get_data)(
                                data, sizep, raw, pampd, pool, oidp, index);
        if (ret < 0)
                goto out;
        ret = 0;
out:
        if (lock_held)
                spin_unlock(&hb->lock);
        return ret;
}

/*
 * If a page in tmem matches the handle, "flush" this page from tmem such
 * that any subsequent "get" does not succeed (unless, of course, there
 * was another "put" with the same handle).
 */
int tmem_flush_page(struct tmem_pool *pool,
                                struct tmem_oid *oidp, uint32_t index)
{
        struct tmem_obj *obj;
        void *pampd;
        int ret = -1;
        struct tmem_hashbucket *hb;

        hb = &pool->hashbucket[tmem_oid_hash(oidp)];
        spin_lock(&hb->lock);
        obj = tmem_obj_find(hb, oidp);
        if (obj == NULL)
                goto out;
        pampd = tmem_pampd_delete_from_obj(obj, index);
        if (pampd == NULL)
                goto out;
        (*tmem_pamops.free)(pampd, pool, oidp, index, true);
        if (obj->pampd_count == 0) {
                tmem_obj_free(obj, hb);
                (*tmem_hostops.obj_free)(obj, pool);
        }
        ret = 0;

out:
        spin_unlock(&hb->lock);
        return ret;
}

/*
 * "Flush" all pages in tmem matching this oid.
 */
int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
{
        struct tmem_obj *obj;
        struct tmem_hashbucket *hb;
        int ret = -1;

        hb = &pool->hashbucket[tmem_oid_hash(oidp)];
        spin_lock(&hb->lock);
        obj = tmem_obj_find(hb, oidp);
        if (obj == NULL)
                goto out;
        tmem_pampd_destroy_all_in_obj(obj, false);
        tmem_obj_free(obj, hb);
        (*tmem_hostops.obj_free)(obj, pool);
        ret = 0;

out:
        spin_unlock(&hb->lock);
        return ret;
}

/*
 * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
 * all subsequent access to this tmem_pool.
 */
int tmem_destroy_pool(struct tmem_pool *pool)
{
        int ret = -1;

        if (pool == NULL)
                goto out;
        tmem_pool_flush(pool, 1);
        ret = 0;
out:
        return ret;
}

static LIST_HEAD(tmem_global_pool_list);

/*
 * Initialize a new tmem_pool with the provided flags; the pool id itself
 * is assigned and tracked by the tmem host implementation.
 */
void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
{
        int persistent = flags & TMEM_POOL_PERSIST;
        int shared = flags & TMEM_POOL_SHARED;
        struct tmem_hashbucket *hb = &pool->hashbucket[0];
        int i;

        for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
                hb->obj_rb_root = RB_ROOT;
                spin_lock_init(&hb->lock);
        }
        INIT_LIST_HEAD(&pool->pool_list);
        atomic_set(&pool->obj_count, 0);
        SET_SENTINEL(pool, POOL);
        list_add_tail(&pool->pool_list, &tmem_global_pool_list);
        pool->persistent = persistent;
        pool->shared = shared;
}