Pileus Git - ~andy/linux/blob - drivers/staging/zcache/zbud.c

   1 /*
   2  * zbud.c - Compression buddies allocator
   3  *
   4  * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
   5  *
   6  * Compression buddies ("zbud") provides for efficiently packing two
   7  * (or, possibly in the future, more) compressed pages ("zpages") into
   8  * a single "raw" pageframe and for tracking both zpages and pageframes
   9  * so that whole pageframes can be easily reclaimed in LRU-like order.
  10  * It is designed to be used in conjunction with transcendent memory
  11  * ("tmem"); for example separate LRU lists are maintained for persistent
  12  * vs. ephemeral pages.
  13  *
  14  * A zbudpage is an overlay for a struct page and thus each zbudpage
  15  * refers to a physical pageframe of RAM.  When the caller passes a
  16  * struct page from the kernel's page allocator, zbud "transforms" it
  17  * to a zbudpage which sets/uses a different set of fields than the
  18  * struct-page and thus must "untransform" it back by reinitializing
  19  * certain fields before the struct-page can be freed.  The fields
  20  * of a zbudpage include a page lock for controlling access to the
  21  * corresponding pageframe, and there is a size field for each zpage.
  22  * Each zbudpage also lives on two linked lists: a "budlist" which is
  23  * used to support efficient buddying of zpages; and an "lru" which
  24  * is used for reclaiming pageframes in approximately least-recently-used
  25  * order.
  26  *
  27  * A zbudpageframe is a pageframe divided up into aligned 64-byte "chunks"
  28  * which contain the compressed data for zero, one, or two zbuds.  Contained
  29  * with the compressed data is a tmem_handle which is a key to allow
  30  * the same data to be found via the tmem interface so the zpage can
  31  * be invalidated (for ephemeral pages) or repatriated to the swap cache
  32  * (for persistent pages).  The contents of a zbudpageframe must never
  33  * be accessed without holding the page lock for the corresponding
  34  * zbudpage and, to accommodate highmem machines, the contents may
  35  * only be examined or changed when kmapped.  Thus, when in use, a
  36  * kmapped zbudpageframe is referred to in the zbud code as "void *zbpg".
  37  *
  38  * Note that the term "zbud" refers to the combination of a zpage and
  39  * a tmem_handle that is stored as one of possibly two "buddied" zpages;
  40  * it also generically refers to this allocator... sorry for any confusion.
  41  *
  42  * A zbudref is a pointer to a struct zbudpage (which can be cast to a
  43  * struct page), with the LSB either cleared or set to indicate, respectively,
  44  * the first or second zpage in the zbudpageframe. Since a zbudref can be
  45  * cast to a pointer, it is used as the tmem "pampd" pointer and uniquely
  46  * references a stored tmem page and so is the only zbud data structure
  47  * externally visible to zbud.c/zbud.h.
  48  *
  49  * Since we wish to reclaim entire pageframes but zpages may be randomly
  50  * added and deleted to any given pageframe, we approximate LRU by
  51  * promoting a pageframe to MRU when a zpage is added to it, but
  52  * leaving it at the current place in the list when a zpage is deleted
  53  * from it.  As a side effect, zpages that are difficult to buddy (e.g.
  54  * very large pages) will be reclaimed faster than average, which seems
  55  * reasonable.
  56  *
  57  * In the current implementation, no more than two zpages may be stored in
  58  * any pageframe and no zpage ever crosses a pageframe boundary.  While
  59  * other zpage allocation mechanisms may allow greater density, this two
  60  * zpage-per-pageframe limit both ensures simple reclaim of pageframes
  61  * (including garbage collection of references to the contents of those
  62  * pageframes from tmem data structures) AND avoids the need for compaction.
  63  * With additional complexity, zbud could be modified to support storing
  64  * up to three zpages per pageframe or, to handle larger average zpages,
  65  * up to three zpages per pair of pageframes, but it is not clear if the
  66  * additional complexity would be worth it.  So consider it an exercise
  67  * for future developers.
  68  *
  69  * Note also that zbud does no page allocation or freeing.  This is so
  70  * that the caller has complete control over and, for accounting, visibility
  71  * into if/when pages are allocated and freed.
  72  *
  73  * Finally, note that zbud limits the size of zpages it can store; the
  74  * caller must check the zpage size with zbud_max_buddy_size before
  75  * storing it, else BUGs will result.  User beware.
  76  */
  77
  78 #include <linux/module.h>
  79 #include <linux/highmem.h>
  80 #include <linux/list.h>
  81 #include <linux/spinlock.h>
  82 #include <linux/pagemap.h>
  83 #include <linux/atomic.h>
  84 #include <linux/bug.h>
  85 #include "tmem.h"
  86 #include "zcache.h"
  87 #include "zbud.h"
  88
  89 /*
  90  * We need to ensure that a struct zbudpage is never larger than a
  91  * struct page.  This is checked with a BUG_ON in zbud_init.
  92  *
  93  * The unevictable field indicates that a zbud is being added to the
  94  * zbudpage.  Since this is a two-phase process (due to tmem locking),
  95  * this field locks the zbudpage against eviction when a zbud match
  96  * or creation is in process.  Since this addition process may occur
  97  * in parallel for two zbuds in one zbudpage, the field is a counter
  98  * that must not exceed two.
  99  */
 100 struct zbudpage {
 101         union {
 102                 struct page page;
 103                 struct {
 104                         unsigned long space_for_flags;
 105                         struct {
 106                                 unsigned zbud0_size:PAGE_SHIFT;
 107                                 unsigned zbud1_size:PAGE_SHIFT;
 108                                 unsigned unevictable:2;
 109                         };
 110                         struct list_head budlist;
 111                         struct list_head lru;
 112                 };
 113         };
 114 };
 115 #if (PAGE_SHIFT * 2) + 2 > BITS_PER_LONG
 116 #error "zbud won't work for this arch, PAGE_SIZE is too large"
 117 #endif
 118
/*
 * A zbudref is a zbudpage pointer whose least significant bit selects
 * the first (bit clear) or second (bit set) zbud in the pageframe.  The
 * union lets the same word be viewed as a pointer or as an integer.
 */
struct zbudref {
	union {
		struct zbudpage *zbudpage;	/* pointer view */
		unsigned long zbudref;		/* integer view, LSB = budnum */
	};
};
 125
/* zbud data is placed in the pageframe in units of aligned chunks */
#define CHUNK_SHIFT     6			/* log2 of the chunk size */
#define CHUNK_SIZE      (1 << CHUNK_SHIFT)	/* bytes per chunk (64) */
#define CHUNK_MASK      (~(CHUNK_SIZE-1))	/* rounds down to a chunk boundary */
#define NCHUNKS         (PAGE_SIZE >> CHUNK_SHIFT) /* chunks per pageframe */
#define MAX_CHUNK       (NCHUNKS-1)		/* largest zbud, in chunks */
 131
 132 /*
 133  * The following functions deal with the difference between struct
 134  * page and struct zbudpage.  Note the hack of using the pageflags
 135  * from struct page; this is to avoid duplicating all the complex
 136  * pageflag macros.
 137  */
 138 static inline void zbudpage_spin_lock(struct zbudpage *zbudpage)
 139 {
 140         struct page *page = (struct page *)zbudpage;
 141
 142         while (unlikely(test_and_set_bit_lock(PG_locked, &page->flags))) {
 143                 do {
 144                         cpu_relax();
 145                 } while (test_bit(PG_locked, &page->flags));
 146         }
 147 }
 148
 149 static inline void zbudpage_spin_unlock(struct zbudpage *zbudpage)
 150 {
 151         struct page *page = (struct page *)zbudpage;
 152
 153         clear_bit(PG_locked, &page->flags);
 154 }
 155
 156 static inline int zbudpage_spin_trylock(struct zbudpage *zbudpage)
 157 {
 158         return trylock_page((struct page *)zbudpage);
 159 }
 160
 161 static inline int zbudpage_is_locked(struct zbudpage *zbudpage)
 162 {
 163         return PageLocked((struct page *)zbudpage);
 164 }
 165
 166 static inline void *kmap_zbudpage_atomic(struct zbudpage *zbudpage)
 167 {
 168         return kmap_atomic((struct page *)zbudpage);
 169 }
 170
 171 /*
 172  * A dying zbudpage is an ephemeral page in the process of being evicted.
 173  * Any data contained in the zbudpage is invalid and we are just waiting for
 174  * the tmem pampds to be invalidated before freeing the page
 175  */
 176 static inline int zbudpage_is_dying(struct zbudpage *zbudpage)
 177 {
 178         struct page *page = (struct page *)zbudpage;
 179
 180         return test_bit(PG_reclaim, &page->flags);
 181 }
 182
 183 static inline void zbudpage_set_dying(struct zbudpage *zbudpage)
 184 {
 185         struct page *page = (struct page *)zbudpage;
 186
 187         set_bit(PG_reclaim, &page->flags);
 188 }
 189
 190 static inline void zbudpage_clear_dying(struct zbudpage *zbudpage)
 191 {
 192         struct page *page = (struct page *)zbudpage;
 193
 194         clear_bit(PG_reclaim, &page->flags);
 195 }
 196
 197 /*
 198  * A zombie zbudpage is a persistent page in the process of being evicted.
 199  * The data contained in the zbudpage is valid and we are just waiting for
 200  * the tmem pampds to be invalidated before freeing the page
 201  */
 202 static inline int zbudpage_is_zombie(struct zbudpage *zbudpage)
 203 {
 204         struct page *page = (struct page *)zbudpage;
 205
 206         return test_bit(PG_dirty, &page->flags);
 207 }
 208
 209 static inline void zbudpage_set_zombie(struct zbudpage *zbudpage)
 210 {
 211         struct page *page = (struct page *)zbudpage;
 212
 213         set_bit(PG_dirty, &page->flags);
 214 }
 215
 216 static inline void zbudpage_clear_zombie(struct zbudpage *zbudpage)
 217 {
 218         struct page *page = (struct page *)zbudpage;
 219
 220         clear_bit(PG_dirty, &page->flags);
 221 }
 222
/*
 * Undo kmap_zbudpage_atomic(); zbpg is the mapped address that call
 * returned.
 */
static inline void kunmap_zbudpage_atomic(void *zbpg)
{
	kunmap_atomic(zbpg);
}
 227
 228 /*
 229  * zbud "translation" and helper functions
 230  */
 231
 232 static inline struct zbudpage *zbudref_to_zbudpage(struct zbudref *zref)
 233 {
 234         unsigned long zbud = (unsigned long)zref;
 235         zbud &= ~1UL;
 236         return (struct zbudpage *)zbud;
 237 }
 238
 239 static inline struct zbudref *zbudpage_to_zbudref(struct zbudpage *zbudpage,
 240                                                         unsigned budnum)
 241 {
 242         unsigned long zbud = (unsigned long)zbudpage;
 243         BUG_ON(budnum > 1);
 244         zbud |= budnum;
 245         return (struct zbudref *)zbud;
 246 }
 247
 248 static inline int zbudref_budnum(struct zbudref *zbudref)
 249 {
 250         unsigned long zbud = (unsigned long)zbudref;
 251         return zbud & 1UL;
 252 }
 253
 254 static inline unsigned zbud_max_size(void)
 255 {
 256         return MAX_CHUNK << CHUNK_SHIFT;
 257 }
 258
 259 static inline unsigned zbud_size_to_chunks(unsigned size)
 260 {
 261         BUG_ON(size == 0 || size > zbud_max_size());
 262         return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
 263 }
 264
 265 /* can only be used between kmap_zbudpage_atomic/kunmap_zbudpage_atomic! */
 266 static inline char *zbud_data(void *zbpg,
 267                         unsigned budnum, unsigned size)
 268 {
 269         char *p;
 270
 271         BUG_ON(size == 0 || size > zbud_max_size());
 272         p = (char *)zbpg;
 273         if (budnum == 1)
 274                 p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
 275         return p;
 276 }
 277
/*
 * These are all informative and exposed through debugfs... except for
 * the arrays... anyone know how to do that?  To avoid confusion for
 * debugfs viewers, some of these should also be atomic_long_t, but
 * I don't know how to expose atomics via debugfs either...
 *
 * Each counter exists in an "eph" (ephemeral) and a "pers" (persistent)
 * flavor.
 */
/* pageframes currently transformed into zbudpages */
static ssize_t zbud_eph_pageframes;
static ssize_t zbud_pers_pageframes;
/* zbuds currently stored */
static ssize_t zbud_eph_zpages;
static ssize_t zbud_pers_zpages;
/* bytes currently stored (each zbud's size includes its tmem_handle) */
static u64 zbud_eph_zbytes;
static u64 zbud_pers_zbytes;
/* not updated in the code visible here; presumably eviction totals */
static ssize_t zbud_eph_evicted_pageframes;
static ssize_t zbud_pers_evicted_pageframes;
/* cumulative (never decremented) totals of zbuds and bytes stored */
static ssize_t zbud_eph_cumul_zpages;
static ssize_t zbud_pers_cumul_zpages;
static u64 zbud_eph_cumul_zbytes;
static u64 zbud_pers_cumul_zbytes;
/* cumulative histogram of stored zbuds, indexed by size in chunks */
static ssize_t zbud_eph_cumul_chunk_counts[NCHUNKS];
static ssize_t zbud_pers_cumul_chunk_counts[NCHUNKS];
/* zbudpages currently holding two zbuds / one zbud */
static ssize_t zbud_eph_buddied_count;
static ssize_t zbud_pers_buddied_count;
static ssize_t zbud_eph_unbuddied_count;
static ssize_t zbud_pers_unbuddied_count;
/* plain copies of the zombie atomics below, kept for debugfs */
static ssize_t zbud_eph_zombie_count;
static ssize_t zbud_pers_zombie_count;
static atomic_t zbud_eph_zombie_atomic;
static atomic_t zbud_pers_zombie_atomic;
 306
 307 #ifdef CONFIG_DEBUG_FS
 308 #include <linux/debugfs.h>
 309 #define zdfs    debugfs_create_size_t
 310 #define zdfs64  debugfs_create_u64
 311 static int zbud_debugfs_init(void)
 312 {
 313         struct dentry *root = debugfs_create_dir("zbud", NULL);
 314         if (root == NULL)
 315                 return -ENXIO;
 316
 317         /*
 318          * would be nice to dump the sizes of the unbuddied
 319          * arrays, like was done with sysfs, but it doesn't
 320          * look like debugfs is flexible enough to do that
 321          */
 322         zdfs64("eph_zbytes", S_IRUGO, root, &zbud_eph_zbytes);
 323         zdfs64("eph_cumul_zbytes", S_IRUGO, root, &zbud_eph_cumul_zbytes);
 324         zdfs64("pers_zbytes", S_IRUGO, root, &zbud_pers_zbytes);
 325         zdfs64("pers_cumul_zbytes", S_IRUGO, root, &zbud_pers_cumul_zbytes);
 326         zdfs("eph_cumul_zpages", S_IRUGO, root, &zbud_eph_cumul_zpages);
 327         zdfs("eph_evicted_pageframes", S_IRUGO, root,
 328                                 &zbud_eph_evicted_pageframes);
 329         zdfs("eph_zpages", S_IRUGO, root, &zbud_eph_zpages);
 330         zdfs("eph_pageframes", S_IRUGO, root, &zbud_eph_pageframes);
 331         zdfs("eph_buddied_count", S_IRUGO, root, &zbud_eph_buddied_count);
 332         zdfs("eph_unbuddied_count", S_IRUGO, root, &zbud_eph_unbuddied_count);
 333         zdfs("pers_cumul_zpages", S_IRUGO, root, &zbud_pers_cumul_zpages);
 334         zdfs("pers_evicted_pageframes", S_IRUGO, root,
 335                                 &zbud_pers_evicted_pageframes);
 336         zdfs("pers_zpages", S_IRUGO, root, &zbud_pers_zpages);
 337         zdfs("pers_pageframes", S_IRUGO, root, &zbud_pers_pageframes);
 338         zdfs("pers_buddied_count", S_IRUGO, root, &zbud_pers_buddied_count);
 339         zdfs("pers_unbuddied_count", S_IRUGO, root, &zbud_pers_unbuddied_count);
 340         zdfs("pers_zombie_count", S_IRUGO, root, &zbud_pers_zombie_count);
 341         return 0;
 342 }
 343 #undef  zdfs
 344 #undef  zdfs64
 345 #else
 346 static inline int zbud_debugfs_init(void)
 347 {
 348         return 0;
 349 }
 350 #endif
 351
/*
 * protects the buddied list and all unbuddied lists
 *
 * There is one lock for ephemeral and one for persistent pages.  The
 * functions in this file that need both take the lists lock first and
 * the per-zbudpage lock second.
 */
static DEFINE_SPINLOCK(zbud_eph_lists_lock);
static DEFINE_SPINLOCK(zbud_pers_lists_lock);
 355
 356 struct zbud_unbuddied {
 357         struct list_head list;
 358         unsigned count;
 359 };
 360
/* list N contains pages with N chunks USED and NCHUNKS-N unused */
/* element 0 is never used but optimizing that isn't worth it */
static struct zbud_unbuddied zbud_eph_unbuddied[NCHUNKS];
static struct zbud_unbuddied zbud_pers_unbuddied[NCHUNKS];
/* reclaim order; pages are added or promoted at the tail (MRU end) */
static LIST_HEAD(zbud_eph_lru_list);
static LIST_HEAD(zbud_pers_lru_list);
/* zbudpages that hold two zbuds */
static LIST_HEAD(zbud_eph_buddied_list);
static LIST_HEAD(zbud_pers_buddied_list);
/* not used in the code visible here; presumably holds zombie zbudpages */
static LIST_HEAD(zbud_eph_zombie_list);
static LIST_HEAD(zbud_pers_zombie_list);
 371
 372 /*
 373  * Given a struct page, transform it to a zbudpage so that it can be
 374  * used by zbud and initialize fields as necessary.
 375  */
 376 static inline struct zbudpage *zbud_init_zbudpage(struct page *page, bool eph)
 377 {
 378         struct zbudpage *zbudpage = (struct zbudpage *)page;
 379
 380         BUG_ON(page == NULL);
 381         INIT_LIST_HEAD(&zbudpage->budlist);
 382         INIT_LIST_HEAD(&zbudpage->lru);
 383         zbudpage->zbud0_size = 0;
 384         zbudpage->zbud1_size = 0;
 385         zbudpage->unevictable = 0;
 386         if (eph)
 387                 zbud_eph_pageframes++;
 388         else
 389                 zbud_pers_pageframes++;
 390         return zbudpage;
 391 }
 392
 393 /* "Transform" a zbudpage back to a struct page suitable to free. */
 394 static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage,
 395                                                                 bool eph)
 396 {
 397         struct page *page = (struct page *)zbudpage;
 398
 399         BUG_ON(!list_empty(&zbudpage->budlist));
 400         BUG_ON(!list_empty(&zbudpage->lru));
 401         BUG_ON(zbudpage->zbud0_size != 0);
 402         BUG_ON(zbudpage->zbud1_size != 0);
 403         BUG_ON(!PageLocked(page));
 404         BUG_ON(zbudpage->unevictable != 0);
 405         BUG_ON(zbudpage_is_dying(zbudpage));
 406         BUG_ON(zbudpage_is_zombie(zbudpage));
 407         if (eph)
 408                 zbud_eph_pageframes--;
 409         else
 410                 zbud_pers_pageframes--;
 411         zbudpage_spin_unlock(zbudpage);
 412         page_mapcount_reset(page);
 413         init_page_count(page);
 414         page->index = 0;
 415         return page;
 416 }
 417
 418 /* Mark a zbud as unused and do accounting */
 419 static inline void zbud_unuse_zbud(struct zbudpage *zbudpage,
 420                                         int budnum, bool eph)
 421 {
 422         unsigned size;
 423
 424         BUG_ON(!zbudpage_is_locked(zbudpage));
 425         if (budnum == 0) {
 426                 size = zbudpage->zbud0_size;
 427                 zbudpage->zbud0_size = 0;
 428         } else {
 429                 size = zbudpage->zbud1_size;
 430                 zbudpage->zbud1_size = 0;
 431         }
 432         if (eph) {
 433                 zbud_eph_zbytes -= size;
 434                 zbud_eph_zpages--;
 435         } else {
 436                 zbud_pers_zbytes -= size;
 437                 zbud_pers_zpages--;
 438         }
 439 }
 440
 441 /*
 442  * Given a zbudpage/budnum/size, a tmem handle, and a kmapped pointer
 443  * to some data, set up the zbud appropriately including data copying
 444  * and accounting.  Note that if cdata is NULL, the data copying is
 445  * skipped.  (This is useful for lazy writes such as for RAMster.)
 446  */
 447 static void zbud_init_zbud(struct zbudpage *zbudpage, struct tmem_handle *th,
 448                                 bool eph, void *cdata,
 449                                 unsigned budnum, unsigned size)
 450 {
 451         char *to;
 452         void *zbpg;
 453         struct tmem_handle *to_th;
 454         unsigned nchunks = zbud_size_to_chunks(size);
 455
 456         BUG_ON(!zbudpage_is_locked(zbudpage));
 457         zbpg = kmap_zbudpage_atomic(zbudpage);
 458         to = zbud_data(zbpg, budnum, size);
 459         to_th = (struct tmem_handle *)to;
 460         to_th->index = th->index;
 461         to_th->oid = th->oid;
 462         to_th->pool_id = th->pool_id;
 463         to_th->client_id = th->client_id;
 464         to += sizeof(struct tmem_handle);
 465         if (cdata != NULL)
 466                 memcpy(to, cdata, size - sizeof(struct tmem_handle));
 467         kunmap_zbudpage_atomic(zbpg);
 468         if (budnum == 0)
 469                 zbudpage->zbud0_size = size;
 470         else
 471                 zbudpage->zbud1_size = size;
 472         if (eph) {
 473                 zbud_eph_cumul_chunk_counts[nchunks]++;
 474                 zbud_eph_zpages++;
 475                 zbud_eph_cumul_zpages++;
 476                 zbud_eph_zbytes += size;
 477                 zbud_eph_cumul_zbytes += size;
 478         } else {
 479                 zbud_pers_cumul_chunk_counts[nchunks]++;
 480                 zbud_pers_zpages++;
 481                 zbud_pers_cumul_zpages++;
 482                 zbud_pers_zbytes += size;
 483                 zbud_pers_cumul_zbytes += size;
 484         }
 485 }
 486
 487 /*
 488  * Given a locked dying zbudpage, read out the tmem handles from the data,
 489  * unlock the page, then use the handles to tell tmem to flush out its
 490  * references
 491  */
 492 static void zbud_evict_tmem(struct zbudpage *zbudpage)
 493 {
 494         int i, j;
 495         uint32_t pool_id[2], client_id[2];
 496         uint32_t index[2];
 497         struct tmem_oid oid[2];
 498         struct tmem_pool *pool;
 499         void *zbpg;
 500         struct tmem_handle *th;
 501         unsigned size;
 502
 503         /* read out the tmem handles from the data and set aside */
 504         zbpg = kmap_zbudpage_atomic(zbudpage);
 505         for (i = 0, j = 0; i < 2; i++) {
 506                 size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
 507                 if (size) {
 508                         th = (struct tmem_handle *)zbud_data(zbpg, i, size);
 509                         client_id[j] = th->client_id;
 510                         pool_id[j] = th->pool_id;
 511                         oid[j] = th->oid;
 512                         index[j] = th->index;
 513                         j++;
 514                         zbud_unuse_zbud(zbudpage, i, true);
 515                 }
 516         }
 517         kunmap_zbudpage_atomic(zbpg);
 518         zbudpage_spin_unlock(zbudpage);
 519         /* zbudpage is now an unlocked dying... tell tmem to flush pointers */
 520         for (i = 0; i < j; i++) {
 521                 pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
 522                 if (pool != NULL) {
 523                         tmem_flush_page(pool, &oid[i], index[i]);
 524                         zcache_put_pool(pool);
 525                 }
 526         }
 527 }
 528
 529 /*
 530  * Externally callable zbud handling routines.
 531  */
 532
 533 /*
 534  * Return the maximum size compressed page that can be stored (secretly
 535  * setting aside space for the tmem handle.
 536  */
 537 unsigned int zbud_max_buddy_size(void)
 538 {
 539         return zbud_max_size() - sizeof(struct tmem_handle);
 540 }
 541
 542 /*
 543  * Given a zbud reference, free the corresponding zbud from all lists,
 544  * mark it as unused, do accounting, and if the freeing of the zbud
 545  * frees up an entire pageframe, return it to the caller (else NULL).
 546  */
 547 struct page *zbud_free_and_delist(struct zbudref *zref, bool eph,
 548                                   unsigned int *zsize, unsigned int *zpages)
 549 {
 550         unsigned long budnum = zbudref_budnum(zref);
 551         struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
 552         struct page *page = NULL;
 553         unsigned chunks, bud_size, other_bud_size;
 554         spinlock_t *lists_lock =
 555                 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
 556         struct zbud_unbuddied *unbud =
 557                 eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;
 558
 559
 560         spin_lock(lists_lock);
 561         zbudpage_spin_lock(zbudpage);
 562         if (zbudpage_is_dying(zbudpage)) {
 563                 /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
 564                 zbudpage_spin_unlock(zbudpage);
 565                 spin_unlock(lists_lock);
 566                 *zpages = 0;
 567                 *zsize = 0;
 568                 goto out;
 569         }
 570         if (budnum == 0) {
 571                 bud_size = zbudpage->zbud0_size;
 572                 other_bud_size = zbudpage->zbud1_size;
 573         } else {
 574                 bud_size = zbudpage->zbud1_size;
 575                 other_bud_size = zbudpage->zbud0_size;
 576         }
 577         *zsize = bud_size - sizeof(struct tmem_handle);
 578         *zpages = 1;
 579         zbud_unuse_zbud(zbudpage, budnum, eph);
 580         if (other_bud_size == 0) { /* was unbuddied: unlist and free */
 581                 chunks = zbud_size_to_chunks(bud_size) ;
 582                 if (zbudpage_is_zombie(zbudpage)) {
 583                         if (eph)
 584                                 zbud_pers_zombie_count =
 585                                   atomic_dec_return(&zbud_eph_zombie_atomic);
 586                         else
 587                                 zbud_pers_zombie_count =
 588                                   atomic_dec_return(&zbud_pers_zombie_atomic);
 589                         zbudpage_clear_zombie(zbudpage);
 590                 } else {
 591                         BUG_ON(list_empty(&unbud[chunks].list));
 592                         list_del_init(&zbudpage->budlist);
 593                         unbud[chunks].count--;
 594                 }
 595                 list_del_init(&zbudpage->lru);
 596                 spin_unlock(lists_lock);
 597                 if (eph)
 598                         zbud_eph_unbuddied_count--;
 599                 else
 600                         zbud_pers_unbuddied_count--;
 601                 page = zbud_unuse_zbudpage(zbudpage, eph);
 602         } else { /* was buddied: move remaining buddy to unbuddied list */
 603                 chunks = zbud_size_to_chunks(other_bud_size) ;
 604                 if (!zbudpage_is_zombie(zbudpage)) {
 605                         list_del_init(&zbudpage->budlist);
 606                         list_add_tail(&zbudpage->budlist, &unbud[chunks].list);
 607                         unbud[chunks].count++;
 608                 }
 609                 if (eph) {
 610                         zbud_eph_buddied_count--;
 611                         zbud_eph_unbuddied_count++;
 612                 } else {
 613                         zbud_pers_unbuddied_count++;
 614                         zbud_pers_buddied_count--;
 615                 }
 616                 /* don't mess with lru, no need to move it */
 617                 zbudpage_spin_unlock(zbudpage);
 618                 spin_unlock(lists_lock);
 619         }
 620 out:
 621         return page;
 622 }
 623
 624 /*
 625  * Given a tmem handle, and a kmapped pointer to compressed data of
 626  * the given size, try to find an unbuddied zbudpage in which to
 627  * create a zbud. If found, put it there, mark the zbudpage unevictable,
 628  * and return a zbudref to it.  Else return NULL.
 629  */
 630 struct zbudref *zbud_match_prep(struct tmem_handle *th, bool eph,
 631                                 void *cdata, unsigned size)
 632 {
 633         struct zbudpage *zbudpage = NULL, *zbudpage2;
 634         unsigned long budnum = 0UL;
 635         unsigned nchunks;
 636         int i, found_good_buddy = 0;
 637         spinlock_t *lists_lock =
 638                 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
 639         struct zbud_unbuddied *unbud =
 640                 eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;
 641
 642         size += sizeof(struct tmem_handle);
 643         nchunks = zbud_size_to_chunks(size);
 644         for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
 645                 spin_lock(lists_lock);
 646                 if (!list_empty(&unbud[i].list)) {
 647                         list_for_each_entry_safe(zbudpage, zbudpage2,
 648                                     &unbud[i].list, budlist) {
 649                                 if (zbudpage_spin_trylock(zbudpage)) {
 650                                         found_good_buddy = i;
 651                                         goto found_unbuddied;
 652                                 }
 653                         }
 654                 }
 655                 spin_unlock(lists_lock);
 656         }
 657         zbudpage = NULL;
 658         goto out;
 659
 660 found_unbuddied:
 661         BUG_ON(!zbudpage_is_locked(zbudpage));
 662         BUG_ON(!((zbudpage->zbud0_size == 0) ^ (zbudpage->zbud1_size == 0)));
 663         if (zbudpage->zbud0_size == 0)
 664                 budnum = 0UL;
 665         else if (zbudpage->zbud1_size == 0)
 666                 budnum = 1UL;
 667         list_del_init(&zbudpage->budlist);
 668         if (eph) {
 669                 list_add_tail(&zbudpage->budlist, &zbud_eph_buddied_list);
 670                 unbud[found_good_buddy].count--;
 671                 zbud_eph_unbuddied_count--;
 672                 zbud_eph_buddied_count++;
 673                 /* "promote" raw zbudpage to most-recently-used */
 674                 list_del_init(&zbudpage->lru);
 675                 list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
 676         } else {
 677                 list_add_tail(&zbudpage->budlist, &zbud_pers_buddied_list);
 678                 unbud[found_good_buddy].count--;
 679                 zbud_pers_unbuddied_count--;
 680                 zbud_pers_buddied_count++;
 681                 /* "promote" raw zbudpage to most-recently-used */
 682                 list_del_init(&zbudpage->lru);
 683                 list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
 684         }
 685         zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
 686         zbudpage->unevictable++;
 687         BUG_ON(zbudpage->unevictable == 3);
 688         zbudpage_spin_unlock(zbudpage);
 689         spin_unlock(lists_lock);
 690 out:
 691         return zbudpage_to_zbudref(zbudpage, budnum);
 692
 693 }
 694
 695 /*
 696  * Given a tmem handle, and a kmapped pointer to compressed data of
 697  * the given size, and a newly allocated struct page, create an unevictable
 698  * zbud in that new page and return a zbudref to it.
 699  */
 700 struct zbudref *zbud_create_prep(struct tmem_handle *th, bool eph,
 701                                         void *cdata, unsigned size,
 702                                         struct page *newpage)
 703 {
 704         struct zbudpage *zbudpage;
 705         unsigned long budnum = 0;
 706         unsigned nchunks;
 707         spinlock_t *lists_lock =
 708                 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
 709         struct zbud_unbuddied *unbud =
 710                 eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;
 711
 712 #if 0
 713         /* this may be worth it later to support decompress-in-place? */
 714         static unsigned long counter;
 715         budnum = counter++ & 1; /* alternate using zbud0 and zbud1 */
 716 #endif
 717
 718         if (size  > zbud_max_buddy_size())
 719                 return NULL;
 720         if (newpage == NULL)
 721                 return NULL;
 722
 723         size += sizeof(struct tmem_handle);
 724         nchunks = zbud_size_to_chunks(size) ;
 725         spin_lock(lists_lock);
 726         zbudpage = zbud_init_zbudpage(newpage, eph);
 727         zbudpage_spin_lock(zbudpage);
 728         list_add_tail(&zbudpage->budlist, &unbud[nchunks].list);
 729         if (eph) {
 730                 list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
 731                 zbud_eph_unbuddied_count++;
 732         } else {
 733                 list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
 734                 zbud_pers_unbuddied_count++;
 735         }
 736         unbud[nchunks].count++;
 737         zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
 738         zbudpage->unevictable++;
 739         BUG_ON(zbudpage->unevictable == 3);
 740         zbudpage_spin_unlock(zbudpage);
 741         spin_unlock(lists_lock);
 742         return zbudpage_to_zbudref(zbudpage, budnum);
 743 }
 744
 745 /*
 746  * Finish creation of a zbud by, assuming another zbud isn't being created
 747  * in parallel, marking it evictable.
 748  */
 749 void zbud_create_finish(struct zbudref *zref, bool eph)
 750 {
 751         struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
 752         spinlock_t *lists_lock =
 753                 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
 754
 755         spin_lock(lists_lock);
 756         zbudpage_spin_lock(zbudpage);
 757         BUG_ON(zbudpage_is_dying(zbudpage));
 758         zbudpage->unevictable--;
 759         BUG_ON((int)zbudpage->unevictable < 0);
 760         zbudpage_spin_unlock(zbudpage);
 761         spin_unlock(lists_lock);
 762 }
 763
 764 /*
 765  * Given a zbudref and a struct page, decompress the data from
 766  * the zbud into the physical page represented by the struct page
 767  * by upcalling to zcache_decompress
 768  */
 769 int zbud_decompress(struct page *data_page, struct zbudref *zref, bool eph,
 770                         void (*decompress)(char *, unsigned int, char *))
 771 {
 772         struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
 773         unsigned long budnum = zbudref_budnum(zref);
 774         void *zbpg;
 775         char *to_va, *from_va;
 776         unsigned size;
 777         int ret = -1;
 778         spinlock_t *lists_lock =
 779                 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
 780
 781         spin_lock(lists_lock);
 782         zbudpage_spin_lock(zbudpage);
 783         if (zbudpage_is_dying(zbudpage)) {
 784                 /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
 785                 goto out;
 786         }
 787         zbpg = kmap_zbudpage_atomic(zbudpage);
 788         to_va = kmap_atomic(data_page);
 789         if (budnum == 0)
 790                 size = zbudpage->zbud0_size;
 791         else
 792                 size = zbudpage->zbud1_size;
 793         BUG_ON(size == 0 || size > zbud_max_size());
 794         from_va = zbud_data(zbpg, budnum, size);
 795         from_va += sizeof(struct tmem_handle);
 796         size -= sizeof(struct tmem_handle);
 797         decompress(from_va, size, to_va);
 798         kunmap_atomic(to_va);
 799         kunmap_zbudpage_atomic(zbpg);
 800         ret = 0;
 801 out:
 802         zbudpage_spin_unlock(zbudpage);
 803         spin_unlock(lists_lock);
 804         return ret;
 805 }
 806
 807 /*
 808  * Given a zbudref and a kernel pointer, copy the data from
 809  * the zbud to the kernel pointer.
 810  */
 811 int zbud_copy_from_zbud(char *to_va, struct zbudref *zref,
 812                                 size_t *sizep, bool eph)
 813 {
 814         struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
 815         unsigned long budnum = zbudref_budnum(zref);
 816         void *zbpg;
 817         char *from_va;
 818         unsigned size;
 819         int ret = -1;
 820         spinlock_t *lists_lock =
 821                 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
 822
 823         spin_lock(lists_lock);
 824         zbudpage_spin_lock(zbudpage);
 825         if (zbudpage_is_dying(zbudpage)) {
 826                 /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
 827                 goto out;
 828         }
 829         zbpg = kmap_zbudpage_atomic(zbudpage);
 830         if (budnum == 0)
 831                 size = zbudpage->zbud0_size;
 832         else
 833                 size = zbudpage->zbud1_size;
 834         BUG_ON(size == 0 || size > zbud_max_size());
 835         from_va = zbud_data(zbpg, budnum, size);
 836         from_va += sizeof(struct tmem_handle);
 837         size -= sizeof(struct tmem_handle);
 838         *sizep = size;
 839         memcpy(to_va, from_va, size);
 840
 841         kunmap_zbudpage_atomic(zbpg);
 842         ret = 0;
 843 out:
 844         zbudpage_spin_unlock(zbudpage);
 845         spin_unlock(lists_lock);
 846         return ret;
 847 }
 848
 849 /*
 850  * Given a zbudref and a kernel pointer, copy the data from
 851  * the kernel pointer to the zbud.
 852  */
 853 int zbud_copy_to_zbud(struct zbudref *zref, char *from_va, bool eph)
 854 {
 855         struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
 856         unsigned long budnum = zbudref_budnum(zref);
 857         void *zbpg;
 858         char *to_va;
 859         unsigned size;
 860         int ret = -1;
 861         spinlock_t *lists_lock =
 862                 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
 863
 864         spin_lock(lists_lock);
 865         zbudpage_spin_lock(zbudpage);
 866         if (zbudpage_is_dying(zbudpage)) {
 867                 /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
 868                 goto out;
 869         }
 870         zbpg = kmap_zbudpage_atomic(zbudpage);
 871         if (budnum == 0)
 872                 size = zbudpage->zbud0_size;
 873         else
 874                 size = zbudpage->zbud1_size;
 875         BUG_ON(size == 0 || size > zbud_max_size());
 876         to_va = zbud_data(zbpg, budnum, size);
 877         to_va += sizeof(struct tmem_handle);
 878         size -= sizeof(struct tmem_handle);
 879         memcpy(to_va, from_va, size);
 880
 881         kunmap_zbudpage_atomic(zbpg);
 882         ret = 0;
 883 out:
 884         zbudpage_spin_unlock(zbudpage);
 885         spin_unlock(lists_lock);
 886         return ret;
 887 }
 888
 889 /*
 890  * Choose an ephemeral LRU zbudpage that is evictable (not locked), ensure
 891  * there are no references to it remaining, and return the now unused
 892  * (and re-init'ed) struct page and the total amount of compressed
 893  * data that was evicted.
 894  */
 895 struct page *zbud_evict_pageframe_lru(unsigned int *zsize, unsigned int *zpages)
 896 {
 897         struct zbudpage *zbudpage = NULL, *zbudpage2;
 898         struct zbud_unbuddied *unbud = zbud_eph_unbuddied;
 899         struct page *page = NULL;
 900         bool irqs_disabled = irqs_disabled();
 901
 902         /*
 903          * Since this can be called indirectly from cleancache_put, which
 904          * has interrupts disabled, as well as frontswap_put, which does not,
 905          * we need to be able to handle both cases, even though it is ugly.
 906          */
 907         if (irqs_disabled)
 908                 spin_lock(&zbud_eph_lists_lock);
 909         else
 910                 spin_lock_bh(&zbud_eph_lists_lock);
 911         *zsize = 0;
 912         if (list_empty(&zbud_eph_lru_list))
 913                 goto unlock_out;
 914         list_for_each_entry_safe(zbudpage, zbudpage2, &zbud_eph_lru_list, lru) {
 915                 /* skip a locked zbudpage */
 916                 if (unlikely(!zbudpage_spin_trylock(zbudpage)))
 917                         continue;
 918                 /* skip an unevictable zbudpage */
 919                 if (unlikely(zbudpage->unevictable != 0)) {
 920                         zbudpage_spin_unlock(zbudpage);
 921                         continue;
 922                 }
 923                 /* got a locked evictable page */
 924                 goto evict_page;
 925
 926         }
 927 unlock_out:
 928         /* no unlocked evictable pages, give up */
 929         if (irqs_disabled)
 930                 spin_unlock(&zbud_eph_lists_lock);
 931         else
 932                 spin_unlock_bh(&zbud_eph_lists_lock);
 933         goto out;
 934
 935 evict_page:
 936         list_del_init(&zbudpage->budlist);
 937         list_del_init(&zbudpage->lru);
 938         zbudpage_set_dying(zbudpage);
 939         /*
 940          * the zbudpage is now "dying" and attempts to read, write,
 941          * or delete data from it will be ignored
 942          */
 943         if (zbudpage->zbud0_size != 0 && zbudpage->zbud1_size !=  0) {
 944                 *zsize = zbudpage->zbud0_size + zbudpage->zbud1_size -
 945                                 (2 * sizeof(struct tmem_handle));
 946                 *zpages = 2;
 947         } else if (zbudpage->zbud0_size != 0) {
 948                 unbud[zbud_size_to_chunks(zbudpage->zbud0_size)].count--;
 949                 *zsize = zbudpage->zbud0_size - sizeof(struct tmem_handle);
 950                 *zpages = 1;
 951         } else if (zbudpage->zbud1_size != 0) {
 952                 unbud[zbud_size_to_chunks(zbudpage->zbud1_size)].count--;
 953                 *zsize = zbudpage->zbud1_size - sizeof(struct tmem_handle);
 954                 *zpages = 1;
 955         } else {
 956                 BUG();
 957         }
 958         spin_unlock(&zbud_eph_lists_lock);
 959         zbud_eph_evicted_pageframes++;
 960         if (*zpages == 1)
 961                 zbud_eph_unbuddied_count--;
 962         else
 963                 zbud_eph_buddied_count--;
 964         zbud_evict_tmem(zbudpage);
 965         zbudpage_spin_lock(zbudpage);
 966         zbudpage_clear_dying(zbudpage);
 967         page = zbud_unuse_zbudpage(zbudpage, true);
 968         if (!irqs_disabled)
 969                 local_bh_enable();
 970 out:
 971         return page;
 972 }
 973
 974 /*
 975  * Choose a persistent LRU zbudpage that is evictable (not locked), zombify it,
 976  * read the tmem_handle(s) out of it into the passed array, and return the
 977  * number of zbuds.  Caller must perform necessary tmem functions and,
 978  * indirectly, zbud functions to fetch any valid data and cause the
 979  * now-zombified zbudpage to eventually be freed.  We track the zombified
 980  * zbudpage count so it is possible to observe if there is a leak.
 981  FIXME: describe (ramster) case where data pointers are passed in for memcpy
 982  */
 983 unsigned int zbud_make_zombie_lru(struct tmem_handle *th, unsigned char **data,
 984                                         unsigned int *zsize, bool eph)
 985 {
 986         struct zbudpage *zbudpage = NULL, *zbudpag2;
 987         struct tmem_handle *thfrom;
 988         char *from_va;
 989         void *zbpg;
 990         unsigned size;
 991         int ret = 0, i;
 992         spinlock_t *lists_lock =
 993                 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
 994         struct list_head *lru_list =
 995                 eph ? &zbud_eph_lru_list : &zbud_pers_lru_list;
 996
 997         spin_lock_bh(lists_lock);
 998         if (list_empty(lru_list))
 999                 goto out;
1000         list_for_each_entry_safe(zbudpage, zbudpag2, lru_list, lru) {
1001                 /* skip a locked zbudpage */
1002                 if (unlikely(!zbudpage_spin_trylock(zbudpage)))
1003                         continue;
1004                 /* skip an unevictable zbudpage */
1005                 if (unlikely(zbudpage->unevictable != 0)) {
1006                         zbudpage_spin_unlock(zbudpage);
1007                         continue;
1008                 }
1009                 /* got a locked evictable page */
1010                 goto zombify_page;
1011         }
1012         /* no unlocked evictable pages, give up */
1013         goto out;
1014
1015 zombify_page:
1016         /* got an unlocked evictable page, zombify it */
1017         list_del_init(&zbudpage->budlist);
1018         zbudpage_set_zombie(zbudpage);
1019         /* FIXME what accounting do I need to do here? */
1020         list_del_init(&zbudpage->lru);
1021         if (eph) {
1022                 list_add_tail(&zbudpage->lru, &zbud_eph_zombie_list);
1023                 zbud_eph_zombie_count =
1024                                 atomic_inc_return(&zbud_eph_zombie_atomic);
1025         } else {
1026                 list_add_tail(&zbudpage->lru, &zbud_pers_zombie_list);
1027                 zbud_pers_zombie_count =
1028                                 atomic_inc_return(&zbud_pers_zombie_atomic);
1029         }
1030         /* FIXME what accounting do I need to do here? */
1031         zbpg = kmap_zbudpage_atomic(zbudpage);
1032         for (i = 0; i < 2; i++) {
1033                 size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
1034                 if (size) {
1035                         from_va = zbud_data(zbpg, i, size);
1036                         thfrom = (struct tmem_handle *)from_va;
1037                         from_va += sizeof(struct tmem_handle);
1038                         size -= sizeof(struct tmem_handle);
1039                         if (th != NULL)
1040                                 th[ret] = *thfrom;
1041                         if (data != NULL)
1042                                 memcpy(data[ret], from_va, size);
1043                         if (zsize != NULL)
1044                                 *zsize++ = size;
1045                         ret++;
1046                 }
1047         }
1048         kunmap_zbudpage_atomic(zbpg);
1049         zbudpage_spin_unlock(zbudpage);
1050 out:
1051         spin_unlock_bh(lists_lock);
1052         return ret;
1053 }
1054
1055 void zbud_init(void)
1056 {
1057         int i;
1058
1059         zbud_debugfs_init();
1060         BUG_ON((sizeof(struct tmem_handle) * 2 > CHUNK_SIZE));
1061         BUG_ON(sizeof(struct zbudpage) > sizeof(struct page));
1062         for (i = 0; i < NCHUNKS; i++) {
1063                 INIT_LIST_HEAD(&zbud_eph_unbuddied[i].list);
1064                 INIT_LIST_HEAD(&zbud_pers_unbuddied[i].list);
1065         }
1066 }