Pileus Git - ~andy/linux/blob - drivers/staging/zcache/zcache-main.c

   1 /*
   2  * zcache.c
   3  *
   4  * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp.
   5  * Copyright (c) 2010,2011, Nitin Gupta
   6  *
   7  * Zcache provides an in-kernel "host implementation" for transcendent memory
   8  * and, thus indirectly, for cleancache and frontswap.  Zcache includes two
   9  * page-accessible memory [1] interfaces, both utilizing the crypto compression
  10  * API:
  11  * 1) "compression buddies" ("zbud") is used for ephemeral pages
  12  * 2) zsmalloc is used for persistent pages.
  13  * Xvmalloc (based on the TLSF allocator) has very low fragmentation
  14  * so maximizes space efficiency, while zbud allows pairs (and potentially,
  15  * in the future, more than a pair of) compressed pages to be closely linked
  16  * so that reclaiming can be done via the kernel's physical-page-oriented
  17  * "shrinker" interface.
  18  *
  19  * [1] For a definition of page-accessible memory (aka PAM), see:
  20  *   http://marc.info/?l=linux-mm&m=127811271605009
  21  */
  22
  23 #include <linux/module.h>
  24 #include <linux/cpu.h>
  25 #include <linux/highmem.h>
  26 #include <linux/list.h>
  27 #include <linux/slab.h>
  28 #include <linux/spinlock.h>
  29 #include <linux/types.h>
  30 #include <linux/atomic.h>
  31 #include <linux/math64.h>
  32 #include <linux/crypto.h>
  33 #include <linux/string.h>
  34 #include <linux/idr.h>
  35 #include "tmem.h"
  36
  37 #include "../zsmalloc/zsmalloc.h"
  38
  39 #if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))
  40 #error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP"
  41 #endif
  42 #ifdef CONFIG_CLEANCACHE
  43 #include <linux/cleancache.h>
  44 #endif
  45 #ifdef CONFIG_FRONTSWAP
  46 #include <linux/frontswap.h>
  47 #endif
  48
  49 #if 0
  50 /* this is more aggressive but may cause other problems? */
  51 #define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
  52 #else
  53 #define ZCACHE_GFP_MASK \
  54         (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
  55 #endif
  56
  57 #define MAX_CLIENTS 16
  58 #define LOCAL_CLIENT ((uint16_t)-1)
  59
  60 MODULE_LICENSE("GPL");
  61
  62 struct zcache_client {
  63         struct idr tmem_pools;
  64         struct zs_pool *zspool;
  65         bool allocated;
  66         atomic_t refcount;
  67 };
  68
  69 static struct zcache_client zcache_host;
  70 static struct zcache_client zcache_clients[MAX_CLIENTS];
  71
  72 static inline uint16_t get_client_id_from_client(struct zcache_client *cli)
  73 {
  74         BUG_ON(cli == NULL);
  75         if (cli == &zcache_host)
  76                 return LOCAL_CLIENT;
  77         return cli - &zcache_clients[0];
  78 }
  79
  80 static inline bool is_local_client(struct zcache_client *cli)
  81 {
  82         return cli == &zcache_host;
  83 }
  84
  85 /* crypto API for zcache  */
  86 #define ZCACHE_COMP_NAME_SZ CRYPTO_MAX_ALG_NAME
  87 static char zcache_comp_name[ZCACHE_COMP_NAME_SZ];
  88 static struct crypto_comp * __percpu *zcache_comp_pcpu_tfms;
  89
  90 enum comp_op {
  91         ZCACHE_COMPOP_COMPRESS,
  92         ZCACHE_COMPOP_DECOMPRESS
  93 };
  94
  95 static inline int zcache_comp_op(enum comp_op op,
  96                                 const u8 *src, unsigned int slen,
  97                                 u8 *dst, unsigned int *dlen)
  98 {
  99         struct crypto_comp *tfm;
 100         int ret;
 101
 102         BUG_ON(!zcache_comp_pcpu_tfms);
 103         tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, get_cpu());
 104         BUG_ON(!tfm);
 105         switch (op) {
 106         case ZCACHE_COMPOP_COMPRESS:
 107                 ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
 108                 break;
 109         case ZCACHE_COMPOP_DECOMPRESS:
 110                 ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
 111                 break;
 112         }
 113         put_cpu();
 114         return ret;
 115 }
 116
 117 /**********
 118  * Compression buddies ("zbud") provides for packing two (or, possibly
 119  * in the future, more) compressed ephemeral pages into a single "raw"
 120  * (physical) page and tracking them with data structures so that
 121  * the raw pages can be easily reclaimed.
 122  *
 123  * A zbud page ("zbpg") is an aligned page containing a list_head,
 124  * a lock, and two "zbud headers".  The remainder of the physical
 125  * page is divided up into aligned 64-byte "chunks" which contain
 126  * the compressed data for zero, one, or two zbuds.  Each zbpg
 127  * resides on: (1) an "unused list" if it has no zbuds; (2) a
 128  * "buddied" list if it is fully populated  with two zbuds; or
 129  * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks
 130  * the one unbuddied zbud uses.  The data inside a zbpg cannot be
 131  * read or written unless the zbpg's lock is held.
 132  */
 133
 134 #define ZBH_SENTINEL  0x43214321
 135 #define ZBPG_SENTINEL  0xdeadbeef
 136
 137 #define ZBUD_MAX_BUDS 2
 138
 139 struct zbud_hdr {
 140         uint16_t client_id;
 141         uint16_t pool_id;
 142         struct tmem_oid oid;
 143         uint32_t index;
 144         uint16_t size; /* compressed size in bytes, zero means unused */
 145         DECL_SENTINEL
 146 };
 147
 148 struct zbud_page {
 149         struct list_head bud_list;
 150         spinlock_t lock;
 151         struct zbud_hdr buddy[ZBUD_MAX_BUDS];
 152         DECL_SENTINEL
 153         /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */
 154 };
 155
 156 #define CHUNK_SHIFT     6
 157 #define CHUNK_SIZE      (1 << CHUNK_SHIFT)
 158 #define CHUNK_MASK      (~(CHUNK_SIZE-1))
 159 #define NCHUNKS         (((PAGE_SIZE - sizeof(struct zbud_page)) & \
 160                                 CHUNK_MASK) >> CHUNK_SHIFT)
 161 #define MAX_CHUNK       (NCHUNKS-1)
 162
 163 static struct {
 164         struct list_head list;
 165         unsigned count;
 166 } zbud_unbuddied[NCHUNKS];
 167 /* list N contains pages with N chunks USED and NCHUNKS-N unused */
 168 /* element 0 is never used but optimizing that isn't worth it */
 169 static unsigned long zbud_cumul_chunk_counts[NCHUNKS];
 170
 171 struct list_head zbud_buddied_list;
 172 static unsigned long zcache_zbud_buddied_count;
 173
 174 /* protects the buddied list and all unbuddied lists */
 175 static DEFINE_SPINLOCK(zbud_budlists_spinlock);
 176
 177 static LIST_HEAD(zbpg_unused_list);
 178 static unsigned long zcache_zbpg_unused_list_count;
 179
 180 /* protects the unused page list */
 181 static DEFINE_SPINLOCK(zbpg_unused_list_spinlock);
 182
 183 static atomic_t zcache_zbud_curr_raw_pages;
 184 static atomic_t zcache_zbud_curr_zpages;
 185 static unsigned long zcache_zbud_curr_zbytes;
 186 static unsigned long zcache_zbud_cumul_zpages;
 187 static unsigned long zcache_zbud_cumul_zbytes;
 188 static unsigned long zcache_compress_poor;
 189 static unsigned long zcache_mean_compress_poor;
 190
 191 /* forward references */
 192 static void *zcache_get_free_page(void);
 193 static void zcache_free_page(void *p);
 194
 195 /*
 196  * zbud helper functions
 197  */
 198
 199 static inline unsigned zbud_max_buddy_size(void)
 200 {
 201         return MAX_CHUNK << CHUNK_SHIFT;
 202 }
 203
 204 static inline unsigned zbud_size_to_chunks(unsigned size)
 205 {
 206         BUG_ON(size == 0 || size > zbud_max_buddy_size());
 207         return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
 208 }
 209
 210 static inline int zbud_budnum(struct zbud_hdr *zh)
 211 {
 212         unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1);
 213         struct zbud_page *zbpg = NULL;
 214         unsigned budnum = -1U;
 215         int i;
 216
 217         for (i = 0; i < ZBUD_MAX_BUDS; i++)
 218                 if (offset == offsetof(typeof(*zbpg), buddy[i])) {
 219                         budnum = i;
 220                         break;
 221                 }
 222         BUG_ON(budnum == -1U);
 223         return budnum;
 224 }
 225
 226 static char *zbud_data(struct zbud_hdr *zh, unsigned size)
 227 {
 228         struct zbud_page *zbpg;
 229         char *p;
 230         unsigned budnum;
 231
 232         ASSERT_SENTINEL(zh, ZBH);
 233         budnum = zbud_budnum(zh);
 234         BUG_ON(size == 0 || size > zbud_max_buddy_size());
 235         zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
 236         ASSERT_SPINLOCK(&zbpg->lock);
 237         p = (char *)zbpg;
 238         if (budnum == 0)
 239                 p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
 240                                                         CHUNK_MASK);
 241         else if (budnum == 1)
 242                 p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
 243         return p;
 244 }
 245
 246 /*
 247  * zbud raw page management
 248  */
 249
 250 static struct zbud_page *zbud_alloc_raw_page(void)
 251 {
 252         struct zbud_page *zbpg = NULL;
 253         struct zbud_hdr *zh0, *zh1;
 254         bool recycled = 0;
 255
 256         /* if any pages on the zbpg list, use one */
 257         spin_lock(&zbpg_unused_list_spinlock);
 258         if (!list_empty(&zbpg_unused_list)) {
 259                 zbpg = list_first_entry(&zbpg_unused_list,
 260                                 struct zbud_page, bud_list);
 261                 list_del_init(&zbpg->bud_list);
 262                 zcache_zbpg_unused_list_count--;
 263                 recycled = 1;
 264         }
 265         spin_unlock(&zbpg_unused_list_spinlock);
 266         if (zbpg == NULL)
 267                 /* none on zbpg list, try to get a kernel page */
 268                 zbpg = zcache_get_free_page();
 269         if (likely(zbpg != NULL)) {
 270                 INIT_LIST_HEAD(&zbpg->bud_list);
 271                 zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
 272                 spin_lock_init(&zbpg->lock);
 273                 if (recycled) {
 274                         ASSERT_INVERTED_SENTINEL(zbpg, ZBPG);
 275                         SET_SENTINEL(zbpg, ZBPG);
 276                         BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
 277                         BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
 278                 } else {
 279                         atomic_inc(&zcache_zbud_curr_raw_pages);
 280                         INIT_LIST_HEAD(&zbpg->bud_list);
 281                         SET_SENTINEL(zbpg, ZBPG);
 282                         zh0->size = 0; zh1->size = 0;
 283                         tmem_oid_set_invalid(&zh0->oid);
 284                         tmem_oid_set_invalid(&zh1->oid);
 285                 }
 286         }
 287         return zbpg;
 288 }
 289
 290 static void zbud_free_raw_page(struct zbud_page *zbpg)
 291 {
 292         struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1];
 293
 294         ASSERT_SENTINEL(zbpg, ZBPG);
 295         BUG_ON(!list_empty(&zbpg->bud_list));
 296         ASSERT_SPINLOCK(&zbpg->lock);
 297         BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
 298         BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
 299         INVERT_SENTINEL(zbpg, ZBPG);
 300         spin_unlock(&zbpg->lock);
 301         spin_lock(&zbpg_unused_list_spinlock);
 302         list_add(&zbpg->bud_list, &zbpg_unused_list);
 303         zcache_zbpg_unused_list_count++;
 304         spin_unlock(&zbpg_unused_list_spinlock);
 305 }
 306
 307 /*
 308  * core zbud handling routines
 309  */
 310
 311 static unsigned zbud_free(struct zbud_hdr *zh)
 312 {
 313         unsigned size;
 314
 315         ASSERT_SENTINEL(zh, ZBH);
 316         BUG_ON(!tmem_oid_valid(&zh->oid));
 317         size = zh->size;
 318         BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
 319         zh->size = 0;
 320         tmem_oid_set_invalid(&zh->oid);
 321         INVERT_SENTINEL(zh, ZBH);
 322         zcache_zbud_curr_zbytes -= size;
 323         atomic_dec(&zcache_zbud_curr_zpages);
 324         return size;
 325 }
 326
 327 static void zbud_free_and_delist(struct zbud_hdr *zh)
 328 {
 329         unsigned chunks;
 330         struct zbud_hdr *zh_other;
 331         unsigned budnum = zbud_budnum(zh), size;
 332         struct zbud_page *zbpg =
 333                 container_of(zh, struct zbud_page, buddy[budnum]);
 334
 335         spin_lock(&zbud_budlists_spinlock);
 336         spin_lock(&zbpg->lock);
 337         if (list_empty(&zbpg->bud_list)) {
 338                 /* ignore zombie page... see zbud_evict_pages() */
 339                 spin_unlock(&zbpg->lock);
 340                 spin_unlock(&zbud_budlists_spinlock);
 341                 return;
 342         }
 343         size = zbud_free(zh);
 344         ASSERT_SPINLOCK(&zbpg->lock);
 345         zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0];
 346         if (zh_other->size == 0) { /* was unbuddied: unlist and free */
 347                 chunks = zbud_size_to_chunks(size) ;
 348                 BUG_ON(list_empty(&zbud_unbuddied[chunks].list));
 349                 list_del_init(&zbpg->bud_list);
 350                 zbud_unbuddied[chunks].count--;
 351                 spin_unlock(&zbud_budlists_spinlock);
 352                 zbud_free_raw_page(zbpg);
 353         } else { /* was buddied: move remaining buddy to unbuddied list */
 354                 chunks = zbud_size_to_chunks(zh_other->size) ;
 355                 list_del_init(&zbpg->bud_list);
 356                 zcache_zbud_buddied_count--;
 357                 list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list);
 358                 zbud_unbuddied[chunks].count++;
 359                 spin_unlock(&zbud_budlists_spinlock);
 360                 spin_unlock(&zbpg->lock);
 361         }
 362 }
 363
 364 static struct zbud_hdr *zbud_create(uint16_t client_id, uint16_t pool_id,
 365                                         struct tmem_oid *oid,
 366                                         uint32_t index, struct page *page,
 367                                         void *cdata, unsigned size)
 368 {
 369         struct zbud_hdr *zh0, *zh1, *zh = NULL;
 370         struct zbud_page *zbpg = NULL, *ztmp;
 371         unsigned nchunks;
 372         char *to;
 373         int i, found_good_buddy = 0;
 374
 375         nchunks = zbud_size_to_chunks(size) ;
 376         for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
 377                 spin_lock(&zbud_budlists_spinlock);
 378                 if (!list_empty(&zbud_unbuddied[i].list)) {
 379                         list_for_each_entry_safe(zbpg, ztmp,
 380                                     &zbud_unbuddied[i].list, bud_list) {
 381                                 if (spin_trylock(&zbpg->lock)) {
 382                                         found_good_buddy = i;
 383                                         goto found_unbuddied;
 384                                 }
 385                         }
 386                 }
 387                 spin_unlock(&zbud_budlists_spinlock);
 388         }
 389         /* didn't find a good buddy, try allocating a new page */
 390         zbpg = zbud_alloc_raw_page();
 391         if (unlikely(zbpg == NULL))
 392                 goto out;
 393         /* ok, have a page, now compress the data before taking locks */
 394         spin_lock(&zbud_budlists_spinlock);
 395         spin_lock(&zbpg->lock);
 396         list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list);
 397         zbud_unbuddied[nchunks].count++;
 398         zh = &zbpg->buddy[0];
 399         goto init_zh;
 400
 401 found_unbuddied:
 402         ASSERT_SPINLOCK(&zbpg->lock);
 403         zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
 404         BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0)));
 405         if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */
 406                 ASSERT_SENTINEL(zh0, ZBH);
 407                 zh = zh1;
 408         } else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */
 409                 ASSERT_SENTINEL(zh1, ZBH);
 410                 zh = zh0;
 411         } else
 412                 BUG();
 413         list_del_init(&zbpg->bud_list);
 414         zbud_unbuddied[found_good_buddy].count--;
 415         list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
 416         zcache_zbud_buddied_count++;
 417
 418 init_zh:
 419         SET_SENTINEL(zh, ZBH);
 420         zh->size = size;
 421         zh->index = index;
 422         zh->oid = *oid;
 423         zh->pool_id = pool_id;
 424         zh->client_id = client_id;
 425         to = zbud_data(zh, size);
 426         memcpy(to, cdata, size);
 427         spin_unlock(&zbpg->lock);
 428         spin_unlock(&zbud_budlists_spinlock);
 429
 430         zbud_cumul_chunk_counts[nchunks]++;
 431         atomic_inc(&zcache_zbud_curr_zpages);
 432         zcache_zbud_cumul_zpages++;
 433         zcache_zbud_curr_zbytes += size;
 434         zcache_zbud_cumul_zbytes += size;
 435 out:
 436         return zh;
 437 }
 438
 439 static int zbud_decompress(struct page *page, struct zbud_hdr *zh)
 440 {
 441         struct zbud_page *zbpg;
 442         unsigned budnum = zbud_budnum(zh);
 443         unsigned int out_len = PAGE_SIZE;
 444         char *to_va, *from_va;
 445         unsigned size;
 446         int ret = 0;
 447
 448         zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
 449         spin_lock(&zbpg->lock);
 450         if (list_empty(&zbpg->bud_list)) {
 451                 /* ignore zombie page... see zbud_evict_pages() */
 452                 ret = -EINVAL;
 453                 goto out;
 454         }
 455         ASSERT_SENTINEL(zh, ZBH);
 456         BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
 457         to_va = kmap_atomic(page);
 458         size = zh->size;
 459         from_va = zbud_data(zh, size);
 460         ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, from_va, size,
 461                                 to_va, &out_len);
 462         BUG_ON(ret);
 463         BUG_ON(out_len != PAGE_SIZE);
 464         kunmap_atomic(to_va);
 465 out:
 466         spin_unlock(&zbpg->lock);
 467         return ret;
 468 }
 469
 470 /*
 471  * The following routines handle shrinking of ephemeral pages by evicting
 472  * pages "least valuable" first.
 473  */
 474
 475 static unsigned long zcache_evicted_raw_pages;
 476 static unsigned long zcache_evicted_buddied_pages;
 477 static unsigned long zcache_evicted_unbuddied_pages;
 478
 479 static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id,
 480                                                 uint16_t poolid);
 481 static void zcache_put_pool(struct tmem_pool *pool);
 482
 483 /*
 484  * Flush and free all zbuds in a zbpg, then free the pageframe
 485  */
 486 static void zbud_evict_zbpg(struct zbud_page *zbpg)
 487 {
 488         struct zbud_hdr *zh;
 489         int i, j;
 490         uint32_t pool_id[ZBUD_MAX_BUDS], client_id[ZBUD_MAX_BUDS];
 491         uint32_t index[ZBUD_MAX_BUDS];
 492         struct tmem_oid oid[ZBUD_MAX_BUDS];
 493         struct tmem_pool *pool;
 494
 495         ASSERT_SPINLOCK(&zbpg->lock);
 496         BUG_ON(!list_empty(&zbpg->bud_list));
 497         for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) {
 498                 zh = &zbpg->buddy[i];
 499                 if (zh->size) {
 500                         client_id[j] = zh->client_id;
 501                         pool_id[j] = zh->pool_id;
 502                         oid[j] = zh->oid;
 503                         index[j] = zh->index;
 504                         j++;
 505                         zbud_free(zh);
 506                 }
 507         }
 508         spin_unlock(&zbpg->lock);
 509         for (i = 0; i < j; i++) {
 510                 pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
 511                 if (pool != NULL) {
 512                         tmem_flush_page(pool, &oid[i], index[i]);
 513                         zcache_put_pool(pool);
 514                 }
 515         }
 516         ASSERT_SENTINEL(zbpg, ZBPG);
 517         spin_lock(&zbpg->lock);
 518         zbud_free_raw_page(zbpg);
 519 }
 520
 521 /*
 522  * Free nr pages.  This code is funky because we want to hold the locks
 523  * protecting various lists for as short a time as possible, and in some
 524  * circumstances the list may change asynchronously when the list lock is
 525  * not held.  In some cases we also trylock not only to avoid waiting on a
 526  * page in use by another cpu, but also to avoid potential deadlock due to
 527  * lock inversion.
 528  */
 529 static void zbud_evict_pages(int nr)
 530 {
 531         struct zbud_page *zbpg;
 532         int i;
 533
 534         /* first try freeing any pages on unused list */
 535 retry_unused_list:
 536         spin_lock_bh(&zbpg_unused_list_spinlock);
 537         if (!list_empty(&zbpg_unused_list)) {
 538                 /* can't walk list here, since it may change when unlocked */
 539                 zbpg = list_first_entry(&zbpg_unused_list,
 540                                 struct zbud_page, bud_list);
 541                 list_del_init(&zbpg->bud_list);
 542                 zcache_zbpg_unused_list_count--;
 543                 atomic_dec(&zcache_zbud_curr_raw_pages);
 544                 spin_unlock_bh(&zbpg_unused_list_spinlock);
 545                 zcache_free_page(zbpg);
 546                 zcache_evicted_raw_pages++;
 547                 if (--nr <= 0)
 548                         goto out;
 549                 goto retry_unused_list;
 550         }
 551         spin_unlock_bh(&zbpg_unused_list_spinlock);
 552
 553         /* now try freeing unbuddied pages, starting with least space avail */
 554         for (i = 0; i < MAX_CHUNK; i++) {
 555 retry_unbud_list_i:
 556                 spin_lock_bh(&zbud_budlists_spinlock);
 557                 if (list_empty(&zbud_unbuddied[i].list)) {
 558                         spin_unlock_bh(&zbud_budlists_spinlock);
 559                         continue;
 560                 }
 561                 list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
 562                         if (unlikely(!spin_trylock(&zbpg->lock)))
 563                                 continue;
 564                         list_del_init(&zbpg->bud_list);
 565                         zbud_unbuddied[i].count--;
 566                         spin_unlock(&zbud_budlists_spinlock);
 567                         zcache_evicted_unbuddied_pages++;
 568                         /* want budlists unlocked when doing zbpg eviction */
 569                         zbud_evict_zbpg(zbpg);
 570                         local_bh_enable();
 571                         if (--nr <= 0)
 572                                 goto out;
 573                         goto retry_unbud_list_i;
 574                 }
 575                 spin_unlock_bh(&zbud_budlists_spinlock);
 576         }
 577
 578         /* as a last resort, free buddied pages */
 579 retry_bud_list:
 580         spin_lock_bh(&zbud_budlists_spinlock);
 581         if (list_empty(&zbud_buddied_list)) {
 582                 spin_unlock_bh(&zbud_budlists_spinlock);
 583                 goto out;
 584         }
 585         list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
 586                 if (unlikely(!spin_trylock(&zbpg->lock)))
 587                         continue;
 588                 list_del_init(&zbpg->bud_list);
 589                 zcache_zbud_buddied_count--;
 590                 spin_unlock(&zbud_budlists_spinlock);
 591                 zcache_evicted_buddied_pages++;
 592                 /* want budlists unlocked when doing zbpg eviction */
 593                 zbud_evict_zbpg(zbpg);
 594                 local_bh_enable();
 595                 if (--nr <= 0)
 596                         goto out;
 597                 goto retry_bud_list;
 598         }
 599         spin_unlock_bh(&zbud_budlists_spinlock);
 600 out:
 601         return;
 602 }
 603
 604 static void zbud_init(void)
 605 {
 606         int i;
 607
 608         INIT_LIST_HEAD(&zbud_buddied_list);
 609         zcache_zbud_buddied_count = 0;
 610         for (i = 0; i < NCHUNKS; i++) {
 611                 INIT_LIST_HEAD(&zbud_unbuddied[i].list);
 612                 zbud_unbuddied[i].count = 0;
 613         }
 614 }
 615
 616 #ifdef CONFIG_SYSFS
 617 /*
 618  * These sysfs routines show a nice distribution of how many zbpg's are
 619  * currently (and have ever been placed) in each unbuddied list.  It's fun
 620  * to watch but can probably go away before final merge.
 621  */
 622 static int zbud_show_unbuddied_list_counts(char *buf)
 623 {
 624         int i;
 625         char *p = buf;
 626
 627         for (i = 0; i < NCHUNKS; i++)
 628                 p += sprintf(p, "%u ", zbud_unbuddied[i].count);
 629         return p - buf;
 630 }
 631
 632 static int zbud_show_cumul_chunk_counts(char *buf)
 633 {
 634         unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0;
 635         unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0;
 636         unsigned long total_chunks_lte_42 = 0;
 637         char *p = buf;
 638
 639         for (i = 0; i < NCHUNKS; i++) {
 640                 p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]);
 641                 chunks += zbud_cumul_chunk_counts[i];
 642                 total_chunks += zbud_cumul_chunk_counts[i];
 643                 sum_total_chunks += i * zbud_cumul_chunk_counts[i];
 644                 if (i == 21)
 645                         total_chunks_lte_21 = total_chunks;
 646                 if (i == 32)
 647                         total_chunks_lte_32 = total_chunks;
 648                 if (i == 42)
 649                         total_chunks_lte_42 = total_chunks;
 650         }
 651         p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
 652                 total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42,
 653                 chunks == 0 ? 0 : sum_total_chunks / chunks);
 654         return p - buf;
 655 }
 656 #endif
 657
 658 /**********
 659  * This "zv" PAM implementation combines the slab-based zsmalloc
 660  * with the crypto compression API to maximize the amount of data that can
 661  * be packed into a physical page.
 662  *
 663  * Zv represents a PAM page with the index and object (plus a "size" value
 664  * necessary for decompression) immediately preceding the compressed data.
 665  */
 666
 667 #define ZVH_SENTINEL  0x43214321
 668
 669 struct zv_hdr {
 670         uint32_t pool_id;
 671         struct tmem_oid oid;
 672         uint32_t index;
 673         size_t size;
 674         DECL_SENTINEL
 675 };
 676
 677 /* rudimentary policy limits */
 678 /* total number of persistent pages may not exceed this percentage */
 679 static unsigned int zv_page_count_policy_percent = 75;
 680 /*
 681  * byte count defining poor compression; pages with greater zsize will be
 682  * rejected
 683  */
 684 static unsigned int zv_max_zsize = (PAGE_SIZE / 8) * 7;
 685 /*
 686  * byte count defining poor *mean* compression; pages with greater zsize
 687  * will be rejected until sufficient better-compressed pages are accepted
 688  * driving the mean below this threshold
 689  */
 690 static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5;
 691
 692 static atomic_t zv_curr_dist_counts[NCHUNKS];
 693 static atomic_t zv_cumul_dist_counts[NCHUNKS];
 694
 695 static unsigned long zv_create(struct zs_pool *pool, uint32_t pool_id,
 696                                 struct tmem_oid *oid, uint32_t index,
 697                                 void *cdata, unsigned clen)
 698 {
 699         struct zv_hdr *zv;
 700         u32 size = clen + sizeof(struct zv_hdr);
 701         int chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
 702         unsigned long handle = 0;
 703
 704         BUG_ON(!irqs_disabled());
 705         BUG_ON(chunks >= NCHUNKS);
 706         handle = zs_malloc(pool, size);
 707         if (!handle)
 708                 goto out;
 709         atomic_inc(&zv_curr_dist_counts[chunks]);
 710         atomic_inc(&zv_cumul_dist_counts[chunks]);
 711         zv = zs_map_object(pool, handle);
 712         zv->index = index;
 713         zv->oid = *oid;
 714         zv->pool_id = pool_id;
 715         zv->size = clen;
 716         SET_SENTINEL(zv, ZVH);
 717         memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen);
 718         zs_unmap_object(pool, handle);
 719 out:
 720         return handle;
 721 }
 722
 723 static void zv_free(struct zs_pool *pool, unsigned long handle)
 724 {
 725         unsigned long flags;
 726         struct zv_hdr *zv;
 727         uint16_t size;
 728         int chunks;
 729
 730         zv = zs_map_object(pool, handle);
 731         ASSERT_SENTINEL(zv, ZVH);
 732         size = zv->size + sizeof(struct zv_hdr);
 733         INVERT_SENTINEL(zv, ZVH);
 734         zs_unmap_object(pool, handle);
 735
 736         chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
 737         BUG_ON(chunks >= NCHUNKS);
 738         atomic_dec(&zv_curr_dist_counts[chunks]);
 739
 740         local_irq_save(flags);
 741         zs_free(pool, handle);
 742         local_irq_restore(flags);
 743 }
 744
 745 static void zv_decompress(struct page *page, unsigned long handle)
 746 {
 747         unsigned int clen = PAGE_SIZE;
 748         char *to_va;
 749         int ret;
 750         struct zv_hdr *zv;
 751
 752         zv = zs_map_object(zcache_host.zspool, handle);
 753         BUG_ON(zv->size == 0);
 754         ASSERT_SENTINEL(zv, ZVH);
 755         to_va = kmap_atomic(page);
 756         ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, (char *)zv + sizeof(*zv),
 757                                 zv->size, to_va, &clen);
 758         kunmap_atomic(to_va);
 759         zs_unmap_object(zcache_host.zspool, handle);
 760         BUG_ON(ret);
 761         BUG_ON(clen != PAGE_SIZE);
 762 }
 763
 764 #ifdef CONFIG_SYSFS
 765 /*
 766  * show a distribution of compression stats for zv pages.
 767  */
 768
 769 static int zv_curr_dist_counts_show(char *buf)
 770 {
 771         unsigned long i, n, chunks = 0, sum_total_chunks = 0;
 772         char *p = buf;
 773
 774         for (i = 0; i < NCHUNKS; i++) {
 775                 n = atomic_read(&zv_curr_dist_counts[i]);
 776                 p += sprintf(p, "%lu ", n);
 777                 chunks += n;
 778                 sum_total_chunks += i * n;
 779         }
 780         p += sprintf(p, "mean:%lu\n",
 781                 chunks == 0 ? 0 : sum_total_chunks / chunks);
 782         return p - buf;
 783 }
 784
 785 static int zv_cumul_dist_counts_show(char *buf)
 786 {
 787         unsigned long i, n, chunks = 0, sum_total_chunks = 0;
 788         char *p = buf;
 789
 790         for (i = 0; i < NCHUNKS; i++) {
 791                 n = atomic_read(&zv_cumul_dist_counts[i]);
 792                 p += sprintf(p, "%lu ", n);
 793                 chunks += n;
 794                 sum_total_chunks += i * n;
 795         }
 796         p += sprintf(p, "mean:%lu\n",
 797                 chunks == 0 ? 0 : sum_total_chunks / chunks);
 798         return p - buf;
 799 }
 800
 801 /*
 802  * setting zv_max_zsize via sysfs causes all persistent (e.g. swap)
 803  * pages that don't compress to less than this value (including metadata
 804  * overhead) to be rejected.  We don't allow the value to get too close
 805  * to PAGE_SIZE.
 806  */
 807 static ssize_t zv_max_zsize_show(struct kobject *kobj,
 808                                     struct kobj_attribute *attr,
 809                                     char *buf)
 810 {
 811         return sprintf(buf, "%u\n", zv_max_zsize);
 812 }
 813
 814 static ssize_t zv_max_zsize_store(struct kobject *kobj,
 815                                     struct kobj_attribute *attr,
 816                                     const char *buf, size_t count)
 817 {
 818         unsigned long val;
 819         int err;
 820
 821         if (!capable(CAP_SYS_ADMIN))
 822                 return -EPERM;
 823
 824         err = kstrtoul(buf, 10, &val);
 825         if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
 826                 return -EINVAL;
 827         zv_max_zsize = val;
 828         return count;
 829 }
 830
 831 /*
 832  * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap)
 833  * pages that don't compress to less than this value (including metadata
 834  * overhead) to be rejected UNLESS the mean compression is also smaller
 835  * than this value.  In other words, we are load-balancing-by-zsize the
 836  * accepted pages.  Again, we don't allow the value to get too close
 837  * to PAGE_SIZE.
 838  */
 839 static ssize_t zv_max_mean_zsize_show(struct kobject *kobj,
 840                                     struct kobj_attribute *attr,
 841                                     char *buf)
 842 {
 843         return sprintf(buf, "%u\n", zv_max_mean_zsize);
 844 }
 845
 846 static ssize_t zv_max_mean_zsize_store(struct kobject *kobj,
 847                                     struct kobj_attribute *attr,
 848                                     const char *buf, size_t count)
 849 {
 850         unsigned long val;
 851         int err;
 852
 853         if (!capable(CAP_SYS_ADMIN))
 854                 return -EPERM;
 855
 856         err = kstrtoul(buf, 10, &val);
 857         if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
 858                 return -EINVAL;
 859         zv_max_mean_zsize = val;
 860         return count;
 861 }
 862
 863 /*
 864  * setting zv_page_count_policy_percent via sysfs sets an upper bound of
 865  * persistent (e.g. swap) pages that will be retained according to:
 866  *     (zv_page_count_policy_percent * totalram_pages) / 100)
 867  * when that limit is reached, further puts will be rejected (until
 868  * some pages have been flushed).  Note that, due to compression,
 869  * this number may exceed 100; it defaults to 75 and we set an
 870  * arbitary limit of 150.  A poor choice will almost certainly result
 871  * in OOM's, so this value should only be changed prudently.
 872  */
 873 static ssize_t zv_page_count_policy_percent_show(struct kobject *kobj,
 874                                                  struct kobj_attribute *attr,
 875                                                  char *buf)
 876 {
 877         return sprintf(buf, "%u\n", zv_page_count_policy_percent);
 878 }
 879
 880 static ssize_t zv_page_count_policy_percent_store(struct kobject *kobj,
 881                                                   struct kobj_attribute *attr,
 882                                                   const char *buf, size_t count)
 883 {
 884         unsigned long val;
 885         int err;
 886
 887         if (!capable(CAP_SYS_ADMIN))
 888                 return -EPERM;
 889
 890         err = kstrtoul(buf, 10, &val);
 891         if (err || (val == 0) || (val > 150))
 892                 return -EINVAL;
 893         zv_page_count_policy_percent = val;
 894         return count;
 895 }
 896
 897 static struct kobj_attribute zcache_zv_max_zsize_attr = {
 898                 .attr = { .name = "zv_max_zsize", .mode = 0644 },
 899                 .show = zv_max_zsize_show,
 900                 .store = zv_max_zsize_store,
 901 };
 902
 903 static struct kobj_attribute zcache_zv_max_mean_zsize_attr = {
 904                 .attr = { .name = "zv_max_mean_zsize", .mode = 0644 },
 905                 .show = zv_max_mean_zsize_show,
 906                 .store = zv_max_mean_zsize_store,
 907 };
 908
 909 static struct kobj_attribute zcache_zv_page_count_policy_percent_attr = {
 910                 .attr = { .name = "zv_page_count_policy_percent",
 911                           .mode = 0644 },
 912                 .show = zv_page_count_policy_percent_show,
 913                 .store = zv_page_count_policy_percent_store,
 914 };
 915 #endif
 916
 917 /*
 918  * zcache core code starts here
 919  */
 920
 921 /* useful stats not collected by cleancache or frontswap */
 922 static unsigned long zcache_flush_total;
 923 static unsigned long zcache_flush_found;
 924 static unsigned long zcache_flobj_total;
 925 static unsigned long zcache_flobj_found;
 926 static unsigned long zcache_failed_eph_puts;
 927 static unsigned long zcache_failed_pers_puts;
 928
 929 /*
 930  * Tmem operations assume the poolid implies the invoking client.
 931  * Zcache only has one client (the kernel itself): LOCAL_CLIENT.
 932  * RAMster has each client numbered by cluster node, and a KVM version
 933  * of zcache would have one client per guest and each client might
 934  * have a poolid==N.
 935  */
 936 static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid)
 937 {
 938         struct tmem_pool *pool = NULL;
 939         struct zcache_client *cli = NULL;
 940
 941         if (cli_id == LOCAL_CLIENT)
 942                 cli = &zcache_host;
 943         else {
 944                 if (cli_id >= MAX_CLIENTS)
 945                         goto out;
 946                 cli = &zcache_clients[cli_id];
 947                 if (cli == NULL)
 948                         goto out;
 949                 atomic_inc(&cli->refcount);
 950         }
 951         pool = idr_find(&cli->tmem_pools, poolid);
 952         if (pool != NULL)
 953                 atomic_inc(&pool->refcount);
 954 out:
 955         return pool;
 956 }
 957
 958 static void zcache_put_pool(struct tmem_pool *pool)
 959 {
 960         struct zcache_client *cli = NULL;
 961
 962         if (pool == NULL)
 963                 BUG();
 964         cli = pool->client;
 965         atomic_dec(&pool->refcount);
 966         atomic_dec(&cli->refcount);
 967 }
 968
 969 int zcache_new_client(uint16_t cli_id)
 970 {
 971         struct zcache_client *cli = NULL;
 972         int ret = -1;
 973
 974         if (cli_id == LOCAL_CLIENT)
 975                 cli = &zcache_host;
 976         else if ((unsigned int)cli_id < MAX_CLIENTS)
 977                 cli = &zcache_clients[cli_id];
 978         if (cli == NULL)
 979                 goto out;
 980         if (cli->allocated)
 981                 goto out;
 982         cli->allocated = 1;
 983 #ifdef CONFIG_FRONTSWAP
 984         cli->zspool = zs_create_pool("zcache", ZCACHE_GFP_MASK);
 985         if (cli->zspool == NULL)
 986                 goto out;
 987         idr_init(&cli->tmem_pools);
 988 #endif
 989         ret = 0;
 990 out:
 991         return ret;
 992 }
 993
 994 /* counters for debugging */
 995 static unsigned long zcache_failed_get_free_pages;
 996 static unsigned long zcache_failed_alloc;
 997 static unsigned long zcache_put_to_flush;
 998
 999 /*
1000  * for now, used named slabs so can easily track usage; later can
1001  * either just use kmalloc, or perhaps add a slab-like allocator
1002  * to more carefully manage total memory utilization
1003  */
1004 static struct kmem_cache *zcache_objnode_cache;
1005 static struct kmem_cache *zcache_obj_cache;
1006 static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0);
1007 static unsigned long zcache_curr_obj_count_max;
1008 static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0);
1009 static unsigned long zcache_curr_objnode_count_max;
1010
1011 /*
1012  * to avoid memory allocation recursion (e.g. due to direct reclaim), we
1013  * preload all necessary data structures so the hostops callbacks never
1014  * actually do a malloc
1015  */
1016 struct zcache_preload {
1017         void *page;
1018         struct tmem_obj *obj;
1019         int nr;
1020         struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH];
1021 };
1022 static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };
1023
1024 static int zcache_do_preload(struct tmem_pool *pool)
1025 {
1026         struct zcache_preload *kp;
1027         struct tmem_objnode *objnode;
1028         struct tmem_obj *obj;
1029         void *page;
1030         int ret = -ENOMEM;
1031
1032         if (unlikely(zcache_objnode_cache == NULL))
1033                 goto out;
1034         if (unlikely(zcache_obj_cache == NULL))
1035                 goto out;
1036         preempt_disable();
1037         kp = &__get_cpu_var(zcache_preloads);
1038         while (kp->nr < ARRAY_SIZE(kp->objnodes)) {
1039                 preempt_enable_no_resched();
1040                 objnode = kmem_cache_alloc(zcache_objnode_cache,
1041                                 ZCACHE_GFP_MASK);
1042                 if (unlikely(objnode == NULL)) {
1043                         zcache_failed_alloc++;
1044                         goto out;
1045                 }
1046                 preempt_disable();
1047                 kp = &__get_cpu_var(zcache_preloads);
1048                 if (kp->nr < ARRAY_SIZE(kp->objnodes))
1049                         kp->objnodes[kp->nr++] = objnode;
1050                 else
1051                         kmem_cache_free(zcache_objnode_cache, objnode);
1052         }
1053         preempt_enable_no_resched();
1054         obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK);
1055         if (unlikely(obj == NULL)) {
1056                 zcache_failed_alloc++;
1057                 goto out;
1058         }
1059         page = (void *)__get_free_page(ZCACHE_GFP_MASK);
1060         if (unlikely(page == NULL)) {
1061                 zcache_failed_get_free_pages++;
1062                 kmem_cache_free(zcache_obj_cache, obj);
1063                 goto out;
1064         }
1065         preempt_disable();
1066         kp = &__get_cpu_var(zcache_preloads);
1067         if (kp->obj == NULL)
1068                 kp->obj = obj;
1069         else
1070                 kmem_cache_free(zcache_obj_cache, obj);
1071         if (kp->page == NULL)
1072                 kp->page = page;
1073         else
1074                 free_page((unsigned long)page);
1075         ret = 0;
1076 out:
1077         return ret;
1078 }
1079
1080 static void *zcache_get_free_page(void)
1081 {
1082         struct zcache_preload *kp;
1083         void *page;
1084
1085         kp = &__get_cpu_var(zcache_preloads);
1086         page = kp->page;
1087         BUG_ON(page == NULL);
1088         kp->page = NULL;
1089         return page;
1090 }
1091
1092 static void zcache_free_page(void *p)
1093 {
1094         free_page((unsigned long)p);
1095 }
1096
1097 /*
1098  * zcache implementation for tmem host ops
1099  */
1100
1101 static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool)
1102 {
1103         struct tmem_objnode *objnode = NULL;
1104         unsigned long count;
1105         struct zcache_preload *kp;
1106
1107         kp = &__get_cpu_var(zcache_preloads);
1108         if (kp->nr <= 0)
1109                 goto out;
1110         objnode = kp->objnodes[kp->nr - 1];
1111         BUG_ON(objnode == NULL);
1112         kp->objnodes[kp->nr - 1] = NULL;
1113         kp->nr--;
1114         count = atomic_inc_return(&zcache_curr_objnode_count);
1115         if (count > zcache_curr_objnode_count_max)
1116                 zcache_curr_objnode_count_max = count;
1117 out:
1118         return objnode;
1119 }
1120
1121 static void zcache_objnode_free(struct tmem_objnode *objnode,
1122                                         struct tmem_pool *pool)
1123 {
1124         atomic_dec(&zcache_curr_objnode_count);
1125         BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0);
1126         kmem_cache_free(zcache_objnode_cache, objnode);
1127 }
1128
1129 static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool)
1130 {
1131         struct tmem_obj *obj = NULL;
1132         unsigned long count;
1133         struct zcache_preload *kp;
1134
1135         kp = &__get_cpu_var(zcache_preloads);
1136         obj = kp->obj;
1137         BUG_ON(obj == NULL);
1138         kp->obj = NULL;
1139         count = atomic_inc_return(&zcache_curr_obj_count);
1140         if (count > zcache_curr_obj_count_max)
1141                 zcache_curr_obj_count_max = count;
1142         return obj;
1143 }
1144
1145 static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
1146 {
1147         atomic_dec(&zcache_curr_obj_count);
1148         BUG_ON(atomic_read(&zcache_curr_obj_count) < 0);
1149         kmem_cache_free(zcache_obj_cache, obj);
1150 }
1151
1152 static struct tmem_hostops zcache_hostops = {
1153         .obj_alloc = zcache_obj_alloc,
1154         .obj_free = zcache_obj_free,
1155         .objnode_alloc = zcache_objnode_alloc,
1156         .objnode_free = zcache_objnode_free,
1157 };
1158
1159 /*
1160  * zcache implementations for PAM page descriptor ops
1161  */
1162
1163 static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0);
1164 static unsigned long zcache_curr_eph_pampd_count_max;
1165 static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0);
1166 static unsigned long zcache_curr_pers_pampd_count_max;
1167
1168 /* forward reference */
1169 static int zcache_compress(struct page *from, void **out_va, unsigned *out_len);
1170
1171 static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph,
1172                                 struct tmem_pool *pool, struct tmem_oid *oid,
1173                                  uint32_t index)
1174 {
1175         void *pampd = NULL, *cdata;
1176         unsigned clen;
1177         int ret;
1178         unsigned long count;
1179         struct page *page = (struct page *)(data);
1180         struct zcache_client *cli = pool->client;
1181         uint16_t client_id = get_client_id_from_client(cli);
1182         unsigned long zv_mean_zsize;
1183         unsigned long curr_pers_pampd_count;
1184         u64 total_zsize;
1185
1186         if (eph) {
1187                 ret = zcache_compress(page, &cdata, &clen);
1188                 if (ret == 0)
1189                         goto out;
1190                 if (clen == 0 || clen > zbud_max_buddy_size()) {
1191                         zcache_compress_poor++;
1192                         goto out;
1193                 }
1194                 pampd = (void *)zbud_create(client_id, pool->pool_id, oid,
1195                                                 index, page, cdata, clen);
1196                 if (pampd != NULL) {
1197                         count = atomic_inc_return(&zcache_curr_eph_pampd_count);
1198                         if (count > zcache_curr_eph_pampd_count_max)
1199                                 zcache_curr_eph_pampd_count_max = count;
1200                 }
1201         } else {
1202                 curr_pers_pampd_count =
1203                         atomic_read(&zcache_curr_pers_pampd_count);
1204                 if (curr_pers_pampd_count >
1205                     (zv_page_count_policy_percent * totalram_pages) / 100)
1206                         goto out;
1207                 ret = zcache_compress(page, &cdata, &clen);
1208                 if (ret == 0)
1209                         goto out;
1210                 /* reject if compression is too poor */
1211                 if (clen > zv_max_zsize) {
1212                         zcache_compress_poor++;
1213                         goto out;
1214                 }
1215                 /* reject if mean compression is too poor */
1216                 if ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) {
1217                         total_zsize = zs_get_total_size_bytes(cli->zspool);
1218                         zv_mean_zsize = div_u64(total_zsize,
1219                                                 curr_pers_pampd_count);
1220                         if (zv_mean_zsize > zv_max_mean_zsize) {
1221                                 zcache_mean_compress_poor++;
1222                                 goto out;
1223                         }
1224                 }
1225                 pampd = (void *)zv_create(cli->zspool, pool->pool_id,
1226                                                 oid, index, cdata, clen);
1227                 if (pampd == NULL)
1228                         goto out;
1229                 count = atomic_inc_return(&zcache_curr_pers_pampd_count);
1230                 if (count > zcache_curr_pers_pampd_count_max)
1231                         zcache_curr_pers_pampd_count_max = count;
1232         }
1233 out:
1234         return pampd;
1235 }
1236
1237 /*
1238  * fill the pageframe corresponding to the struct page with the data
1239  * from the passed pampd
1240  */
1241 static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw,
1242                                         void *pampd, struct tmem_pool *pool,
1243                                         struct tmem_oid *oid, uint32_t index)
1244 {
1245         int ret = 0;
1246
1247         BUG_ON(is_ephemeral(pool));
1248         zv_decompress((struct page *)(data), (unsigned long)pampd);
1249         return ret;
1250 }
1251
1252 /*
1253  * fill the pageframe corresponding to the struct page with the data
1254  * from the passed pampd
1255  */
1256 static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw,
1257                                         void *pampd, struct tmem_pool *pool,
1258                                         struct tmem_oid *oid, uint32_t index)
1259 {
1260         int ret = 0;
1261
1262         BUG_ON(!is_ephemeral(pool));
1263         zbud_decompress((struct page *)(data), pampd);
1264         zbud_free_and_delist((struct zbud_hdr *)pampd);
1265         atomic_dec(&zcache_curr_eph_pampd_count);
1266         return ret;
1267 }
1268
1269 /*
1270  * free the pampd and remove it from any zcache lists
1271  * pampd must no longer be pointed to from any tmem data structures!
1272  */
1273 static void zcache_pampd_free(void *pampd, struct tmem_pool *pool,
1274                                 struct tmem_oid *oid, uint32_t index)
1275 {
1276         struct zcache_client *cli = pool->client;
1277
1278         if (is_ephemeral(pool)) {
1279                 zbud_free_and_delist((struct zbud_hdr *)pampd);
1280                 atomic_dec(&zcache_curr_eph_pampd_count);
1281                 BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0);
1282         } else {
1283                 zv_free(cli->zspool, (unsigned long)pampd);
1284                 atomic_dec(&zcache_curr_pers_pampd_count);
1285                 BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0);
1286         }
1287 }
1288
1289 static void zcache_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj)
1290 {
1291 }
1292
1293 static void zcache_pampd_new_obj(struct tmem_obj *obj)
1294 {
1295 }
1296
1297 static int zcache_pampd_replace_in_obj(void *pampd, struct tmem_obj *obj)
1298 {
1299         return -1;
1300 }
1301
1302 static bool zcache_pampd_is_remote(void *pampd)
1303 {
1304         return 0;
1305 }
1306
1307 static struct tmem_pamops zcache_pamops = {
1308         .create = zcache_pampd_create,
1309         .get_data = zcache_pampd_get_data,
1310         .get_data_and_free = zcache_pampd_get_data_and_free,
1311         .free = zcache_pampd_free,
1312         .free_obj = zcache_pampd_free_obj,
1313         .new_obj = zcache_pampd_new_obj,
1314         .replace_in_obj = zcache_pampd_replace_in_obj,
1315         .is_remote = zcache_pampd_is_remote,
1316 };
1317
1318 /*
1319  * zcache compression/decompression and related per-cpu stuff
1320  */
1321
1322 static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);
1323 #define ZCACHE_DSTMEM_ORDER 1
1324
1325 static int zcache_compress(struct page *from, void **out_va, unsigned *out_len)
1326 {
1327         int ret = 0;
1328         unsigned char *dmem = __get_cpu_var(zcache_dstmem);
1329         char *from_va;
1330
1331         BUG_ON(!irqs_disabled());
1332         if (unlikely(dmem == NULL))
1333                 goto out;  /* no buffer or no compressor so can't compress */
1334         *out_len = PAGE_SIZE << ZCACHE_DSTMEM_ORDER;
1335         from_va = kmap_atomic(from);
1336         mb();
1337         ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, from_va, PAGE_SIZE, dmem,
1338                                 out_len);
1339         BUG_ON(ret);
1340         *out_va = dmem;
1341         kunmap_atomic(from_va);
1342         ret = 1;
1343 out:
1344         return ret;
1345 }
1346
1347 static int zcache_comp_cpu_up(int cpu)
1348 {
1349         struct crypto_comp *tfm;
1350
1351         tfm = crypto_alloc_comp(zcache_comp_name, 0, 0);
1352         if (IS_ERR(tfm))
1353                 return NOTIFY_BAD;
1354         *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = tfm;
1355         return NOTIFY_OK;
1356 }
1357
1358 static void zcache_comp_cpu_down(int cpu)
1359 {
1360         struct crypto_comp *tfm;
1361
1362         tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu);
1363         crypto_free_comp(tfm);
1364         *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL;
1365 }
1366
1367 static int zcache_cpu_notifier(struct notifier_block *nb,
1368                                 unsigned long action, void *pcpu)
1369 {
1370         int ret, cpu = (long)pcpu;
1371         struct zcache_preload *kp;
1372
1373         switch (action) {
1374         case CPU_UP_PREPARE:
1375                 ret = zcache_comp_cpu_up(cpu);
1376                 if (ret != NOTIFY_OK) {
1377                         pr_err("zcache: can't allocate compressor transform\n");
1378                         return ret;
1379                 }
1380                 per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages(
1381                         GFP_KERNEL | __GFP_REPEAT, ZCACHE_DSTMEM_ORDER);
1382                 break;
1383         case CPU_DEAD:
1384         case CPU_UP_CANCELED:
1385                 zcache_comp_cpu_down(cpu);
1386                 free_pages((unsigned long)per_cpu(zcache_dstmem, cpu),
1387                         ZCACHE_DSTMEM_ORDER);
1388                 per_cpu(zcache_dstmem, cpu) = NULL;
1389                 kp = &per_cpu(zcache_preloads, cpu);
1390                 while (kp->nr) {
1391                         kmem_cache_free(zcache_objnode_cache,
1392                                         kp->objnodes[kp->nr - 1]);
1393                         kp->objnodes[kp->nr - 1] = NULL;
1394                         kp->nr--;
1395                 }
1396                 if (kp->obj) {
1397                         kmem_cache_free(zcache_obj_cache, kp->obj);
1398                         kp->obj = NULL;
1399                 }
1400                 if (kp->page) {
1401                         free_page((unsigned long)kp->page);
1402                         kp->page = NULL;
1403                 }
1404                 break;
1405         default:
1406                 break;
1407         }
1408         return NOTIFY_OK;
1409 }
1410
1411 static struct notifier_block zcache_cpu_notifier_block = {
1412         .notifier_call = zcache_cpu_notifier
1413 };
1414
1415 #ifdef CONFIG_SYSFS
1416 #define ZCACHE_SYSFS_RO(_name) \
1417         static ssize_t zcache_##_name##_show(struct kobject *kobj, \
1418                                 struct kobj_attribute *attr, char *buf) \
1419         { \
1420                 return sprintf(buf, "%lu\n", zcache_##_name); \
1421         } \
1422         static struct kobj_attribute zcache_##_name##_attr = { \
1423                 .attr = { .name = __stringify(_name), .mode = 0444 }, \
1424                 .show = zcache_##_name##_show, \
1425         }
1426
1427 #define ZCACHE_SYSFS_RO_ATOMIC(_name) \
1428         static ssize_t zcache_##_name##_show(struct kobject *kobj, \
1429                                 struct kobj_attribute *attr, char *buf) \
1430         { \
1431             return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
1432         } \
1433         static struct kobj_attribute zcache_##_name##_attr = { \
1434                 .attr = { .name = __stringify(_name), .mode = 0444 }, \
1435                 .show = zcache_##_name##_show, \
1436         }
1437
1438 #define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \
1439         static ssize_t zcache_##_name##_show(struct kobject *kobj, \
1440                                 struct kobj_attribute *attr, char *buf) \
1441         { \
1442             return _func(buf); \
1443         } \
1444         static struct kobj_attribute zcache_##_name##_attr = { \
1445                 .attr = { .name = __stringify(_name), .mode = 0444 }, \
1446                 .show = zcache_##_name##_show, \
1447         }
1448
1449 ZCACHE_SYSFS_RO(curr_obj_count_max);
1450 ZCACHE_SYSFS_RO(curr_objnode_count_max);
1451 ZCACHE_SYSFS_RO(flush_total);
1452 ZCACHE_SYSFS_RO(flush_found);
1453 ZCACHE_SYSFS_RO(flobj_total);
1454 ZCACHE_SYSFS_RO(flobj_found);
1455 ZCACHE_SYSFS_RO(failed_eph_puts);
1456 ZCACHE_SYSFS_RO(failed_pers_puts);
1457 ZCACHE_SYSFS_RO(zbud_curr_zbytes);
1458 ZCACHE_SYSFS_RO(zbud_cumul_zpages);
1459 ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
1460 ZCACHE_SYSFS_RO(zbud_buddied_count);
1461 ZCACHE_SYSFS_RO(zbpg_unused_list_count);
1462 ZCACHE_SYSFS_RO(evicted_raw_pages);
1463 ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
1464 ZCACHE_SYSFS_RO(evicted_buddied_pages);
1465 ZCACHE_SYSFS_RO(failed_get_free_pages);
1466 ZCACHE_SYSFS_RO(failed_alloc);
1467 ZCACHE_SYSFS_RO(put_to_flush);
1468 ZCACHE_SYSFS_RO(compress_poor);
1469 ZCACHE_SYSFS_RO(mean_compress_poor);
1470 ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
1471 ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
1472 ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
1473 ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);
1474 ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
1475                         zbud_show_unbuddied_list_counts);
1476 ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
1477                         zbud_show_cumul_chunk_counts);
1478 ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts,
1479                         zv_curr_dist_counts_show);
1480 ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts,
1481                         zv_cumul_dist_counts_show);
1482
1483 static struct attribute *zcache_attrs[] = {
1484         &zcache_curr_obj_count_attr.attr,
1485         &zcache_curr_obj_count_max_attr.attr,
1486         &zcache_curr_objnode_count_attr.attr,
1487         &zcache_curr_objnode_count_max_attr.attr,
1488         &zcache_flush_total_attr.attr,
1489         &zcache_flobj_total_attr.attr,
1490         &zcache_flush_found_attr.attr,
1491         &zcache_flobj_found_attr.attr,
1492         &zcache_failed_eph_puts_attr.attr,
1493         &zcache_failed_pers_puts_attr.attr,
1494         &zcache_compress_poor_attr.attr,
1495         &zcache_mean_compress_poor_attr.attr,
1496         &zcache_zbud_curr_raw_pages_attr.attr,
1497         &zcache_zbud_curr_zpages_attr.attr,
1498         &zcache_zbud_curr_zbytes_attr.attr,
1499         &zcache_zbud_cumul_zpages_attr.attr,
1500         &zcache_zbud_cumul_zbytes_attr.attr,
1501         &zcache_zbud_buddied_count_attr.attr,
1502         &zcache_zbpg_unused_list_count_attr.attr,
1503         &zcache_evicted_raw_pages_attr.attr,
1504         &zcache_evicted_unbuddied_pages_attr.attr,
1505         &zcache_evicted_buddied_pages_attr.attr,
1506         &zcache_failed_get_free_pages_attr.attr,
1507         &zcache_failed_alloc_attr.attr,
1508         &zcache_put_to_flush_attr.attr,
1509         &zcache_zbud_unbuddied_list_counts_attr.attr,
1510         &zcache_zbud_cumul_chunk_counts_attr.attr,
1511         &zcache_zv_curr_dist_counts_attr.attr,
1512         &zcache_zv_cumul_dist_counts_attr.attr,
1513         &zcache_zv_max_zsize_attr.attr,
1514         &zcache_zv_max_mean_zsize_attr.attr,
1515         &zcache_zv_page_count_policy_percent_attr.attr,
1516         NULL,
1517 };
1518
1519 static struct attribute_group zcache_attr_group = {
1520         .attrs = zcache_attrs,
1521         .name = "zcache",
1522 };
1523
1524 #endif /* CONFIG_SYSFS */
1525 /*
1526  * When zcache is disabled ("frozen"), pools can be created and destroyed,
1527  * but all puts (and thus all other operations that require memory allocation)
1528  * must fail.  If zcache is unfrozen, accepts puts, then frozen again,
1529  * data consistency requires all puts while frozen to be converted into
1530  * flushes.
1531  */
1532 static bool zcache_freeze;
1533
1534 /*
1535  * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
1536  */
1537 static int shrink_zcache_memory(struct shrinker *shrink,
1538                                 struct shrink_control *sc)
1539 {
1540         int ret = -1;
1541         int nr = sc->nr_to_scan;
1542         gfp_t gfp_mask = sc->gfp_mask;
1543
1544         if (nr >= 0) {
1545                 if (!(gfp_mask & __GFP_FS))
1546                         /* does this case really need to be skipped? */
1547                         goto out;
1548                 zbud_evict_pages(nr);
1549         }
1550         ret = (int)atomic_read(&zcache_zbud_curr_raw_pages);
1551 out:
1552         return ret;
1553 }
1554
1555 static struct shrinker zcache_shrinker = {
1556         .shrink = shrink_zcache_memory,
1557         .seeks = DEFAULT_SEEKS,
1558 };
1559
1560 /*
1561  * zcache shims between cleancache/frontswap ops and tmem
1562  */
1563
1564 static int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp,
1565                                 uint32_t index, struct page *page)
1566 {
1567         struct tmem_pool *pool;
1568         int ret = -1;
1569
1570         BUG_ON(!irqs_disabled());
1571         pool = zcache_get_pool_by_id(cli_id, pool_id);
1572         if (unlikely(pool == NULL))
1573                 goto out;
1574         if (!zcache_freeze && zcache_do_preload(pool) == 0) {
1575                 /* preload does preempt_disable on success */
1576                 ret = tmem_put(pool, oidp, index, (char *)(page),
1577                                 PAGE_SIZE, 0, is_ephemeral(pool));
1578                 if (ret < 0) {
1579                         if (is_ephemeral(pool))
1580                                 zcache_failed_eph_puts++;
1581                         else
1582                                 zcache_failed_pers_puts++;
1583                 }
1584                 zcache_put_pool(pool);
1585                 preempt_enable_no_resched();
1586         } else {
1587                 zcache_put_to_flush++;
1588                 if (atomic_read(&pool->obj_count) > 0)
1589                         /* the put fails whether the flush succeeds or not */
1590                         (void)tmem_flush_page(pool, oidp, index);
1591                 zcache_put_pool(pool);
1592         }
1593 out:
1594         return ret;
1595 }
1596
1597 static int zcache_get_page(int cli_id, int pool_id, struct tmem_oid *oidp,
1598                                 uint32_t index, struct page *page)
1599 {
1600         struct tmem_pool *pool;
1601         int ret = -1;
1602         unsigned long flags;
1603         size_t size = PAGE_SIZE;
1604
1605         local_irq_save(flags);
1606         pool = zcache_get_pool_by_id(cli_id, pool_id);
1607         if (likely(pool != NULL)) {
1608                 if (atomic_read(&pool->obj_count) > 0)
1609                         ret = tmem_get(pool, oidp, index, (char *)(page),
1610                                         &size, 0, is_ephemeral(pool));
1611                 zcache_put_pool(pool);
1612         }
1613         local_irq_restore(flags);
1614         return ret;
1615 }
1616
1617 static int zcache_flush_page(int cli_id, int pool_id,
1618                                 struct tmem_oid *oidp, uint32_t index)
1619 {
1620         struct tmem_pool *pool;
1621         int ret = -1;
1622         unsigned long flags;
1623
1624         local_irq_save(flags);
1625         zcache_flush_total++;
1626         pool = zcache_get_pool_by_id(cli_id, pool_id);
1627         if (likely(pool != NULL)) {
1628                 if (atomic_read(&pool->obj_count) > 0)
1629                         ret = tmem_flush_page(pool, oidp, index);
1630                 zcache_put_pool(pool);
1631         }
1632         if (ret >= 0)
1633                 zcache_flush_found++;
1634         local_irq_restore(flags);
1635         return ret;
1636 }
1637
1638 static int zcache_flush_object(int cli_id, int pool_id,
1639                                 struct tmem_oid *oidp)
1640 {
1641         struct tmem_pool *pool;
1642         int ret = -1;
1643         unsigned long flags;
1644
1645         local_irq_save(flags);
1646         zcache_flobj_total++;
1647         pool = zcache_get_pool_by_id(cli_id, pool_id);
1648         if (likely(pool != NULL)) {
1649                 if (atomic_read(&pool->obj_count) > 0)
1650                         ret = tmem_flush_object(pool, oidp);
1651                 zcache_put_pool(pool);
1652         }
1653         if (ret >= 0)
1654                 zcache_flobj_found++;
1655         local_irq_restore(flags);
1656         return ret;
1657 }
1658
1659 static int zcache_destroy_pool(int cli_id, int pool_id)
1660 {
1661         struct tmem_pool *pool = NULL;
1662         struct zcache_client *cli = NULL;
1663         int ret = -1;
1664
1665         if (pool_id < 0)
1666                 goto out;
1667         if (cli_id == LOCAL_CLIENT)
1668                 cli = &zcache_host;
1669         else if ((unsigned int)cli_id < MAX_CLIENTS)
1670                 cli = &zcache_clients[cli_id];
1671         if (cli == NULL)
1672                 goto out;
1673         atomic_inc(&cli->refcount);
1674         pool = idr_find(&cli->tmem_pools, pool_id);
1675         if (pool == NULL)
1676                 goto out;
1677         idr_remove(&cli->tmem_pools, pool_id);
1678         /* wait for pool activity on other cpus to quiesce */
1679         while (atomic_read(&pool->refcount) != 0)
1680                 ;
1681         atomic_dec(&cli->refcount);
1682         local_bh_disable();
1683         ret = tmem_destroy_pool(pool);
1684         local_bh_enable();
1685         kfree(pool);
1686         pr_info("zcache: destroyed pool id=%d, cli_id=%d\n",
1687                         pool_id, cli_id);
1688 out:
1689         return ret;
1690 }
1691
1692 static int zcache_new_pool(uint16_t cli_id, uint32_t flags)
1693 {
1694         int poolid = -1;
1695         struct tmem_pool *pool;
1696         struct zcache_client *cli = NULL;
1697         int r;
1698
1699         if (cli_id == LOCAL_CLIENT)
1700                 cli = &zcache_host;
1701         else if ((unsigned int)cli_id < MAX_CLIENTS)
1702                 cli = &zcache_clients[cli_id];
1703         if (cli == NULL)
1704                 goto out;
1705         atomic_inc(&cli->refcount);
1706         pool = kmalloc(sizeof(struct tmem_pool), GFP_ATOMIC);
1707         if (pool == NULL) {
1708                 pr_info("zcache: pool creation failed: out of memory\n");
1709                 goto out;
1710         }
1711
1712         do {
1713                 r = idr_pre_get(&cli->tmem_pools, GFP_ATOMIC);
1714                 if (r != 1) {
1715                         kfree(pool);
1716                         pr_info("zcache: pool creation failed: out of memory\n");
1717                         goto out;
1718                 }
1719                 r = idr_get_new(&cli->tmem_pools, pool, &poolid);
1720         } while (r == -EAGAIN);
1721         if (r) {
1722                 pr_info("zcache: pool creation failed: error %d\n", r);
1723                 kfree(pool);
1724                 goto out;
1725         }
1726
1727         atomic_set(&pool->refcount, 0);
1728         pool->client = cli;
1729         pool->pool_id = poolid;
1730         tmem_new_pool(pool, flags);
1731         pr_info("zcache: created %s tmem pool, id=%d, client=%d\n",
1732                 flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
1733                 poolid, cli_id);
1734 out:
1735         if (cli != NULL)
1736                 atomic_dec(&cli->refcount);
1737         return poolid;
1738 }
1739
1740 /**********
1741  * Two kernel functionalities currently can be layered on top of tmem.
1742  * These are "cleancache" which is used as a second-chance cache for clean
1743  * page cache pages; and "frontswap" which is used for swap pages
1744  * to avoid writes to disk.  A generic "shim" is provided here for each
1745  * to translate in-kernel semantics to zcache semantics.
1746  */
1747
1748 #ifdef CONFIG_CLEANCACHE
1749 static void zcache_cleancache_put_page(int pool_id,
1750                                         struct cleancache_filekey key,
1751                                         pgoff_t index, struct page *page)
1752 {
1753         u32 ind = (u32) index;
1754         struct tmem_oid oid = *(struct tmem_oid *)&key;
1755
1756         if (likely(ind == index))
1757                 (void)zcache_put_page(LOCAL_CLIENT, pool_id, &oid, index, page);
1758 }
1759
1760 static int zcache_cleancache_get_page(int pool_id,
1761                                         struct cleancache_filekey key,
1762                                         pgoff_t index, struct page *page)
1763 {
1764         u32 ind = (u32) index;
1765         struct tmem_oid oid = *(struct tmem_oid *)&key;
1766         int ret = -1;
1767
1768         if (likely(ind == index))
1769                 ret = zcache_get_page(LOCAL_CLIENT, pool_id, &oid, index, page);
1770         return ret;
1771 }
1772
1773 static void zcache_cleancache_flush_page(int pool_id,
1774                                         struct cleancache_filekey key,
1775                                         pgoff_t index)
1776 {
1777         u32 ind = (u32) index;
1778         struct tmem_oid oid = *(struct tmem_oid *)&key;
1779
1780         if (likely(ind == index))
1781                 (void)zcache_flush_page(LOCAL_CLIENT, pool_id, &oid, ind);
1782 }
1783
1784 static void zcache_cleancache_flush_inode(int pool_id,
1785                                         struct cleancache_filekey key)
1786 {
1787         struct tmem_oid oid = *(struct tmem_oid *)&key;
1788
1789         (void)zcache_flush_object(LOCAL_CLIENT, pool_id, &oid);
1790 }
1791
1792 static void zcache_cleancache_flush_fs(int pool_id)
1793 {
1794         if (pool_id >= 0)
1795                 (void)zcache_destroy_pool(LOCAL_CLIENT, pool_id);
1796 }
1797
1798 static int zcache_cleancache_init_fs(size_t pagesize)
1799 {
1800         BUG_ON(sizeof(struct cleancache_filekey) !=
1801                                 sizeof(struct tmem_oid));
1802         BUG_ON(pagesize != PAGE_SIZE);
1803         return zcache_new_pool(LOCAL_CLIENT, 0);
1804 }
1805
1806 static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
1807 {
1808         /* shared pools are unsupported and map to private */
1809         BUG_ON(sizeof(struct cleancache_filekey) !=
1810                                 sizeof(struct tmem_oid));
1811         BUG_ON(pagesize != PAGE_SIZE);
1812         return zcache_new_pool(LOCAL_CLIENT, 0);
1813 }
1814
1815 static struct cleancache_ops zcache_cleancache_ops = {
1816         .put_page = zcache_cleancache_put_page,
1817         .get_page = zcache_cleancache_get_page,
1818         .invalidate_page = zcache_cleancache_flush_page,
1819         .invalidate_inode = zcache_cleancache_flush_inode,
1820         .invalidate_fs = zcache_cleancache_flush_fs,
1821         .init_shared_fs = zcache_cleancache_init_shared_fs,
1822         .init_fs = zcache_cleancache_init_fs
1823 };
1824
1825 struct cleancache_ops zcache_cleancache_register_ops(void)
1826 {
1827         struct cleancache_ops old_ops =
1828                 cleancache_register_ops(&zcache_cleancache_ops);
1829
1830         return old_ops;
1831 }
1832 #endif
1833
1834 #ifdef CONFIG_FRONTSWAP
1835 /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
1836 static int zcache_frontswap_poolid = -1;
1837
1838 /*
1839  * Swizzling increases objects per swaptype, increasing tmem concurrency
1840  * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
1841  * Setting SWIZ_BITS to 27 basically reconstructs the swap entry from
1842  * frontswap_get_page(), but has side-effects. Hence using 8.
1843  */
1844 #define SWIZ_BITS               8
1845 #define SWIZ_MASK               ((1 << SWIZ_BITS) - 1)
1846 #define _oswiz(_type, _ind)     ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
1847 #define iswiz(_ind)             (_ind >> SWIZ_BITS)
1848
1849 static inline struct tmem_oid oswiz(unsigned type, u32 ind)
1850 {
1851         struct tmem_oid oid = { .oid = { 0 } };
1852         oid.oid[0] = _oswiz(type, ind);
1853         return oid;
1854 }
1855
1856 static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
1857                                    struct page *page)
1858 {
1859         u64 ind64 = (u64)offset;
1860         u32 ind = (u32)offset;
1861         struct tmem_oid oid = oswiz(type, ind);
1862         int ret = -1;
1863         unsigned long flags;
1864
1865         BUG_ON(!PageLocked(page));
1866         if (likely(ind64 == ind)) {
1867                 local_irq_save(flags);
1868                 ret = zcache_put_page(LOCAL_CLIENT, zcache_frontswap_poolid,
1869                                         &oid, iswiz(ind), page);
1870                 local_irq_restore(flags);
1871         }
1872         return ret;
1873 }
1874
1875 /* returns 0 if the page was successfully gotten from frontswap, -1 if
1876  * was not present (should never happen!) */
1877 static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
1878                                    struct page *page)
1879 {
1880         u64 ind64 = (u64)offset;
1881         u32 ind = (u32)offset;
1882         struct tmem_oid oid = oswiz(type, ind);
1883         int ret = -1;
1884
1885         BUG_ON(!PageLocked(page));
1886         if (likely(ind64 == ind))
1887                 ret = zcache_get_page(LOCAL_CLIENT, zcache_frontswap_poolid,
1888                                         &oid, iswiz(ind), page);
1889         return ret;
1890 }
1891
1892 /* flush a single page from frontswap */
1893 static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)
1894 {
1895         u64 ind64 = (u64)offset;
1896         u32 ind = (u32)offset;
1897         struct tmem_oid oid = oswiz(type, ind);
1898
1899         if (likely(ind64 == ind))
1900                 (void)zcache_flush_page(LOCAL_CLIENT, zcache_frontswap_poolid,
1901                                         &oid, iswiz(ind));
1902 }
1903
1904 /* flush all pages from the passed swaptype */
1905 static void zcache_frontswap_flush_area(unsigned type)
1906 {
1907         struct tmem_oid oid;
1908         int ind;
1909
1910         for (ind = SWIZ_MASK; ind >= 0; ind--) {
1911                 oid = oswiz(type, ind);
1912                 (void)zcache_flush_object(LOCAL_CLIENT,
1913                                                 zcache_frontswap_poolid, &oid);
1914         }
1915 }
1916
1917 static void zcache_frontswap_init(unsigned ignored)
1918 {
1919         /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
1920         if (zcache_frontswap_poolid < 0)
1921                 zcache_frontswap_poolid =
1922                         zcache_new_pool(LOCAL_CLIENT, TMEM_POOL_PERSIST);
1923 }
1924
1925 static struct frontswap_ops zcache_frontswap_ops = {
1926         .put_page = zcache_frontswap_put_page,
1927         .get_page = zcache_frontswap_get_page,
1928         .invalidate_page = zcache_frontswap_flush_page,
1929         .invalidate_area = zcache_frontswap_flush_area,
1930         .init = zcache_frontswap_init
1931 };
1932
1933 struct frontswap_ops zcache_frontswap_register_ops(void)
1934 {
1935         struct frontswap_ops old_ops =
1936                 frontswap_register_ops(&zcache_frontswap_ops);
1937
1938         return old_ops;
1939 }
1940 #endif
1941
1942 /*
1943  * zcache initialization
1944  * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR
1945  * NOTHING HAPPENS!
1946  */
1947
1948 static int zcache_enabled;
1949
1950 static int __init enable_zcache(char *s)
1951 {
1952         zcache_enabled = 1;
1953         return 1;
1954 }
1955 __setup("zcache", enable_zcache);
1956
1957 /* allow independent dynamic disabling of cleancache and frontswap */
1958
1959 static int use_cleancache = 1;
1960
1961 static int __init no_cleancache(char *s)
1962 {
1963         use_cleancache = 0;
1964         return 1;
1965 }
1966
1967 __setup("nocleancache", no_cleancache);
1968
1969 static int use_frontswap = 1;
1970
1971 static int __init no_frontswap(char *s)
1972 {
1973         use_frontswap = 0;
1974         return 1;
1975 }
1976
1977 __setup("nofrontswap", no_frontswap);
1978
1979 static int __init enable_zcache_compressor(char *s)
1980 {
1981         strncpy(zcache_comp_name, s, ZCACHE_COMP_NAME_SZ);
1982         zcache_enabled = 1;
1983         return 1;
1984 }
1985 __setup("zcache=", enable_zcache_compressor);
1986
1987
1988 static int zcache_comp_init(void)
1989 {
1990         int ret = 0;
1991
1992         /* check crypto algorithm */
1993         if (*zcache_comp_name != '\0') {
1994                 ret = crypto_has_comp(zcache_comp_name, 0, 0);
1995                 if (!ret)
1996                         pr_info("zcache: %s not supported\n",
1997                                         zcache_comp_name);
1998         }
1999         if (!ret)
2000                 strcpy(zcache_comp_name, "lzo");
2001         ret = crypto_has_comp(zcache_comp_name, 0, 0);
2002         if (!ret) {
2003                 ret = 1;
2004                 goto out;
2005         }
2006         pr_info("zcache: using %s compressor\n", zcache_comp_name);
2007
2008         /* alloc percpu transforms */
2009         ret = 0;
2010         zcache_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
2011         if (!zcache_comp_pcpu_tfms)
2012                 ret = 1;
2013 out:
2014         return ret;
2015 }
2016
2017 static int __init zcache_init(void)
2018 {
2019         int ret = 0;
2020
2021 #ifdef CONFIG_SYSFS
2022         ret = sysfs_create_group(mm_kobj, &zcache_attr_group);
2023         if (ret) {
2024                 pr_err("zcache: can't create sysfs\n");
2025                 goto out;
2026         }
2027 #endif /* CONFIG_SYSFS */
2028 #if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP)
2029         if (zcache_enabled) {
2030                 unsigned int cpu;
2031
2032                 tmem_register_hostops(&zcache_hostops);
2033                 tmem_register_pamops(&zcache_pamops);
2034                 ret = register_cpu_notifier(&zcache_cpu_notifier_block);
2035                 if (ret) {
2036                         pr_err("zcache: can't register cpu notifier\n");
2037                         goto out;
2038                 }
2039                 ret = zcache_comp_init();
2040                 if (ret) {
2041                         pr_err("zcache: compressor initialization failed\n");
2042                         goto out;
2043                 }
2044                 for_each_online_cpu(cpu) {
2045                         void *pcpu = (void *)(long)cpu;
2046                         zcache_cpu_notifier(&zcache_cpu_notifier_block,
2047                                 CPU_UP_PREPARE, pcpu);
2048                 }
2049         }
2050         zcache_objnode_cache = kmem_cache_create("zcache_objnode",
2051                                 sizeof(struct tmem_objnode), 0, 0, NULL);
2052         zcache_obj_cache = kmem_cache_create("zcache_obj",
2053                                 sizeof(struct tmem_obj), 0, 0, NULL);
2054         ret = zcache_new_client(LOCAL_CLIENT);
2055         if (ret) {
2056                 pr_err("zcache: can't create client\n");
2057                 goto out;
2058         }
2059 #endif
2060 #ifdef CONFIG_CLEANCACHE
2061         if (zcache_enabled && use_cleancache) {
2062                 struct cleancache_ops old_ops;
2063
2064                 zbud_init();
2065                 register_shrinker(&zcache_shrinker);
2066                 old_ops = zcache_cleancache_register_ops();
2067                 pr_info("zcache: cleancache enabled using kernel "
2068                         "transcendent memory and compression buddies\n");
2069                 if (old_ops.init_fs != NULL)
2070                         pr_warning("zcache: cleancache_ops overridden");
2071         }
2072 #endif
2073 #ifdef CONFIG_FRONTSWAP
2074         if (zcache_enabled && use_frontswap) {
2075                 struct frontswap_ops old_ops;
2076
2077                 old_ops = zcache_frontswap_register_ops();
2078                 pr_info("zcache: frontswap enabled using kernel "
2079                         "transcendent memory and zsmalloc\n");
2080                 if (old_ops.init != NULL)
2081                         pr_warning("zcache: frontswap_ops overridden");
2082         }
2083 #endif
2084 out:
2085         return ret;
2086 }
2087
2088 module_init(zcache_init)