staging: zcache: don't limit number of pools per client
drivers/staging/zcache/zcache-main.c
1 /*
2  * zcache.c
3  *
4  * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp.
5  * Copyright (c) 2010,2011, Nitin Gupta
6  *
7  * Zcache provides an in-kernel "host implementation" for transcendent memory
8  * and, thus indirectly, for cleancache and frontswap.  Zcache includes two
9  * page-accessible memory [1] interfaces, both utilizing the crypto compression
10  * API:
11  * 1) "compression buddies" ("zbud") is used for ephemeral pages
12  * 2) zsmalloc is used for persistent pages.
13  * Zsmalloc has very low fragmentation so maximizes space efficiency,
14  * while zbud allows pairs (and potentially,
15  * in the future, more than a pair of) compressed pages to be closely linked
16  * so that reclaiming can be done via the kernel's physical-page-oriented
17  * "shrinker" interface.
18  *
19  * [1] For a definition of page-accessible memory (aka PAM), see:
20  *   http://marc.info/?l=linux-mm&m=127811271605009
21  */
22
23 #include <linux/module.h>
24 #include <linux/cpu.h>
25 #include <linux/highmem.h>
26 #include <linux/list.h>
27 #include <linux/slab.h>
28 #include <linux/spinlock.h>
29 #include <linux/types.h>
30 #include <linux/atomic.h>
31 #include <linux/math64.h>
32 #include <linux/crypto.h>
33 #include <linux/string.h>
34 #include <linux/idr.h>
35 #include "tmem.h"
36
37 #include "../zsmalloc/zsmalloc.h"
38
39 #if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))
40 #error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP"
41 #endif
42 #ifdef CONFIG_CLEANCACHE
43 #include <linux/cleancache.h>
44 #endif
45 #ifdef CONFIG_FRONTSWAP
46 #include <linux/frontswap.h>
47 #endif
48
49 #if 0
50 /* this is more aggressive but may cause other problems? */
51 #define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
52 #else
53 #define ZCACHE_GFP_MASK \
54         (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
55 #endif
56
57 #define MAX_CLIENTS 16
58 #define LOCAL_CLIENT ((uint16_t)-1)
59
60 MODULE_LICENSE("GPL");
61
62 struct zcache_client {
63         struct idr tmem_pools;
64         struct zs_pool *zspool;
65         bool allocated;
66         atomic_t refcount;
67 };
68
69 static struct zcache_client zcache_host;
70 static struct zcache_client zcache_clients[MAX_CLIENTS];
71
72 static inline uint16_t get_client_id_from_client(struct zcache_client *cli)
73 {
74         BUG_ON(cli == NULL);
75         if (cli == &zcache_host)
76                 return LOCAL_CLIENT;
77         return cli - &zcache_clients[0];
78 }
79
80 static inline bool is_local_client(struct zcache_client *cli)
81 {
82         return cli == &zcache_host;
83 }
84
85 /* crypto API for zcache  */
86 #define ZCACHE_COMP_NAME_SZ CRYPTO_MAX_ALG_NAME
87 static char zcache_comp_name[ZCACHE_COMP_NAME_SZ];
88 static struct crypto_comp * __percpu *zcache_comp_pcpu_tfms;
89
90 enum comp_op {
91         ZCACHE_COMPOP_COMPRESS,
92         ZCACHE_COMPOP_DECOMPRESS
93 };
94
95 static inline int zcache_comp_op(enum comp_op op,
96                                 const u8 *src, unsigned int slen,
97                                 u8 *dst, unsigned int *dlen)
98 {
99         struct crypto_comp *tfm;
100         int ret;
101
102         BUG_ON(!zcache_comp_pcpu_tfms);
103         tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, get_cpu());
104         BUG_ON(!tfm);
105         switch (op) {
106         case ZCACHE_COMPOP_COMPRESS:
107                 ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
108                 break;
109         case ZCACHE_COMPOP_DECOMPRESS:
110                 ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
111                 break;
112         }
113         put_cpu();
114         return ret;
115 }
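/*
 * For illustration, a typical decompression call (as in zbud_decompress()
 * and zv_decompress() below) sets *dlen to the destination capacity and
 * reads back the actual output length:
 *
 *	unsigned int out_len = PAGE_SIZE;
 *	ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, src, srclen,
 *				dst, &out_len);
 *
 * get_cpu()/put_cpu() pin the per-cpu tfm for the duration of the call,
 * so preemption is disabled while the (de)compression runs.
 */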
116
117 /**********
118  * Compression buddies ("zbud") provides for packing two (or, possibly
119  * in the future, more) compressed ephemeral pages into a single "raw"
120  * (physical) page and tracking them with data structures so that
121  * the raw pages can be easily reclaimed.
122  *
123  * A zbud page ("zbpg") is an aligned page containing a list_head,
124  * a lock, and two "zbud headers".  The remainder of the physical
125  * page is divided up into aligned 64-byte "chunks" which contain
126  * the compressed data for zero, one, or two zbuds.  Each zbpg
127  * resides on: (1) an "unused list" if it has no zbuds; (2) a
128  * "buddied" list if it is fully populated with two zbuds; or
129  * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks
130  * the one unbuddied zbud uses.  The data inside a zbpg cannot be
131  * read or written unless the zbpg's lock is held.
132  */
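/*
 * Worked example (illustrative): a zbpg whose single zbud holds 1000
 * bytes of compressed data occupies zbud_size_to_chunks(1000) =
 * DIV_ROUND_UP(1000, 64) = 16 chunks, so the page sits on
 * zbud_unbuddied[16]; once a second zbud is packed into it, it moves to
 * zbud_buddied_list, and when both zbuds are freed it lands on
 * zbpg_unused_list for reuse or reclaim.
 */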
133
134 #define ZBH_SENTINEL  0x43214321
135 #define ZBPG_SENTINEL  0xdeadbeef
136
137 #define ZBUD_MAX_BUDS 2
138
139 struct zbud_hdr {
140         uint16_t client_id;
141         uint16_t pool_id;
142         struct tmem_oid oid;
143         uint32_t index;
144         uint16_t size; /* compressed size in bytes, zero means unused */
145         DECL_SENTINEL
146 };
147
148 struct zbud_page {
149         struct list_head bud_list;
150         spinlock_t lock;
151         struct zbud_hdr buddy[ZBUD_MAX_BUDS];
152         DECL_SENTINEL
153         /* followed by NCHUNKS aligned CHUNK_SIZE-byte chunks */
154 };
155
156 #define CHUNK_SHIFT     6
157 #define CHUNK_SIZE      (1 << CHUNK_SHIFT)
158 #define CHUNK_MASK      (~(CHUNK_SIZE-1))
159 #define NCHUNKS         (((PAGE_SIZE - sizeof(struct zbud_page)) & \
160                                 CHUNK_MASK) >> CHUNK_SHIFT)
161 #define MAX_CHUNK       (NCHUNKS-1)
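/*
 * For a sense of scale (assuming a 4KiB PAGE_SIZE and a struct zbud_page
 * of roughly a hundred bytes, which varies by config): NCHUNKS works out
 * to about 62, so there are about 62 unbuddied lists and a single zbud
 * can hold at most MAX_CHUNK << CHUNK_SHIFT, i.e. roughly 3900 bytes of
 * compressed data.
 */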
162
163 static struct {
164         struct list_head list;
165         unsigned count;
166 } zbud_unbuddied[NCHUNKS];
167 /* list N contains pages with N chunks USED and NCHUNKS-N unused */
168 /* element 0 is never used but optimizing that isn't worth it */
169 static unsigned long zbud_cumul_chunk_counts[NCHUNKS];
170
171 struct list_head zbud_buddied_list;
172 static unsigned long zcache_zbud_buddied_count;
173
174 /* protects the buddied list and all unbuddied lists */
175 static DEFINE_SPINLOCK(zbud_budlists_spinlock);
176
177 static LIST_HEAD(zbpg_unused_list);
178 static unsigned long zcache_zbpg_unused_list_count;
179
180 /* protects the unused page list */
181 static DEFINE_SPINLOCK(zbpg_unused_list_spinlock);
182
183 static atomic_t zcache_zbud_curr_raw_pages;
184 static atomic_t zcache_zbud_curr_zpages;
185 static unsigned long zcache_zbud_curr_zbytes;
186 static unsigned long zcache_zbud_cumul_zpages;
187 static unsigned long zcache_zbud_cumul_zbytes;
188 static unsigned long zcache_compress_poor;
189 static unsigned long zcache_mean_compress_poor;
190
191 /* forward references */
192 static void *zcache_get_free_page(void);
193 static void zcache_free_page(void *p);
194
195 /*
196  * zbud helper functions
197  */
198
199 static inline unsigned zbud_max_buddy_size(void)
200 {
201         return MAX_CHUNK << CHUNK_SHIFT;
202 }
203
204 static inline unsigned zbud_size_to_chunks(unsigned size)
205 {
206         BUG_ON(size == 0 || size > zbud_max_buddy_size());
207         return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
208 }
209
210 static inline int zbud_budnum(struct zbud_hdr *zh)
211 {
212         unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1);
213         struct zbud_page *zbpg = NULL;
214         unsigned budnum = -1U;
215         int i;
216
217         for (i = 0; i < ZBUD_MAX_BUDS; i++)
218                 if (offset == offsetof(typeof(*zbpg), buddy[i])) {
219                         budnum = i;
220                         break;
221                 }
222         BUG_ON(budnum == -1U);
223         return budnum;
224 }
225
226 static char *zbud_data(struct zbud_hdr *zh, unsigned size)
227 {
228         struct zbud_page *zbpg;
229         char *p;
230         unsigned budnum;
231
232         ASSERT_SENTINEL(zh, ZBH);
233         budnum = zbud_budnum(zh);
234         BUG_ON(size == 0 || size > zbud_max_buddy_size());
235         zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
236         ASSERT_SPINLOCK(&zbpg->lock);
237         p = (char *)zbpg;
238         if (budnum == 0)
239                 p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
240                                                         CHUNK_MASK);
241         else if (budnum == 1)
242                 p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
243         return p;
244 }
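/*
 * In other words: buddy[0]'s data starts at the first CHUNK_SIZE boundary
 * past the struct zbud_page header and grows upward, while buddy[1]'s
 * data occupies the last zbud_size_to_chunks(size) chunks of the page,
 * leaving any free chunks in the middle:
 *
 *	| zbud_page hdr | buddy[0] data ... | free ... | ... buddy[1] data |
 */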
245
246 /*
247  * zbud raw page management
248  */
249
250 static struct zbud_page *zbud_alloc_raw_page(void)
251 {
252         struct zbud_page *zbpg = NULL;
253         struct zbud_hdr *zh0, *zh1;
254         bool recycled = false;
255
256         /* if any pages on the zbpg list, use one */
257         spin_lock(&zbpg_unused_list_spinlock);
258         if (!list_empty(&zbpg_unused_list)) {
259                 zbpg = list_first_entry(&zbpg_unused_list,
260                                 struct zbud_page, bud_list);
261                 list_del_init(&zbpg->bud_list);
262                 zcache_zbpg_unused_list_count--;
263                 recycled = true;
264         }
265         spin_unlock(&zbpg_unused_list_spinlock);
266         if (zbpg == NULL)
267                 /* none on zbpg list, try to get a kernel page */
268                 zbpg = zcache_get_free_page();
269         if (likely(zbpg != NULL)) {
270                 INIT_LIST_HEAD(&zbpg->bud_list);
271                 zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
272                 spin_lock_init(&zbpg->lock);
273                 if (recycled) {
274                         ASSERT_INVERTED_SENTINEL(zbpg, ZBPG);
275                         SET_SENTINEL(zbpg, ZBPG);
276                         BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
277                         BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
278                 } else {
279                         atomic_inc(&zcache_zbud_curr_raw_pages);
280                         INIT_LIST_HEAD(&zbpg->bud_list);
281                         SET_SENTINEL(zbpg, ZBPG);
282                         zh0->size = 0; zh1->size = 0;
283                         tmem_oid_set_invalid(&zh0->oid);
284                         tmem_oid_set_invalid(&zh1->oid);
285                 }
286         }
287         return zbpg;
288 }
289
290 static void zbud_free_raw_page(struct zbud_page *zbpg)
291 {
292         struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1];
293
294         ASSERT_SENTINEL(zbpg, ZBPG);
295         BUG_ON(!list_empty(&zbpg->bud_list));
296         ASSERT_SPINLOCK(&zbpg->lock);
297         BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
298         BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
299         INVERT_SENTINEL(zbpg, ZBPG);
300         spin_unlock(&zbpg->lock);
301         spin_lock(&zbpg_unused_list_spinlock);
302         list_add(&zbpg->bud_list, &zbpg_unused_list);
303         zcache_zbpg_unused_list_count++;
304         spin_unlock(&zbpg_unused_list_spinlock);
305 }
306
307 /*
308  * core zbud handling routines
309  */
310
311 static unsigned zbud_free(struct zbud_hdr *zh)
312 {
313         unsigned size;
314
315         ASSERT_SENTINEL(zh, ZBH);
316         BUG_ON(!tmem_oid_valid(&zh->oid));
317         size = zh->size;
318         BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
319         zh->size = 0;
320         tmem_oid_set_invalid(&zh->oid);
321         INVERT_SENTINEL(zh, ZBH);
322         zcache_zbud_curr_zbytes -= size;
323         atomic_dec(&zcache_zbud_curr_zpages);
324         return size;
325 }
326
327 static void zbud_free_and_delist(struct zbud_hdr *zh)
328 {
329         unsigned chunks;
330         struct zbud_hdr *zh_other;
331         unsigned budnum = zbud_budnum(zh), size;
332         struct zbud_page *zbpg =
333                 container_of(zh, struct zbud_page, buddy[budnum]);
334
335         spin_lock(&zbud_budlists_spinlock);
336         spin_lock(&zbpg->lock);
337         if (list_empty(&zbpg->bud_list)) {
338                 /* ignore zombie page... see zbud_evict_pages() */
339                 spin_unlock(&zbpg->lock);
340                 spin_unlock(&zbud_budlists_spinlock);
341                 return;
342         }
343         size = zbud_free(zh);
344         ASSERT_SPINLOCK(&zbpg->lock);
345         zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0];
346         if (zh_other->size == 0) { /* was unbuddied: unlist and free */
347                 chunks = zbud_size_to_chunks(size);
348                 BUG_ON(list_empty(&zbud_unbuddied[chunks].list));
349                 list_del_init(&zbpg->bud_list);
350                 zbud_unbuddied[chunks].count--;
351                 spin_unlock(&zbud_budlists_spinlock);
352                 zbud_free_raw_page(zbpg);
353         } else { /* was buddied: move remaining buddy to unbuddied list */
354                 chunks = zbud_size_to_chunks(zh_other->size);
355                 list_del_init(&zbpg->bud_list);
356                 zcache_zbud_buddied_count--;
357                 list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list);
358                 zbud_unbuddied[chunks].count++;
359                 spin_unlock(&zbud_budlists_spinlock);
360                 spin_unlock(&zbpg->lock);
361         }
362 }
363
364 static struct zbud_hdr *zbud_create(uint16_t client_id, uint16_t pool_id,
365                                         struct tmem_oid *oid,
366                                         uint32_t index, struct page *page,
367                                         void *cdata, unsigned size)
368 {
369         struct zbud_hdr *zh0, *zh1, *zh = NULL;
370         struct zbud_page *zbpg = NULL, *ztmp;
371         unsigned nchunks;
372         char *to;
373         int i, found_good_buddy = 0;
374
375         nchunks = zbud_size_to_chunks(size);
376         for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
377                 spin_lock(&zbud_budlists_spinlock);
378                 if (!list_empty(&zbud_unbuddied[i].list)) {
379                         list_for_each_entry_safe(zbpg, ztmp,
380                                     &zbud_unbuddied[i].list, bud_list) {
381                                 if (spin_trylock(&zbpg->lock)) {
382                                         found_good_buddy = i;
383                                         goto found_unbuddied;
384                                 }
385                         }
386                 }
387                 spin_unlock(&zbud_budlists_spinlock);
388         }
389         /* didn't find a good buddy, try allocating a new page */
390         zbpg = zbud_alloc_raw_page();
391         if (unlikely(zbpg == NULL))
392                 goto out;
393         /* ok, have a new page: take the locks and add it to the unbuddied list */
394         spin_lock(&zbud_budlists_spinlock);
395         spin_lock(&zbpg->lock);
396         list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list);
397         zbud_unbuddied[nchunks].count++;
398         zh = &zbpg->buddy[0];
399         goto init_zh;
400
401 found_unbuddied:
402         ASSERT_SPINLOCK(&zbpg->lock);
403         zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
404         BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0)));
405         if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */
406                 ASSERT_SENTINEL(zh0, ZBH);
407                 zh = zh1;
408         } else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */
409                 ASSERT_SENTINEL(zh1, ZBH);
410                 zh = zh0;
411         } else
412                 BUG();
413         list_del_init(&zbpg->bud_list);
414         zbud_unbuddied[found_good_buddy].count--;
415         list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
416         zcache_zbud_buddied_count++;
417
418 init_zh:
419         SET_SENTINEL(zh, ZBH);
420         zh->size = size;
421         zh->index = index;
422         zh->oid = *oid;
423         zh->pool_id = pool_id;
424         zh->client_id = client_id;
425         to = zbud_data(zh, size);
426         memcpy(to, cdata, size);
427         spin_unlock(&zbpg->lock);
428         spin_unlock(&zbud_budlists_spinlock);
429
430         zbud_cumul_chunk_counts[nchunks]++;
431         atomic_inc(&zcache_zbud_curr_zpages);
432         zcache_zbud_cumul_zpages++;
433         zcache_zbud_curr_zbytes += size;
434         zcache_zbud_cumul_zbytes += size;
435 out:
436         return zh;
437 }
438
439 static int zbud_decompress(struct page *page, struct zbud_hdr *zh)
440 {
441         struct zbud_page *zbpg;
442         unsigned budnum = zbud_budnum(zh);
443         unsigned int out_len = PAGE_SIZE;
444         char *to_va, *from_va;
445         unsigned size;
446         int ret = 0;
447
448         zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
449         spin_lock(&zbpg->lock);
450         if (list_empty(&zbpg->bud_list)) {
451                 /* ignore zombie page... see zbud_evict_pages() */
452                 ret = -EINVAL;
453                 goto out;
454         }
455         ASSERT_SENTINEL(zh, ZBH);
456         BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
457         to_va = kmap_atomic(page);
458         size = zh->size;
459         from_va = zbud_data(zh, size);
460         ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, from_va, size,
461                                 to_va, &out_len);
462         BUG_ON(ret);
463         BUG_ON(out_len != PAGE_SIZE);
464         kunmap_atomic(to_va);
465 out:
466         spin_unlock(&zbpg->lock);
467         return ret;
468 }
469
470 /*
471  * The following routines handle shrinking of ephemeral pages by evicting
472  * pages "least valuable" first.
473  */
474
475 static unsigned long zcache_evicted_raw_pages;
476 static unsigned long zcache_evicted_buddied_pages;
477 static unsigned long zcache_evicted_unbuddied_pages;
478
479 static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id,
480                                                 uint16_t poolid);
481 static void zcache_put_pool(struct tmem_pool *pool);
482
483 /*
484  * Flush and free all zbuds in a zbpg, then free the pageframe
485  */
486 static void zbud_evict_zbpg(struct zbud_page *zbpg)
487 {
488         struct zbud_hdr *zh;
489         int i, j;
490         uint32_t pool_id[ZBUD_MAX_BUDS], client_id[ZBUD_MAX_BUDS];
491         uint32_t index[ZBUD_MAX_BUDS];
492         struct tmem_oid oid[ZBUD_MAX_BUDS];
493         struct tmem_pool *pool;
494
495         ASSERT_SPINLOCK(&zbpg->lock);
496         BUG_ON(!list_empty(&zbpg->bud_list));
497         for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) {
498                 zh = &zbpg->buddy[i];
499                 if (zh->size) {
500                         client_id[j] = zh->client_id;
501                         pool_id[j] = zh->pool_id;
502                         oid[j] = zh->oid;
503                         index[j] = zh->index;
504                         j++;
505                         zbud_free(zh);
506                 }
507         }
508         spin_unlock(&zbpg->lock);
509         for (i = 0; i < j; i++) {
510                 pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
511                 if (pool != NULL) {
512                         tmem_flush_page(pool, &oid[i], index[i]);
513                         zcache_put_pool(pool);
514                 }
515         }
516         ASSERT_SENTINEL(zbpg, ZBPG);
517         spin_lock(&zbpg->lock);
518         zbud_free_raw_page(zbpg);
519 }
520
521 /*
522  * Free nr pages.  This code is funky because we want to hold the locks
523  * protecting various lists for as short a time as possible, and in some
524  * circumstances the list may change asynchronously when the list lock is
525  * not held.  In some cases we also trylock not only to avoid waiting on a
526  * page in use by another cpu, but also to avoid potential deadlock due to
527  * lock inversion.
528  */
529 static void zbud_evict_pages(int nr)
530 {
531         struct zbud_page *zbpg;
532         int i;
533
534         /* first try freeing any pages on unused list */
535 retry_unused_list:
536         spin_lock_bh(&zbpg_unused_list_spinlock);
537         if (!list_empty(&zbpg_unused_list)) {
538                 /* can't walk list here, since it may change when unlocked */
539                 zbpg = list_first_entry(&zbpg_unused_list,
540                                 struct zbud_page, bud_list);
541                 list_del_init(&zbpg->bud_list);
542                 zcache_zbpg_unused_list_count--;
543                 atomic_dec(&zcache_zbud_curr_raw_pages);
544                 spin_unlock_bh(&zbpg_unused_list_spinlock);
545                 zcache_free_page(zbpg);
546                 zcache_evicted_raw_pages++;
547                 if (--nr <= 0)
548                         goto out;
549                 goto retry_unused_list;
550         }
551         spin_unlock_bh(&zbpg_unused_list_spinlock);
552
553         /* now try freeing unbuddied pages, starting with least space avail */
554         for (i = 0; i < MAX_CHUNK; i++) {
555 retry_unbud_list_i:
556                 spin_lock_bh(&zbud_budlists_spinlock);
557                 if (list_empty(&zbud_unbuddied[i].list)) {
558                         spin_unlock_bh(&zbud_budlists_spinlock);
559                         continue;
560                 }
561                 list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
562                         if (unlikely(!spin_trylock(&zbpg->lock)))
563                                 continue;
564                         list_del_init(&zbpg->bud_list);
565                         zbud_unbuddied[i].count--;
566                         spin_unlock(&zbud_budlists_spinlock);
567                         zcache_evicted_unbuddied_pages++;
568                         /* want budlists unlocked when doing zbpg eviction */
569                         zbud_evict_zbpg(zbpg);
570                         local_bh_enable();
571                         if (--nr <= 0)
572                                 goto out;
573                         goto retry_unbud_list_i;
574                 }
575                 spin_unlock_bh(&zbud_budlists_spinlock);
576         }
577
578         /* as a last resort, free buddied pages */
579 retry_bud_list:
580         spin_lock_bh(&zbud_budlists_spinlock);
581         if (list_empty(&zbud_buddied_list)) {
582                 spin_unlock_bh(&zbud_budlists_spinlock);
583                 goto out;
584         }
585         list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
586                 if (unlikely(!spin_trylock(&zbpg->lock)))
587                         continue;
588                 list_del_init(&zbpg->bud_list);
589                 zcache_zbud_buddied_count--;
590                 spin_unlock(&zbud_budlists_spinlock);
591                 zcache_evicted_buddied_pages++;
592                 /* want budlists unlocked when doing zbpg eviction */
593                 zbud_evict_zbpg(zbpg);
594                 local_bh_enable();
595                 if (--nr <= 0)
596                         goto out;
597                 goto retry_bud_list;
598         }
599         spin_unlock_bh(&zbud_budlists_spinlock);
600 out:
601         return;
602 }
603
604 static void zbud_init(void)
605 {
606         int i;
607
608         INIT_LIST_HEAD(&zbud_buddied_list);
609         zcache_zbud_buddied_count = 0;
610         for (i = 0; i < NCHUNKS; i++) {
611                 INIT_LIST_HEAD(&zbud_unbuddied[i].list);
612                 zbud_unbuddied[i].count = 0;
613         }
614 }
615
616 #ifdef CONFIG_SYSFS
617 /*
618  * These sysfs routines show a nice distribution of how many zbpg's are
619  * currently (and have ever been placed) in each unbuddied list.  It's fun
620  * to watch but can probably go away before final merge.
621  */
622 static int zbud_show_unbuddied_list_counts(char *buf)
623 {
624         int i;
625         char *p = buf;
626
627         for (i = 0; i < NCHUNKS; i++)
628                 p += sprintf(p, "%u ", zbud_unbuddied[i].count);
629         return p - buf;
630 }
631
632 static int zbud_show_cumul_chunk_counts(char *buf)
633 {
634         unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0;
635         unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0;
636         unsigned long total_chunks_lte_42 = 0;
637         char *p = buf;
638
639         for (i = 0; i < NCHUNKS; i++) {
640                 p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]);
641                 chunks += zbud_cumul_chunk_counts[i];
642                 total_chunks += zbud_cumul_chunk_counts[i];
643                 sum_total_chunks += i * zbud_cumul_chunk_counts[i];
644                 if (i == 21)
645                         total_chunks_lte_21 = total_chunks;
646                 if (i == 32)
647                         total_chunks_lte_32 = total_chunks;
648                 if (i == 42)
649                         total_chunks_lte_42 = total_chunks;
650         }
651         p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
652                 total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42,
653                 chunks == 0 ? 0 : sum_total_chunks / chunks);
654         return p - buf;
655 }
656 #endif
657
658 /**********
659  * This "zv" PAM implementation combines the slab-based zsmalloc
660  * with the crypto compression API to maximize the amount of data that can
661  * be packed into a physical page.
662  *
663  * Zv represents a PAM page with the index and object (plus a "size" value
664  * necessary for decompression) immediately preceding the compressed data.
665  */
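/*
 * Sketch of a zv allocation as built by zv_create() below: zs_malloc()
 * returns an opaque handle to clen + sizeof(struct zv_hdr) bytes, laid
 * out as
 *
 *	[ struct zv_hdr: pool_id, oid, index, size ][ clen bytes of data ]
 *
 * Only the handle is stored as the tmem pampd; zv_decompress() and
 * zv_free() re-map it with zs_map_object() to reach the header and data.
 */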
666
667 #define ZVH_SENTINEL  0x43214321
668
669 struct zv_hdr {
670         uint32_t pool_id;
671         struct tmem_oid oid;
672         uint32_t index;
673         size_t size;
674         DECL_SENTINEL
675 };
676
677 /* rudimentary policy limits */
678 /* total number of persistent pages may not exceed this percentage */
679 static unsigned int zv_page_count_policy_percent = 75;
680 /*
681  * byte count defining poor compression; pages with greater zsize will be
682  * rejected
683  */
684 static unsigned int zv_max_zsize = (PAGE_SIZE / 8) * 7;
685 /*
686  * byte count defining poor *mean* compression; pages with greater zsize
687  * will be rejected until sufficient better-compressed pages are accepted
688  * driving the mean below this threshold
689  */
690 static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5;
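/*
 * For example, with 4KiB pages these defaults work out to
 * zv_max_zsize = 3584 bytes (7/8 of a page) and zv_max_mean_zsize =
 * 2560 bytes (5/8 of a page): an individual page may compress poorly as
 * long as the running mean stays at or below 2560 bytes; see the checks
 * in zcache_pampd_create().
 */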
691
692 static atomic_t zv_curr_dist_counts[NCHUNKS];
693 static atomic_t zv_cumul_dist_counts[NCHUNKS];
694
695 static unsigned long zv_create(struct zs_pool *pool, uint32_t pool_id,
696                                 struct tmem_oid *oid, uint32_t index,
697                                 void *cdata, unsigned clen)
698 {
699         struct zv_hdr *zv;
700         u32 size = clen + sizeof(struct zv_hdr);
701         int chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
702         unsigned long handle = 0;
703
704         BUG_ON(!irqs_disabled());
705         BUG_ON(chunks >= NCHUNKS);
706         handle = zs_malloc(pool, size);
707         if (!handle)
708                 goto out;
709         atomic_inc(&zv_curr_dist_counts[chunks]);
710         atomic_inc(&zv_cumul_dist_counts[chunks]);
711         zv = zs_map_object(pool, handle);
712         zv->index = index;
713         zv->oid = *oid;
714         zv->pool_id = pool_id;
715         zv->size = clen;
716         SET_SENTINEL(zv, ZVH);
717         memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen);
718         zs_unmap_object(pool, handle);
719 out:
720         return handle;
721 }
722
723 static void zv_free(struct zs_pool *pool, unsigned long handle)
724 {
725         unsigned long flags;
726         struct zv_hdr *zv;
727         uint16_t size;
728         int chunks;
729
730         zv = zs_map_object(pool, handle);
731         ASSERT_SENTINEL(zv, ZVH);
732         size = zv->size + sizeof(struct zv_hdr);
733         INVERT_SENTINEL(zv, ZVH);
734         zs_unmap_object(pool, handle);
735
736         chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
737         BUG_ON(chunks >= NCHUNKS);
738         atomic_dec(&zv_curr_dist_counts[chunks]);
739
740         local_irq_save(flags);
741         zs_free(pool, handle);
742         local_irq_restore(flags);
743 }
744
745 static void zv_decompress(struct page *page, unsigned long handle)
746 {
747         unsigned int clen = PAGE_SIZE;
748         char *to_va;
749         int ret;
750         struct zv_hdr *zv;
751
752         zv = zs_map_object(zcache_host.zspool, handle);
753         BUG_ON(zv->size == 0);
754         ASSERT_SENTINEL(zv, ZVH);
755         to_va = kmap_atomic(page);
756         ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, (char *)zv + sizeof(*zv),
757                                 zv->size, to_va, &clen);
758         kunmap_atomic(to_va);
759         zs_unmap_object(zcache_host.zspool, handle);
760         BUG_ON(ret);
761         BUG_ON(clen != PAGE_SIZE);
762 }
763
764 #ifdef CONFIG_SYSFS
765 /*
766  * show a distribution of compression stats for zv pages.
767  */
768
769 static int zv_curr_dist_counts_show(char *buf)
770 {
771         unsigned long i, n, chunks = 0, sum_total_chunks = 0;
772         char *p = buf;
773
774         for (i = 0; i < NCHUNKS; i++) {
775                 n = atomic_read(&zv_curr_dist_counts[i]);
776                 p += sprintf(p, "%lu ", n);
777                 chunks += n;
778                 sum_total_chunks += i * n;
779         }
780         p += sprintf(p, "mean:%lu\n",
781                 chunks == 0 ? 0 : sum_total_chunks / chunks);
782         return p - buf;
783 }
784
785 static int zv_cumul_dist_counts_show(char *buf)
786 {
787         unsigned long i, n, chunks = 0, sum_total_chunks = 0;
788         char *p = buf;
789
790         for (i = 0; i < NCHUNKS; i++) {
791                 n = atomic_read(&zv_cumul_dist_counts[i]);
792                 p += sprintf(p, "%lu ", n);
793                 chunks += n;
794                 sum_total_chunks += i * n;
795         }
796         p += sprintf(p, "mean:%lu\n",
797                 chunks == 0 ? 0 : sum_total_chunks / chunks);
798         return p - buf;
799 }
800
801 /*
802  * setting zv_max_zsize via sysfs causes all persistent (e.g. swap)
803  * pages that don't compress to less than this value (including metadata
804  * overhead) to be rejected.  We don't allow the value to get too close
805  * to PAGE_SIZE.
806  */
807 static ssize_t zv_max_zsize_show(struct kobject *kobj,
808                                     struct kobj_attribute *attr,
809                                     char *buf)
810 {
811         return sprintf(buf, "%u\n", zv_max_zsize);
812 }
813
814 static ssize_t zv_max_zsize_store(struct kobject *kobj,
815                                     struct kobj_attribute *attr,
816                                     const char *buf, size_t count)
817 {
818         unsigned long val;
819         int err;
820
821         if (!capable(CAP_SYS_ADMIN))
822                 return -EPERM;
823
824         err = kstrtoul(buf, 10, &val);
825         if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
826                 return -EINVAL;
827         zv_max_zsize = val;
828         return count;
829 }
830
831 /*
832  * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap)
833  * pages that don't compress to less than this value (including metadata
834  * overhead) to be rejected UNLESS the mean compression is also smaller
835  * than this value.  In other words, we are load-balancing-by-zsize the
836  * accepted pages.  Again, we don't allow the value to get too close
837  * to PAGE_SIZE.
838  */
839 static ssize_t zv_max_mean_zsize_show(struct kobject *kobj,
840                                     struct kobj_attribute *attr,
841                                     char *buf)
842 {
843         return sprintf(buf, "%u\n", zv_max_mean_zsize);
844 }
845
846 static ssize_t zv_max_mean_zsize_store(struct kobject *kobj,
847                                     struct kobj_attribute *attr,
848                                     const char *buf, size_t count)
849 {
850         unsigned long val;
851         int err;
852
853         if (!capable(CAP_SYS_ADMIN))
854                 return -EPERM;
855
856         err = kstrtoul(buf, 10, &val);
857         if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
858                 return -EINVAL;
859         zv_max_mean_zsize = val;
860         return count;
861 }
862
863 /*
864  * setting zv_page_count_policy_percent via sysfs sets an upper bound on the
865  * number of persistent (e.g. swap) pages that will be retained, namely:
866  *     (zv_page_count_policy_percent * totalram_pages) / 100
867  * when that limit is reached, further puts will be rejected (until
868  * some pages have been flushed).  Note that, due to compression,
869  * this percentage may exceed 100; it defaults to 75 and we set an
870  * arbitrary limit of 150.  A poor choice will almost certainly result
871  * in OOMs, so this value should only be changed prudently.
872  */
873 static ssize_t zv_page_count_policy_percent_show(struct kobject *kobj,
874                                                  struct kobj_attribute *attr,
875                                                  char *buf)
876 {
877         return sprintf(buf, "%u\n", zv_page_count_policy_percent);
878 }
879
880 static ssize_t zv_page_count_policy_percent_store(struct kobject *kobj,
881                                                   struct kobj_attribute *attr,
882                                                   const char *buf, size_t count)
883 {
884         unsigned long val;
885         int err;
886
887         if (!capable(CAP_SYS_ADMIN))
888                 return -EPERM;
889
890         err = kstrtoul(buf, 10, &val);
891         if (err || (val == 0) || (val > 150))
892                 return -EINVAL;
893         zv_page_count_policy_percent = val;
894         return count;
895 }
896
897 static struct kobj_attribute zcache_zv_max_zsize_attr = {
898                 .attr = { .name = "zv_max_zsize", .mode = 0644 },
899                 .show = zv_max_zsize_show,
900                 .store = zv_max_zsize_store,
901 };
902
903 static struct kobj_attribute zcache_zv_max_mean_zsize_attr = {
904                 .attr = { .name = "zv_max_mean_zsize", .mode = 0644 },
905                 .show = zv_max_mean_zsize_show,
906                 .store = zv_max_mean_zsize_store,
907 };
908
909 static struct kobj_attribute zcache_zv_page_count_policy_percent_attr = {
910                 .attr = { .name = "zv_page_count_policy_percent",
911                           .mode = 0644 },
912                 .show = zv_page_count_policy_percent_show,
913                 .store = zv_page_count_policy_percent_store,
914 };
915 #endif
916
917 /*
918  * zcache core code starts here
919  */
920
921 /* useful stats not collected by cleancache or frontswap */
922 static unsigned long zcache_flush_total;
923 static unsigned long zcache_flush_found;
924 static unsigned long zcache_flobj_total;
925 static unsigned long zcache_flobj_found;
926 static unsigned long zcache_failed_eph_puts;
927 static unsigned long zcache_failed_pers_puts;
928
929 /*
930  * Tmem operations assume the poolid implies the invoking client.
931  * Zcache only has one client (the kernel itself): LOCAL_CLIENT.
932  * RAMster has each client numbered by cluster node, and a KVM version
933  * of zcache would have one client per guest and each client might
934  * have a poolid==N.
935  */
936 static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid)
937 {
938         struct tmem_pool *pool = NULL;
939         struct zcache_client *cli = NULL;
940
941         if (cli_id == LOCAL_CLIENT)
942                 cli = &zcache_host;
943         else {
944                 if (cli_id >= MAX_CLIENTS)
945                         goto out;
946                 cli = &zcache_clients[cli_id];
947                 if (cli == NULL)
948                         goto out;
949                 atomic_inc(&cli->refcount);
950         }
951         pool = idr_find(&cli->tmem_pools, poolid);
952         if (pool != NULL)
953                 atomic_inc(&pool->refcount);
954 out:
955         return pool;
956 }
957
958 static void zcache_put_pool(struct tmem_pool *pool)
959 {
960         struct zcache_client *cli = NULL;
961
962         if (pool == NULL)
963                 BUG();
964         cli = pool->client;
965         atomic_dec(&pool->refcount);
966         atomic_dec(&cli->refcount);
967 }
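/*
 * Refcounting note: zcache_get_pool_by_id() takes a reference on the
 * pool (and, for non-local clients, on the client), and
 * zcache_put_pool() drops both; callers such as zbud_evict_zbpg() pair
 * the two around each tmem operation on the pool.
 */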
968
969 int zcache_new_client(uint16_t cli_id)
970 {
971         struct zcache_client *cli = NULL;
972         int ret = -1;
973
974         if (cli_id == LOCAL_CLIENT)
975                 cli = &zcache_host;
976         else if ((unsigned int)cli_id < MAX_CLIENTS)
977                 cli = &zcache_clients[cli_id];
978         if (cli == NULL)
979                 goto out;
980         if (cli->allocated)
981                 goto out;
982         cli->allocated = true;
983 #ifdef CONFIG_FRONTSWAP
984         cli->zspool = zs_create_pool("zcache", ZCACHE_GFP_MASK);
985         if (cli->zspool == NULL)
986                 goto out;
987         idr_init(&cli->tmem_pools);
988 #endif
989         ret = 0;
990 out:
991         return ret;
992 }
993
994 /* counters for debugging */
995 static unsigned long zcache_failed_get_free_pages;
996 static unsigned long zcache_failed_alloc;
997 static unsigned long zcache_put_to_flush;
998
999 /*
1000  * for now, we use named slabs so we can easily track usage; later we can
1001  * either just use kmalloc, or perhaps add a slab-like allocator
1002  * to more carefully manage total memory utilization
1003  */
1004 static struct kmem_cache *zcache_objnode_cache;
1005 static struct kmem_cache *zcache_obj_cache;
1006 static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0);
1007 static unsigned long zcache_curr_obj_count_max;
1008 static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0);
1009 static unsigned long zcache_curr_objnode_count_max;
1010
1011 /*
1012  * to avoid memory allocation recursion (e.g. due to direct reclaim), we
1013  * preload all necessary data structures so the hostops callbacks never
1014  * actually do a malloc
1015  */
1016 struct zcache_preload {
1017         void *page;
1018         struct tmem_obj *obj;
1019         int nr;
1020         struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH];
1021 };
1022 static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };
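/*
 * Usage sketch: the put path (zcache_put_page() below) calls
 * zcache_do_preload() with irqs disabled before handing the page to
 * tmem; the tmem hostops below (zcache_obj_alloc(),
 * zcache_objnode_alloc(), zcache_get_free_page()) then consume only
 * what was stashed in this per-cpu struct, so no allocation happens
 * from within tmem itself.
 */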
1023
1024 static int zcache_do_preload(struct tmem_pool *pool)
1025 {
1026         struct zcache_preload *kp;
1027         struct tmem_objnode *objnode;
1028         struct tmem_obj *obj;
1029         void *page;
1030         int ret = -ENOMEM;
1031
1032         if (unlikely(zcache_objnode_cache == NULL))
1033                 goto out;
1034         if (unlikely(zcache_obj_cache == NULL))
1035                 goto out;
1036         preempt_disable();
1037         kp = &__get_cpu_var(zcache_preloads);
1038         while (kp->nr < ARRAY_SIZE(kp->objnodes)) {
1039                 preempt_enable_no_resched();
1040                 objnode = kmem_cache_alloc(zcache_objnode_cache,
1041                                 ZCACHE_GFP_MASK);
1042                 if (unlikely(objnode == NULL)) {
1043                         zcache_failed_alloc++;
1044                         goto out;
1045                 }
1046                 preempt_disable();
1047                 kp = &__get_cpu_var(zcache_preloads);
1048                 if (kp->nr < ARRAY_SIZE(kp->objnodes))
1049                         kp->objnodes[kp->nr++] = objnode;
1050                 else
1051                         kmem_cache_free(zcache_objnode_cache, objnode);
1052         }
1053         preempt_enable_no_resched();
1054         obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK);
1055         if (unlikely(obj == NULL)) {
1056                 zcache_failed_alloc++;
1057                 goto out;
1058         }
1059         page = (void *)__get_free_page(ZCACHE_GFP_MASK);
1060         if (unlikely(page == NULL)) {
1061                 zcache_failed_get_free_pages++;
1062                 kmem_cache_free(zcache_obj_cache, obj);
1063                 goto out;
1064         }
1065         preempt_disable();
1066         kp = &__get_cpu_var(zcache_preloads);
1067         if (kp->obj == NULL)
1068                 kp->obj = obj;
1069         else
1070                 kmem_cache_free(zcache_obj_cache, obj);
1071         if (kp->page == NULL)
1072                 kp->page = page;
1073         else
1074                 free_page((unsigned long)page);
1075         ret = 0;
1076 out:
1077         return ret;
1078 }
1079
1080 static void *zcache_get_free_page(void)
1081 {
1082         struct zcache_preload *kp;
1083         void *page;
1084
1085         kp = &__get_cpu_var(zcache_preloads);
1086         page = kp->page;
1087         BUG_ON(page == NULL);
1088         kp->page = NULL;
1089         return page;
1090 }
1091
1092 static void zcache_free_page(void *p)
1093 {
1094         free_page((unsigned long)p);
1095 }
1096
1097 /*
1098  * zcache implementation for tmem host ops
1099  */
1100
1101 static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool)
1102 {
1103         struct tmem_objnode *objnode = NULL;
1104         unsigned long count;
1105         struct zcache_preload *kp;
1106
1107         kp = &__get_cpu_var(zcache_preloads);
1108         if (kp->nr <= 0)
1109                 goto out;
1110         objnode = kp->objnodes[kp->nr - 1];
1111         BUG_ON(objnode == NULL);
1112         kp->objnodes[kp->nr - 1] = NULL;
1113         kp->nr--;
1114         count = atomic_inc_return(&zcache_curr_objnode_count);
1115         if (count > zcache_curr_objnode_count_max)
1116                 zcache_curr_objnode_count_max = count;
1117 out:
1118         return objnode;
1119 }
1120
1121 static void zcache_objnode_free(struct tmem_objnode *objnode,
1122                                         struct tmem_pool *pool)
1123 {
1124         atomic_dec(&zcache_curr_objnode_count);
1125         BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0);
1126         kmem_cache_free(zcache_objnode_cache, objnode);
1127 }
1128
1129 static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool)
1130 {
1131         struct tmem_obj *obj = NULL;
1132         unsigned long count;
1133         struct zcache_preload *kp;
1134
1135         kp = &__get_cpu_var(zcache_preloads);
1136         obj = kp->obj;
1137         BUG_ON(obj == NULL);
1138         kp->obj = NULL;
1139         count = atomic_inc_return(&zcache_curr_obj_count);
1140         if (count > zcache_curr_obj_count_max)
1141                 zcache_curr_obj_count_max = count;
1142         return obj;
1143 }
1144
1145 static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
1146 {
1147         atomic_dec(&zcache_curr_obj_count);
1148         BUG_ON(atomic_read(&zcache_curr_obj_count) < 0);
1149         kmem_cache_free(zcache_obj_cache, obj);
1150 }
1151
1152 static struct tmem_hostops zcache_hostops = {
1153         .obj_alloc = zcache_obj_alloc,
1154         .obj_free = zcache_obj_free,
1155         .objnode_alloc = zcache_objnode_alloc,
1156         .objnode_free = zcache_objnode_free,
1157 };
1158
1159 /*
1160  * zcache implementations for PAM page descriptor ops
1161  */
1162
1163 static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0);
1164 static unsigned long zcache_curr_eph_pampd_count_max;
1165 static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0);
1166 static unsigned long zcache_curr_pers_pampd_count_max;
1167
1168 /* forward reference */
1169 static int zcache_compress(struct page *from, void **out_va, unsigned *out_len);
1170
1171 static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph,
1172                                 struct tmem_pool *pool, struct tmem_oid *oid,
1173                                  uint32_t index)
1174 {
1175         void *pampd = NULL, *cdata;
1176         unsigned clen;
1177         int ret;
1178         unsigned long count;
1179         struct page *page = (struct page *)(data);
1180         struct zcache_client *cli = pool->client;
1181         uint16_t client_id = get_client_id_from_client(cli);
1182         unsigned long zv_mean_zsize;
1183         unsigned long curr_pers_pampd_count;
1184         u64 total_zsize;
1185
1186         if (eph) {
1187                 ret = zcache_compress(page, &cdata, &clen);
1188                 if (ret == 0)
1189                         goto out;
1190                 if (clen == 0 || clen > zbud_max_buddy_size()) {
1191                         zcache_compress_poor++;
1192                         goto out;
1193                 }
1194                 pampd = (void *)zbud_create(client_id, pool->pool_id, oid,
1195                                                 index, page, cdata, clen);
1196                 if (pampd != NULL) {
1197                         count = atomic_inc_return(&zcache_curr_eph_pampd_count);
1198                         if (count > zcache_curr_eph_pampd_count_max)
1199                                 zcache_curr_eph_pampd_count_max = count;
1200                 }
1201         } else {
1202                 curr_pers_pampd_count =
1203                         atomic_read(&zcache_curr_pers_pampd_count);
1204                 if (curr_pers_pampd_count >
1205                     (zv_page_count_policy_percent * totalram_pages) / 100)
1206                         goto out;
1207                 ret = zcache_compress(page, &cdata, &clen);
1208                 if (ret == 0)
1209                         goto out;
1210                 /* reject if compression is too poor */
1211                 if (clen > zv_max_zsize) {
1212                         zcache_compress_poor++;
1213                         goto out;
1214                 }
1215                 /* reject if mean compression is too poor */
1216                 if ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) {
1217                         total_zsize = zs_get_total_size_bytes(cli->zspool);
1218                         zv_mean_zsize = div_u64(total_zsize,
1219                                                 curr_pers_pampd_count);
1220                         if (zv_mean_zsize > zv_max_mean_zsize) {
1221                                 zcache_mean_compress_poor++;
1222                                 goto out;
1223                         }
1224                 }
1225                 pampd = (void *)zv_create(cli->zspool, pool->pool_id,
1226                                                 oid, index, cdata, clen);
1227                 if (pampd == NULL)
1228                         goto out;
1229                 count = atomic_inc_return(&zcache_curr_pers_pampd_count);
1230                 if (count > zcache_curr_pers_pampd_count_max)
1231                         zcache_curr_pers_pampd_count_max = count;
1232         }
1233 out:
1234         return pampd;
1235 }
1236
1237 /*
1238  * fill the pageframe corresponding to the struct page with the data
1239  * from the passed pampd (persistent pools only)
1240  */
1241 static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw,
1242                                         void *pampd, struct tmem_pool *pool,
1243                                         struct tmem_oid *oid, uint32_t index)
1244 {
1245         int ret = 0;
1246
1247         BUG_ON(is_ephemeral(pool));
1248         zv_decompress((struct page *)(data), (unsigned long)pampd);
1249         return ret;
1250 }
1251
1252 /*
1253  * fill the pageframe corresponding to the struct page with the data
1254  * from the passed pampd, then free the pampd (ephemeral pools only)
1255  */
1256 static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw,
1257                                         void *pampd, struct tmem_pool *pool,
1258                                         struct tmem_oid *oid, uint32_t index)
1259 {
1260         int ret = 0;
1261
1262         BUG_ON(!is_ephemeral(pool));
1263         zbud_decompress((struct page *)(data), pampd);
1264         zbud_free_and_delist((struct zbud_hdr *)pampd);
1265         atomic_dec(&zcache_curr_eph_pampd_count);
1266         return ret;
1267 }
1268
1269 /*
1270  * free the pampd and remove it from any zcache lists
1271  * pampd must no longer be pointed to from any tmem data structures!
1272  */
1273 static void zcache_pampd_free(void *pampd, struct tmem_pool *pool,
1274                                 struct tmem_oid *oid, uint32_t index)
1275 {
1276         struct zcache_client *cli = pool->client;
1277
1278         if (is_ephemeral(pool)) {
1279                 zbud_free_and_delist((struct zbud_hdr *)pampd);
1280                 atomic_dec(&zcache_curr_eph_pampd_count);
1281                 BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0);
1282         } else {
1283                 zv_free(cli->zspool, (unsigned long)pampd);
1284                 atomic_dec(&zcache_curr_pers_pampd_count);
1285                 BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0);
1286         }
1287 }
1288
1289 static void zcache_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj)
1290 {
1291 }
1292
1293 static void zcache_pampd_new_obj(struct tmem_obj *obj)
1294 {
1295 }
1296
1297 static int zcache_pampd_replace_in_obj(void *pampd, struct tmem_obj *obj)
1298 {
1299         return -1;
1300 }
1301
1302 static bool zcache_pampd_is_remote(void *pampd)
1303 {
1304         return false;
1305 }
1306
1307 static struct tmem_pamops zcache_pamops = {
1308         .create = zcache_pampd_create,
1309         .get_data = zcache_pampd_get_data,
1310         .get_data_and_free = zcache_pampd_get_data_and_free,
1311         .free = zcache_pampd_free,
1312         .free_obj = zcache_pampd_free_obj,
1313         .new_obj = zcache_pampd_new_obj,
1314         .replace_in_obj = zcache_pampd_replace_in_obj,
1315         .is_remote = zcache_pampd_is_remote,
1316 };
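/*
 * Summary of how tmem drives these ops: .create runs at put time and
 * compresses the page into either a zbud (ephemeral pools) or a
 * zv/zsmalloc allocation (persistent pools); .get_data serves gets from
 * persistent pools, .get_data_and_free serves the "exclusive" gets from
 * ephemeral pools (freeing the pampd as a side effect); and .free
 * reclaims a pampd that tmem is discarding (e.g. on flush).
 */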
1317
1318 /*
1319  * zcache compression/decompression and related per-cpu stuff
1320  */
1321
1322 static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);
1323 #define ZCACHE_DSTMEM_ORDER 1
1324
1325 static int zcache_compress(struct page *from, void **out_va, unsigned *out_len)
1326 {
1327         int ret = 0;
1328         unsigned char *dmem = __get_cpu_var(zcache_dstmem);
1329         char *from_va;
1330
1331         BUG_ON(!irqs_disabled());
1332         if (unlikely(dmem == NULL))
1333                 goto out;  /* no buffer or no compressor so can't compress */
1334         *out_len = PAGE_SIZE << ZCACHE_DSTMEM_ORDER;
1335         from_va = kmap_atomic(from);
1336         mb();
1337         ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, from_va, PAGE_SIZE, dmem,
1338                                 out_len);
1339         BUG_ON(ret);
1340         *out_va = dmem;
1341         kunmap_atomic(from_va);
1342         ret = 1;
1343 out:
1344         return ret;
1345 }
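/*
 * Note the return convention: zcache_compress() returns 1 on success and
 * 0 on failure (no per-cpu destination buffer), which is why the callers
 * in zcache_pampd_create() treat "ret == 0" as the error case.
 */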
1346
1347 static int zcache_comp_cpu_up(int cpu)
1348 {
1349         struct crypto_comp *tfm;
1350
1351         tfm = crypto_alloc_comp(zcache_comp_name, 0, 0);
1352         if (IS_ERR(tfm))
1353                 return NOTIFY_BAD;
1354         *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = tfm;
1355         return NOTIFY_OK;
1356 }
1357
1358 static void zcache_comp_cpu_down(int cpu)
1359 {
1360         struct crypto_comp *tfm;
1361
1362         tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu);
1363         crypto_free_comp(tfm);
1364         *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL;
1365 }
1366
1367 static int zcache_cpu_notifier(struct notifier_block *nb,
1368                                 unsigned long action, void *pcpu)
1369 {
1370         int ret, cpu = (long)pcpu;
1371         struct zcache_preload *kp;
1372
1373         switch (action) {
1374         case CPU_UP_PREPARE:
1375                 ret = zcache_comp_cpu_up(cpu);
1376                 if (ret != NOTIFY_OK) {
1377                         pr_err("zcache: can't allocate compressor transform\n");
1378                         return ret;
1379                 }
1380                 per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages(
1381                         GFP_KERNEL | __GFP_REPEAT, ZCACHE_DSTMEM_ORDER);
1382                 break;
1383         case CPU_DEAD:
1384         case CPU_UP_CANCELED:
1385                 zcache_comp_cpu_down(cpu);
1386                 free_pages((unsigned long)per_cpu(zcache_dstmem, cpu),
1387                         ZCACHE_DSTMEM_ORDER);
1388                 per_cpu(zcache_dstmem, cpu) = NULL;
1389                 kp = &per_cpu(zcache_preloads, cpu);
1390                 while (kp->nr) {
1391                         kmem_cache_free(zcache_objnode_cache,
1392                                         kp->objnodes[kp->nr - 1]);
1393                         kp->objnodes[kp->nr - 1] = NULL;
1394                         kp->nr--;
1395                 }
1396                 if (kp->obj) {
1397                         kmem_cache_free(zcache_obj_cache, kp->obj);
1398                         kp->obj = NULL;
1399                 }
1400                 if (kp->page) {
1401                         free_page((unsigned long)kp->page);
1402                         kp->page = NULL;
1403                 }
1404                 break;
1405         default:
1406                 break;
1407         }
1408         return NOTIFY_OK;
1409 }
1410
1411 static struct notifier_block zcache_cpu_notifier_block = {
1412         .notifier_call = zcache_cpu_notifier
1413 };
1414
1415 #ifdef CONFIG_SYSFS
1416 #define ZCACHE_SYSFS_RO(_name) \
1417         static ssize_t zcache_##_name##_show(struct kobject *kobj, \
1418                                 struct kobj_attribute *attr, char *buf) \
1419         { \
1420                 return sprintf(buf, "%lu\n", zcache_##_name); \
1421         } \
1422         static struct kobj_attribute zcache_##_name##_attr = { \
1423                 .attr = { .name = __stringify(_name), .mode = 0444 }, \
1424                 .show = zcache_##_name##_show, \
1425         }
1426
1427 #define ZCACHE_SYSFS_RO_ATOMIC(_name) \
1428         static ssize_t zcache_##_name##_show(struct kobject *kobj, \
1429                                 struct kobj_attribute *attr, char *buf) \
1430         { \
1431             return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
1432         } \
1433         static struct kobj_attribute zcache_##_name##_attr = { \
1434                 .attr = { .name = __stringify(_name), .mode = 0444 }, \
1435                 .show = zcache_##_name##_show, \
1436         }
1437
1438 #define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \
1439         static ssize_t zcache_##_name##_show(struct kobject *kobj, \
1440                                 struct kobj_attribute *attr, char *buf) \
1441         { \
1442             return _func(buf); \
1443         } \
1444         static struct kobj_attribute zcache_##_name##_attr = { \
1445                 .attr = { .name = __stringify(_name), .mode = 0444 }, \
1446                 .show = zcache_##_name##_show, \
1447         }
1448
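/*
 * For example, ZCACHE_SYSFS_RO(flush_total) expands (essentially) to:
 *
 *	static ssize_t zcache_flush_total_show(struct kobject *kobj,
 *				struct kobj_attribute *attr, char *buf)
 *	{
 *		return sprintf(buf, "%lu\n", zcache_flush_total);
 *	}
 *	static struct kobj_attribute zcache_flush_total_attr = {
 *		.attr = { .name = "flush_total", .mode = 0444 },
 *		.show = zcache_flush_total_show,
 *	};
 *
 * i.e. a read-only sysfs file named "flush_total" in the "zcache"
 * attribute group declared below.
 */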
1449 ZCACHE_SYSFS_RO(curr_obj_count_max);
1450 ZCACHE_SYSFS_RO(curr_objnode_count_max);
1451 ZCACHE_SYSFS_RO(flush_total);
1452 ZCACHE_SYSFS_RO(flush_found);
1453 ZCACHE_SYSFS_RO(flobj_total);
1454 ZCACHE_SYSFS_RO(flobj_found);
1455 ZCACHE_SYSFS_RO(failed_eph_puts);
1456 ZCACHE_SYSFS_RO(failed_pers_puts);
1457 ZCACHE_SYSFS_RO(zbud_curr_zbytes);
1458 ZCACHE_SYSFS_RO(zbud_cumul_zpages);
1459 ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
1460 ZCACHE_SYSFS_RO(zbud_buddied_count);
1461 ZCACHE_SYSFS_RO(zbpg_unused_list_count);
1462 ZCACHE_SYSFS_RO(evicted_raw_pages);
1463 ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
1464 ZCACHE_SYSFS_RO(evicted_buddied_pages);
1465 ZCACHE_SYSFS_RO(failed_get_free_pages);
1466 ZCACHE_SYSFS_RO(failed_alloc);
1467 ZCACHE_SYSFS_RO(put_to_flush);
1468 ZCACHE_SYSFS_RO(compress_poor);
1469 ZCACHE_SYSFS_RO(mean_compress_poor);
1470 ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
1471 ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
1472 ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
1473 ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);
1474 ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
1475                         zbud_show_unbuddied_list_counts);
1476 ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
1477                         zbud_show_cumul_chunk_counts);
1478 ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts,
1479                         zv_curr_dist_counts_show);
1480 ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts,
1481                         zv_cumul_dist_counts_show);
1482
1483 static struct attribute *zcache_attrs[] = {
1484         &zcache_curr_obj_count_attr.attr,
1485         &zcache_curr_obj_count_max_attr.attr,
1486         &zcache_curr_objnode_count_attr.attr,
1487         &zcache_curr_objnode_count_max_attr.attr,
1488         &zcache_flush_total_attr.attr,
1489         &zcache_flobj_total_attr.attr,
1490         &zcache_flush_found_attr.attr,
1491         &zcache_flobj_found_attr.attr,
1492         &zcache_failed_eph_puts_attr.attr,
1493         &zcache_failed_pers_puts_attr.attr,
1494         &zcache_compress_poor_attr.attr,
1495         &zcache_mean_compress_poor_attr.attr,
1496         &zcache_zbud_curr_raw_pages_attr.attr,
1497         &zcache_zbud_curr_zpages_attr.attr,
1498         &zcache_zbud_curr_zbytes_attr.attr,
1499         &zcache_zbud_cumul_zpages_attr.attr,
1500         &zcache_zbud_cumul_zbytes_attr.attr,
1501         &zcache_zbud_buddied_count_attr.attr,
1502         &zcache_zbpg_unused_list_count_attr.attr,
1503         &zcache_evicted_raw_pages_attr.attr,
1504         &zcache_evicted_unbuddied_pages_attr.attr,
1505         &zcache_evicted_buddied_pages_attr.attr,
1506         &zcache_failed_get_free_pages_attr.attr,
1507         &zcache_failed_alloc_attr.attr,
1508         &zcache_put_to_flush_attr.attr,
1509         &zcache_zbud_unbuddied_list_counts_attr.attr,
1510         &zcache_zbud_cumul_chunk_counts_attr.attr,
1511         &zcache_zv_curr_dist_counts_attr.attr,
1512         &zcache_zv_cumul_dist_counts_attr.attr,
1513         &zcache_zv_max_zsize_attr.attr,
1514         &zcache_zv_max_mean_zsize_attr.attr,
1515         &zcache_zv_page_count_policy_percent_attr.attr,
1516         NULL,
1517 };
1518
1519 static struct attribute_group zcache_attr_group = {
1520         .attrs = zcache_attrs,
1521         .name = "zcache",
1522 };
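
/*
 * Because this group is created on mm_kobj with .name = "zcache" (see
 * zcache_init() below), the counters appear under /sys/kernel/mm/zcache/
 * (assuming sysfs is mounted at /sys); e.g. reading
 * /sys/kernel/mm/zcache/zbud_curr_raw_pages reports how many raw pageframes
 * zbud currently holds.
 */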
1523
1524 #endif /* CONFIG_SYSFS */
1525 /*
1526  * When zcache is disabled ("frozen"), pools can be created and destroyed,
1527  * but all puts (and thus all other operations that require memory allocation)
1528  * must fail.  If zcache is unfrozen, accepts puts, and is then frozen again,
1529  * data consistency requires that all puts issued while it is frozen be
1530  * converted into flushes.
1531  */
1532 static bool zcache_freeze;
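
/*
 * That conversion is done in zcache_put_page(): while zcache_freeze is set,
 * a put fails and any data already cached for that index is flushed, so a
 * later get cannot return a stale copy.
 */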
1533
1534 /*
1535  * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
1536  */
1537 static int shrink_zcache_memory(struct shrinker *shrink,
1538                                 struct shrink_control *sc)
1539 {
1540         int ret = -1;
1541         int nr = sc->nr_to_scan;
1542         gfp_t gfp_mask = sc->gfp_mask;
1543
1544         if (nr >= 0) {
1545                 if (!(gfp_mask & __GFP_FS))
1546                         /* does this case really need to be skipped? */
1547                         goto out;
1548                 zbud_evict_pages(nr);
1549         }
1550         ret = (int)atomic_read(&zcache_zbud_curr_raw_pages);
1551 out:
1552         return ret;
1553 }
1554
1555 static struct shrinker zcache_shrinker = {
1556         .shrink = shrink_zcache_memory,
1557         .seeks = DEFAULT_SEEKS,
1558 };
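
/*
 * With the shrinker API used here, the VM core first calls ->shrink() with
 * sc->nr_to_scan == 0 to ask how many objects are reclaimable and then again
 * with a positive count to reclaim that many; shrink_zcache_memory() serves
 * both cases by evicting via zbud_evict_pages() and returning the number of
 * raw zbud pageframes still held, or -1 when the allocation context does not
 * allow __GFP_FS.
 */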
1559
1560 /*
1561  * zcache shims between cleancache/frontswap ops and tmem
1562  */
1563
1564 static int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp,
1565                                 uint32_t index, struct page *page)
1566 {
1567         struct tmem_pool *pool;
1568         int ret = -1;
1569
1570         BUG_ON(!irqs_disabled());
1571         pool = zcache_get_pool_by_id(cli_id, pool_id);
1572         if (unlikely(pool == NULL))
1573                 goto out;
1574         if (!zcache_freeze && zcache_do_preload(pool) == 0) {
1575                 /* preload does preempt_disable on success */
1576                 ret = tmem_put(pool, oidp, index, (char *)(page),
1577                                 PAGE_SIZE, 0, is_ephemeral(pool));
1578                 if (ret < 0) {
1579                         if (is_ephemeral(pool))
1580                                 zcache_failed_eph_puts++;
1581                         else
1582                                 zcache_failed_pers_puts++;
1583                 }
1584                 zcache_put_pool(pool);
1585                 preempt_enable_no_resched();
1586         } else {
1587                 zcache_put_to_flush++;
1588                 if (atomic_read(&pool->obj_count) > 0)
1589                         /* the put fails whether the flush succeeds or not */
1590                         (void)tmem_flush_page(pool, oidp, index);
1591                 zcache_put_pool(pool);
1592         }
1593 out:
1594         return ret;
1595 }
1596
1597 static int zcache_get_page(int cli_id, int pool_id, struct tmem_oid *oidp,
1598                                 uint32_t index, struct page *page)
1599 {
1600         struct tmem_pool *pool;
1601         int ret = -1;
1602         unsigned long flags;
1603         size_t size = PAGE_SIZE;
1604
1605         local_irq_save(flags);
1606         pool = zcache_get_pool_by_id(cli_id, pool_id);
1607         if (likely(pool != NULL)) {
1608                 if (atomic_read(&pool->obj_count) > 0)
1609                         ret = tmem_get(pool, oidp, index, (char *)(page),
1610                                         &size, 0, is_ephemeral(pool));
1611                 zcache_put_pool(pool);
1612         }
1613         local_irq_restore(flags);
1614         return ret;
1615 }
1616
1617 static int zcache_flush_page(int cli_id, int pool_id,
1618                                 struct tmem_oid *oidp, uint32_t index)
1619 {
1620         struct tmem_pool *pool;
1621         int ret = -1;
1622         unsigned long flags;
1623
1624         local_irq_save(flags);
1625         zcache_flush_total++;
1626         pool = zcache_get_pool_by_id(cli_id, pool_id);
1627         if (likely(pool != NULL)) {
1628                 if (atomic_read(&pool->obj_count) > 0)
1629                         ret = tmem_flush_page(pool, oidp, index);
1630                 zcache_put_pool(pool);
1631         }
1632         if (ret >= 0)
1633                 zcache_flush_found++;
1634         local_irq_restore(flags);
1635         return ret;
1636 }
1637
1638 static int zcache_flush_object(int cli_id, int pool_id,
1639                                 struct tmem_oid *oidp)
1640 {
1641         struct tmem_pool *pool;
1642         int ret = -1;
1643         unsigned long flags;
1644
1645         local_irq_save(flags);
1646         zcache_flobj_total++;
1647         pool = zcache_get_pool_by_id(cli_id, pool_id);
1648         if (likely(pool != NULL)) {
1649                 if (atomic_read(&pool->obj_count) > 0)
1650                         ret = tmem_flush_object(pool, oidp);
1651                 zcache_put_pool(pool);
1652         }
1653         if (ret >= 0)
1654                 zcache_flobj_found++;
1655         local_irq_restore(flags);
1656         return ret;
1657 }
1658
1659 static int zcache_destroy_pool(int cli_id, int pool_id)
1660 {
1661         struct tmem_pool *pool = NULL;
1662         struct zcache_client *cli = NULL;
1663         int ret = -1;
1664
1665         if (pool_id < 0)
1666                 goto out;
1667         if (cli_id == LOCAL_CLIENT)
1668                 cli = &zcache_host;
1669         else if ((unsigned int)cli_id < MAX_CLIENTS)
1670                 cli = &zcache_clients[cli_id];
1671         if (cli == NULL)
1672                 goto out;
1673         atomic_inc(&cli->refcount);
1674         pool = idr_find(&cli->tmem_pools, pool_id);
1675         if (pool == NULL)
1676                 goto out;
1677         idr_remove(&cli->tmem_pools, pool_id);
1678         /* wait for pool activity on other cpus to quiesce */
1679         while (atomic_read(&pool->refcount) != 0)
1680                 ;
1681         atomic_dec(&cli->refcount);
1682         local_bh_disable();
1683         ret = tmem_destroy_pool(pool);
1684         local_bh_enable();
1685         kfree(pool);
1686         pr_info("zcache: destroyed pool id=%d, cli_id=%d\n",
1687                         pool_id, cli_id);
1688 out:
1689         return ret;
1690 }
1691
1692 static int zcache_new_pool(uint16_t cli_id, uint32_t flags)
1693 {
1694         int poolid = -1;
1695         struct tmem_pool *pool;
1696         struct zcache_client *cli = NULL;
1697         int r;
1698
1699         if (cli_id == LOCAL_CLIENT)
1700                 cli = &zcache_host;
1701         else if ((unsigned int)cli_id < MAX_CLIENTS)
1702                 cli = &zcache_clients[cli_id];
1703         if (cli == NULL)
1704                 goto out;
1705         atomic_inc(&cli->refcount);
1706         pool = kmalloc(sizeof(struct tmem_pool), GFP_ATOMIC);
1707         if (pool == NULL) {
1708                 pr_info("zcache: pool creation failed: out of memory\n");
1709                 goto out;
1710         }
1711
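        /*
         * idr_get_new() can still return -EAGAIN if the node preallocated by
         * idr_pre_get() was consumed by another caller in the meantime, so
         * the preload/allocate pair is retried until an id is assigned or
         * memory is genuinely exhausted.
         */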
1712         do {
1713                 r = idr_pre_get(&cli->tmem_pools, GFP_ATOMIC);
1714                 if (r != 1) {
1715                         kfree(pool);
1716                         pr_info("zcache: pool creation failed: out of memory\n");
1717                         goto out;
1718                 }
1719                 r = idr_get_new(&cli->tmem_pools, pool, &poolid);
1720         } while (r == -EAGAIN);
1721         if (r) {
1722                 pr_info("zcache: pool creation failed: error %d\n", r);
1723                 kfree(pool);
1724                 goto out;
1725         }
1726
1727         atomic_set(&pool->refcount, 0);
1728         pool->client = cli;
1729         pool->pool_id = poolid;
1730         tmem_new_pool(pool, flags);
1731         pr_info("zcache: created %s tmem pool, id=%d, client=%d\n",
1732                 flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
1733                 poolid, cli_id);
1734 out:
1735         if (cli != NULL)
1736                 atomic_dec(&cli->refcount);
1737         return poolid;
1738 }
1739
1740 /**********
1741  * Two kernel functionalities currently can be layered on top of tmem.
1742  * These are "cleancache" which is used as a second-chance cache for clean
1743  * page cache pages; and "frontswap" which is used for swap pages
1744  * to avoid writes to disk.  A generic "shim" is provided here for each
1745  * to translate in-kernel semantics to zcache semantics.
1746  */
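
/*
 * Both shims follow the same translation: the cleancache (key, page index)
 * pair or the frontswap (type, offset) pair is re-encoded as a struct
 * tmem_oid plus a 32-bit index, and an offset that does not fit in 32 bits
 * is silently dropped on put or treated as a miss on get/flush.
 */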
1747
1748 #ifdef CONFIG_CLEANCACHE
1749 static void zcache_cleancache_put_page(int pool_id,
1750                                         struct cleancache_filekey key,
1751                                         pgoff_t index, struct page *page)
1752 {
1753         u32 ind = (u32) index;
1754         struct tmem_oid oid = *(struct tmem_oid *)&key;
1755
1756         if (likely(ind == index))
1757                 (void)zcache_put_page(LOCAL_CLIENT, pool_id, &oid, index, page);
1758 }
1759
1760 static int zcache_cleancache_get_page(int pool_id,
1761                                         struct cleancache_filekey key,
1762                                         pgoff_t index, struct page *page)
1763 {
1764         u32 ind = (u32) index;
1765         struct tmem_oid oid = *(struct tmem_oid *)&key;
1766         int ret = -1;
1767
1768         if (likely(ind == index))
1769                 ret = zcache_get_page(LOCAL_CLIENT, pool_id, &oid, index, page);
1770         return ret;
1771 }
1772
1773 static void zcache_cleancache_flush_page(int pool_id,
1774                                         struct cleancache_filekey key,
1775                                         pgoff_t index)
1776 {
1777         u32 ind = (u32) index;
1778         struct tmem_oid oid = *(struct tmem_oid *)&key;
1779
1780         if (likely(ind == index))
1781                 (void)zcache_flush_page(LOCAL_CLIENT, pool_id, &oid, ind);
1782 }
1783
1784 static void zcache_cleancache_flush_inode(int pool_id,
1785                                         struct cleancache_filekey key)
1786 {
1787         struct tmem_oid oid = *(struct tmem_oid *)&key;
1788
1789         (void)zcache_flush_object(LOCAL_CLIENT, pool_id, &oid);
1790 }
1791
1792 static void zcache_cleancache_flush_fs(int pool_id)
1793 {
1794         if (pool_id >= 0)
1795                 (void)zcache_destroy_pool(LOCAL_CLIENT, pool_id);
1796 }
1797
1798 static int zcache_cleancache_init_fs(size_t pagesize)
1799 {
1800         BUG_ON(sizeof(struct cleancache_filekey) !=
1801                                 sizeof(struct tmem_oid));
1802         BUG_ON(pagesize != PAGE_SIZE);
1803         return zcache_new_pool(LOCAL_CLIENT, 0);
1804 }
1805
1806 static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
1807 {
1808         /* shared pools are unsupported and map to private */
1809         BUG_ON(sizeof(struct cleancache_filekey) !=
1810                                 sizeof(struct tmem_oid));
1811         BUG_ON(pagesize != PAGE_SIZE);
1812         return zcache_new_pool(LOCAL_CLIENT, 0);
1813 }
1814
1815 static struct cleancache_ops zcache_cleancache_ops = {
1816         .put_page = zcache_cleancache_put_page,
1817         .get_page = zcache_cleancache_get_page,
1818         .invalidate_page = zcache_cleancache_flush_page,
1819         .invalidate_inode = zcache_cleancache_flush_inode,
1820         .invalidate_fs = zcache_cleancache_flush_fs,
1821         .init_shared_fs = zcache_cleancache_init_shared_fs,
1822         .init_fs = zcache_cleancache_init_fs
1823 };
1824
1825 struct cleancache_ops zcache_cleancache_register_ops(void)
1826 {
1827         struct cleancache_ops old_ops =
1828                 cleancache_register_ops(&zcache_cleancache_ops);
1829
1830         return old_ops;
1831 }
1832 #endif
1833
1834 #ifdef CONFIG_FRONTSWAP
1835 /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
1836 static int zcache_frontswap_poolid = -1;
1837
1838 /*
1839  * Swizzling increases objects per swaptype, increasing tmem concurrency
1840  * for heavy swaploads.  Later, a larger nr_cpus may warrant a larger SWIZ_BITS.
1841  * Setting SWIZ_BITS to 27 would basically reconstruct the swap entry from
1842  * frontswap_get_page(), but that has side-effects, so 8 is used instead.
1843  */
1844 #define SWIZ_BITS               8
1845 #define SWIZ_MASK               ((1 << SWIZ_BITS) - 1)
1846 #define _oswiz(_type, _ind)     ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
1847 #define iswiz(_ind)             (_ind >> SWIZ_BITS)
1848
1849 static inline struct tmem_oid oswiz(unsigned type, u32 ind)
1850 {
1851         struct tmem_oid oid = { .oid = { 0 } };
1852         oid.oid[0] = _oswiz(type, ind);
1853         return oid;
1854 }
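
/*
 * Worked example with SWIZ_BITS == 8: for swap type 1 and page offset
 * 0x12345, oswiz(1, 0x12345) sets oid.oid[0] = (1 << 8) | 0x45 = 0x145 and
 * iswiz(0x12345) = 0x123, so consecutive offsets are spread across up to 256
 * tmem objects per swap type (offsets that differ by a multiple of 256 share
 * an object and are told apart by the index).
 */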
1855
1856 static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
1857                                    struct page *page)
1858 {
1859         u64 ind64 = (u64)offset;
1860         u32 ind = (u32)offset;
1861         struct tmem_oid oid = oswiz(type, ind);
1862         int ret = -1;
1863         unsigned long flags;
1864
1865         BUG_ON(!PageLocked(page));
1866         if (likely(ind64 == ind)) {
1867                 local_irq_save(flags);
1868                 ret = zcache_put_page(LOCAL_CLIENT, zcache_frontswap_poolid,
1869                                         &oid, iswiz(ind), page);
1870                 local_irq_restore(flags);
1871         }
1872         return ret;
1873 }
1874
1875 /* returns 0 if the page was successfully gotten from frontswap, -1 if it
1876  * was not present (should never happen!) */
1877 static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
1878                                    struct page *page)
1879 {
1880         u64 ind64 = (u64)offset;
1881         u32 ind = (u32)offset;
1882         struct tmem_oid oid = oswiz(type, ind);
1883         int ret = -1;
1884
1885         BUG_ON(!PageLocked(page));
1886         if (likely(ind64 == ind))
1887                 ret = zcache_get_page(LOCAL_CLIENT, zcache_frontswap_poolid,
1888                                         &oid, iswiz(ind), page);
1889         return ret;
1890 }
1891
1892 /* flush a single page from frontswap */
1893 static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)
1894 {
1895         u64 ind64 = (u64)offset;
1896         u32 ind = (u32)offset;
1897         struct tmem_oid oid = oswiz(type, ind);
1898
1899         if (likely(ind64 == ind))
1900                 (void)zcache_flush_page(LOCAL_CLIENT, zcache_frontswap_poolid,
1901                                         &oid, iswiz(ind));
1902 }
1903
1904 /* flush all pages from the passed swaptype */
1905 static void zcache_frontswap_flush_area(unsigned type)
1906 {
1907         struct tmem_oid oid;
1908         int ind;
1909
1910         for (ind = SWIZ_MASK; ind >= 0; ind--) {
1911                 oid = oswiz(type, ind);
1912                 (void)zcache_flush_object(LOCAL_CLIENT,
1913                                                 zcache_frontswap_poolid, &oid);
1914         }
1915 }
1916
1917 static void zcache_frontswap_init(unsigned ignored)
1918 {
1919         /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
1920         if (zcache_frontswap_poolid < 0)
1921                 zcache_frontswap_poolid =
1922                         zcache_new_pool(LOCAL_CLIENT, TMEM_POOL_PERSIST);
1923 }
1924
1925 static struct frontswap_ops zcache_frontswap_ops = {
1926         .put_page = zcache_frontswap_put_page,
1927         .get_page = zcache_frontswap_get_page,
1928         .invalidate_page = zcache_frontswap_flush_page,
1929         .invalidate_area = zcache_frontswap_flush_area,
1930         .init = zcache_frontswap_init
1931 };
1932
1933 struct frontswap_ops zcache_frontswap_register_ops(void)
1934 {
1935         struct frontswap_ops old_ops =
1936                 frontswap_register_ops(&zcache_frontswap_ops);
1937
1938         return old_ops;
1939 }
1940 #endif
1941
1942 /*
1943  * zcache initialization
1944  * NOTE: for now, zcache MUST be enabled with the "zcache" kernel boot
1945  * parameter or nothing happens!
1946  */
1947
1948 static int zcache_enabled;
1949
1950 static int __init enable_zcache(char *s)
1951 {
1952         zcache_enabled = 1;
1953         return 1;
1954 }
1955 __setup("zcache", enable_zcache);
1956
1957 /* allow independent dynamic disabling of cleancache and frontswap */
1958
1959 static int use_cleancache = 1;
1960
1961 static int __init no_cleancache(char *s)
1962 {
1963         use_cleancache = 0;
1964         return 1;
1965 }
1966
1967 __setup("nocleancache", no_cleancache);
1968
1969 static int use_frontswap = 1;
1970
1971 static int __init no_frontswap(char *s)
1972 {
1973         use_frontswap = 0;
1974         return 1;
1975 }
1976
1977 __setup("nofrontswap", no_frontswap);
1978
1979 static int __init enable_zcache_compressor(char *s)
1980 {
1981         strlcpy(zcache_comp_name, s, sizeof(zcache_comp_name));
1982         zcache_enabled = 1;
1983         return 1;
1984 }
1985 __setup("zcache=", enable_zcache_compressor);
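
/*
 * Example boot parameters: "zcache" enables zcache with the default lzo
 * compressor, "zcache=deflate" requests a different crypto_comp algorithm
 * (assuming it is built into the kernel), and "nocleancache" or
 * "nofrontswap" disables the respective layer while leaving the other
 * available.
 */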
1986
1987
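/*
 * Pick and sanity-check the compression algorithm named above (falling back
 * to lzo) and allocate the per-cpu array of crypto_comp pointers.  Returns 0
 * on success and nonzero if no usable compressor is available or the per-cpu
 * allocation fails.
 */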
1988 static int zcache_comp_init(void)
1989 {
1990         int ret = 0;
1991
1992         /* check crypto algorithm */
1993         if (*zcache_comp_name != '\0') {
1994                 ret = crypto_has_comp(zcache_comp_name, 0, 0);
1995                 if (!ret)
1996                         pr_info("zcache: %s not supported\n",
1997                                         zcache_comp_name);
1998         }
1999         if (!ret)
2000                 strcpy(zcache_comp_name, "lzo");
2001         ret = crypto_has_comp(zcache_comp_name, 0, 0);
2002         if (!ret) {
2003                 ret = 1;
2004                 goto out;
2005         }
2006         pr_info("zcache: using %s compressor\n", zcache_comp_name);
2007
2008         /* alloc percpu transforms */
2009         ret = 0;
2010         zcache_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
2011         if (!zcache_comp_pcpu_tfms)
2012                 ret = 1;
2013 out:
2014         return ret;
2015 }
2016
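/*
 * Bring-up order: the sysfs group is created first; if the "zcache" boot
 * parameter was given, the tmem host/pam ops, the CPU notifier, the
 * compressor and the per-cpu transforms are initialized; the tmem
 * object/objnode caches and the local client are then created; finally the
 * cleancache and frontswap shims are registered (each only when zcache is
 * enabled and not suppressed with "nocleancache"/"nofrontswap").
 */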
2017 static int __init zcache_init(void)
2018 {
2019         int ret = 0;
2020
2021 #ifdef CONFIG_SYSFS
2022         ret = sysfs_create_group(mm_kobj, &zcache_attr_group);
2023         if (ret) {
2024                 pr_err("zcache: can't create sysfs\n");
2025                 goto out;
2026         }
2027 #endif /* CONFIG_SYSFS */
2028 #if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP)
2029         if (zcache_enabled) {
2030                 unsigned int cpu;
2031
2032                 tmem_register_hostops(&zcache_hostops);
2033                 tmem_register_pamops(&zcache_pamops);
2034                 ret = register_cpu_notifier(&zcache_cpu_notifier_block);
2035                 if (ret) {
2036                         pr_err("zcache: can't register cpu notifier\n");
2037                         goto out;
2038                 }
2039                 ret = zcache_comp_init();
2040                 if (ret) {
2041                         pr_err("zcache: compressor initialization failed\n");
2042                         goto out;
2043                 }
2044                 for_each_online_cpu(cpu) {
2045                         void *pcpu = (void *)(long)cpu;
2046                         zcache_cpu_notifier(&zcache_cpu_notifier_block,
2047                                 CPU_UP_PREPARE, pcpu);
2048                 }
2049         }
2050         zcache_objnode_cache = kmem_cache_create("zcache_objnode",
2051                                 sizeof(struct tmem_objnode), 0, 0, NULL);
2052         zcache_obj_cache = kmem_cache_create("zcache_obj",
2053                                 sizeof(struct tmem_obj), 0, 0, NULL);
2054         ret = zcache_new_client(LOCAL_CLIENT);
2055         if (ret) {
2056                 pr_err("zcache: can't create client\n");
2057                 goto out;
2058         }
2059 #endif
2060 #ifdef CONFIG_CLEANCACHE
2061         if (zcache_enabled && use_cleancache) {
2062                 struct cleancache_ops old_ops;
2063
2064                 zbud_init();
2065                 register_shrinker(&zcache_shrinker);
2066                 old_ops = zcache_cleancache_register_ops();
2067                 pr_info("zcache: cleancache enabled using kernel "
2068                         "transcendent memory and compression buddies\n");
2069                 if (old_ops.init_fs != NULL)
2070                         pr_warning("zcache: cleancache_ops overridden\n");
2071         }
2072 #endif
2073 #ifdef CONFIG_FRONTSWAP
2074         if (zcache_enabled && use_frontswap) {
2075                 struct frontswap_ops old_ops;
2076
2077                 old_ops = zcache_frontswap_register_ops();
2078                 pr_info("zcache: frontswap enabled using kernel "
2079                         "transcendent memory and zsmalloc\n");
2080                 if (old_ops.init != NULL)
2081                         pr_warning("zcache: frontswap_ops overridden\n");
2082         }
2083 #endif
2084 out:
2085         return ret;
2086 }
2087
2088 module_init(zcache_init)