]> Pileus Git - ~andy/linux/blob - drivers/block/rbd.c
libceph: let osd ops determine request data length
[~andy/linux] / drivers / block / rbd.c
1 /*
2    rbd.c -- Export ceph rados objects as a Linux block device
3
4
5    based on drivers/block/osdblk.c:
6
7    Copyright 2009 Red Hat, Inc.
8
9    This program is free software; you can redistribute it and/or modify
10    it under the terms of the GNU General Public License as published by
11    the Free Software Foundation.
12
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17
18    You should have received a copy of the GNU General Public License
19    along with this program; see the file COPYING.  If not, write to
20    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
24    For usage instructions, please refer to:
25
26                  Documentation/ABI/testing/sysfs-bus-rbd
27
28  */
29
30 #include <linux/ceph/libceph.h>
31 #include <linux/ceph/osd_client.h>
32 #include <linux/ceph/mon_client.h>
33 #include <linux/ceph/decode.h>
34 #include <linux/parser.h>
35
36 #include <linux/kernel.h>
37 #include <linux/device.h>
38 #include <linux/module.h>
39 #include <linux/fs.h>
40 #include <linux/blkdev.h>
41
42 #include "rbd_types.h"
43
/*
 * Defining RBD_DEBUG selects the checking version of rbd_assert();
 * without it rbd_assert() expands to a no-op.
 */
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have these defined elsewhere */

/* Largest value of each fixed-width unsigned type (all bits set) */
#define U8_MAX	((u8)	(~0U))
#define U16_MAX	((u16)	(~0U))
#define U32_MAX	((u32)	(~0U))
#define U64_MAX	((u64)	(~0ULL))
61
/* Short and long driver names; the short one prefixes rbd_warn() output */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/*
 * The longest snapshot name is NAME_MAX less the length of the
 * "snap_" prefix (sizeof - 1 leaves out the prefix's terminating NUL).
 */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

/* Snapshot name used for the image head, i.e. snap id CEPH_NOSNAP */
#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64
80
/* Feature bits */

#define RBD_FEATURE_LAYERING      1

/*
 * Features supported by this (client software) implementation.
 * The mask is currently empty: no feature bit, RBD_FEATURE_LAYERING
 * included, is part of it.
 */

#define RBD_FEATURES_ALL          (0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 *
 * MAX_INT_FORMAT_WIDTH allows 2.5 characters per byte of an int
 * (a byte needs about 2.41 decimal digits) plus one more, presumably
 * for a sign.
 */
#define DEV_NAME_LEN            32
#define MAX_INT_FORMAT_WIDTH    ((5 * sizeof (int)) / 2 + 1)
97
/*
 * block device image metadata (in-memory version)
 *
 * For format 1 images this is filled in from the on-disk header by
 * rbd_header_from_disk() and its allocations are dropped again by
 * rbd_header_free().
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;	/* kmalloc'ed, NUL-terminated */
	u64 features;
	__u8 obj_order;		/* log2 of the object (segment) size */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;	/* snapshot ids and seq */
	char *snap_names;	/* copy of the snapshot name data, or NULL */
	u64 *snap_sizes;	/* image size per snapshot, or NULL */

	u64 obj_version;
};
117
118 /*
119  * An rbd image specification.
120  *
121  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
122  * identify an image.  Each rbd_dev structure includes a pointer to
123  * an rbd_spec structure that encapsulates this identity.
124  *
125  * Each of the id's in an rbd_spec has an associated name.  For a
126  * user-mapped image, the names are supplied and the id's associated
127  * with them are looked up.  For a layered image, a parent image is
128  * defined by the tuple, and the names are looked up.
129  *
130  * An rbd_dev structure contains a parent_spec pointer which is
131  * non-null if the image it represents is a child in a layered
132  * image.  This pointer will refer to the rbd_spec structure used
133  * by the parent rbd_dev for its own identity (i.e., the structure
134  * is shared between the parent and child).
135  *
136  * Since these structures are populated once, during the discovery
137  * phase of image construction, they are effectively immutable so
138  * we make no effort to synchronize access to them.
139  *
140  * Note that code herein does not assume the image name is known (it
141  * could be a null pointer).
142  */
/* Identity of one image: (pool, image, snapshot) ids and their names */
struct rbd_spec {
	u64		pool_id;
	char		*pool_name;

	char		*image_id;
	char		*image_name;	/* may be a null pointer */

	u64		snap_id;	/* CEPH_NOSNAP when the head is mapped */
	char		*snap_name;

	struct kref	kref;		/* reference count for this spec */
};
155
/*
 * an instance of the client.  multiple devices may share an rbd client.
 *
 * Sharing is by reference count: rbd_client_find() takes a reference
 * on a matching client and rbd_put_client() drops one.
 */
struct rbd_client {
	struct ceph_client	*client;	/* from ceph_create_client() */
	struct kref		kref;		/* freed by rbd_client_release() */
	struct list_head	node;		/* entry in rbd_client_list */
};
164
/* Callback type stored in an image request's "callback" field */
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define BAD_WHICH       U32_MAX         /* Good which or bad which, which? */

/* Callback type stored in an object request's "callback" field */
struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/*
 * Kind of data attached to an object request; it is recorded in the
 * request's "type" field, next to the bio_list/pages union.
 */
enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};
176
/*
 * A request against a single named object.  It can be linked into an
 * image request (img_request/links/which) and is reference counted.
 */
struct rbd_obj_request {
	const char		*object_name;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */

	struct rbd_img_request	*img_request;
	struct list_head	links;		/* img_request->obj_requests */
	u32			which;		/* posn image request list */

	enum obj_request_type	type;
	/* presumably "type" tells which union member is in use -- confirm */
	union {
		struct bio	*bio_list;
		struct {
			struct page	**pages;
			u32		page_count;
		};
	};

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	u64			version;
	int			result;
	atomic_t		done;

	rbd_obj_callback_t	callback;
	struct completion	completion;

	struct kref		kref;
};
207
/*
 * A request against a byte range of an image.  It carries a list of
 * object requests (obj_requests, obj_request_count) and is reference
 * counted.
 */
struct rbd_img_request {
	struct request		*rq;
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	bool			write_request;	/* false for read */
	union {
		struct ceph_snap_context *snapc;	/* for writes */
		u64		snap_id;		/* for reads */
	};
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};
227
/*
 * Iterators over the object requests linked on an image request.
 *
 * for_each_obj_request() walks the whole list from its head and
 * for_each_obj_request_from() continues from the current value of oreq.
 * for_each_obj_request_safe() walks the list in REVERSE order (it is
 * built on list_for_each_entry_safe_reverse()), using n as the
 * look-ahead cursor of the "safe" list iterator.
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
234
/* One snapshot of an image; entries live on rbd_device->snaps */
struct rbd_snap {
	struct	device		dev;
	const char		*name;
	u64			size;		/* copied to mapping.size when mapped */
	struct list_head	node;		/* entry in rbd_device->snaps */
	u64			id;		/* snapshot id */
	u64			features;	/* copied to mapping.features when mapped */
};
243
/*
 * Properties of what is actually mapped: filled in from the image
 * header for the head, or from the chosen snapshot otherwise (see
 * rbd_dev_set_mapping()).
 */
struct rbd_mapping {
	u64                     size;
	u64                     features;
	bool                    read_only;	/* forced to true for snapshots */
};
249
/*
 * a single device
 *
 * One of these exists per mapped image; the gendisk's private_data
 * points back at it (see rbd_open() and rbd_release()).
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;

	char			*header_name;

	struct ceph_file_layout	layout;

	struct ceph_osd_event	*watch_event;
	struct rbd_obj_request	*watch_request;

	struct rbd_spec		*parent_spec;	/* non-null for a layered child */
	u64			parent_overlap;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};
294
/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 *
 * The enumerators are bit numbers for set_bit()/test_bit(), not masks.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};
306
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

/* presumably rbd_dev_list_lock guards rbd_dev_list -- confirm in users */
static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* rbd_client_list is only walked or modified under rbd_client_list_lock */
static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);
314
315 static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
316 static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
317
318 static void rbd_dev_release(struct device *dev);
319 static void rbd_remove_snap_dev(struct rbd_snap *snap);
320
321 static ssize_t rbd_add(struct bus_type *bus, const char *buf,
322                        size_t count);
323 static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
324                           size_t count);
325
/*
 * Bus-level attributes "add" and "remove": both are writable by the
 * owner only (S_IWUSR), have no show method, and are handled by
 * rbd_add() and rbd_remove() respectively.
 */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

/* The "rbd" bus, carrying the attributes above */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
336
/*
 * Release method for rbd_root_dev.  The device is a static object, so
 * there is nothing to free; the function is intentionally empty.
 */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Statically allocated root device named "rbd" */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
345
346 static __printf(2, 3)
347 void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
348 {
349         struct va_format vaf;
350         va_list args;
351
352         va_start(args, fmt);
353         vaf.fmt = fmt;
354         vaf.va = &args;
355
356         if (!rbd_dev)
357                 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
358         else if (rbd_dev->disk)
359                 printk(KERN_WARNING "%s: %s: %pV\n",
360                         RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
361         else if (rbd_dev->spec && rbd_dev->spec->image_name)
362                 printk(KERN_WARNING "%s: image %s: %pV\n",
363                         RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
364         else if (rbd_dev->spec && rbd_dev->spec->image_id)
365                 printk(KERN_WARNING "%s: id %s: %pV\n",
366                         RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
367         else    /* punt */
368                 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
369                         RBD_DRV_NAME, rbd_dev, &vaf);
370         va_end(args);
371 }
372
/*
 * rbd_assert(expr) - check a condition that must always hold.
 *
 * With RBD_DEBUG defined, a false expr logs the function name, the line
 * number and the text of the expression at KERN_ERR level and then
 * calls BUG().  Without RBD_DEBUG the macro expands to a no-op and
 * expr is not evaluated.
 *
 * The checking form is wrapped in do { } while (0) so that it expands
 * to exactly one statement: it can be the body of an unbraced if/else
 * without capturing the else, and, like the no-op form, it requires
 * the trailing semicolon at the call site.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
385
386 static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
387 static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
388
389 static int rbd_open(struct block_device *bdev, fmode_t mode)
390 {
391         struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
392         bool removing = false;
393
394         if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
395                 return -EROFS;
396
397         spin_lock_irq(&rbd_dev->lock);
398         if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
399                 removing = true;
400         else
401                 rbd_dev->open_count++;
402         spin_unlock_irq(&rbd_dev->lock);
403         if (removing)
404                 return -ENOENT;
405
406         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
407         (void) get_device(&rbd_dev->dev);
408         set_device_ro(bdev, rbd_dev->mapping.read_only);
409         mutex_unlock(&ctl_mutex);
410
411         return 0;
412 }
413
414 static int rbd_release(struct gendisk *disk, fmode_t mode)
415 {
416         struct rbd_device *rbd_dev = disk->private_data;
417         unsigned long open_count_before;
418
419         spin_lock_irq(&rbd_dev->lock);
420         open_count_before = rbd_dev->open_count--;
421         spin_unlock_irq(&rbd_dev->lock);
422         rbd_assert(open_count_before > 0);
423
424         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
425         put_device(&rbd_dev->dev);
426         mutex_unlock(&ctl_mutex);
427
428         return 0;
429 }
430
/* Block device methods; only open and release are provided */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
436
437 /*
438  * Initialize an rbd client instance.
439  * We own *ceph_opts.
440  */
441 static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
442 {
443         struct rbd_client *rbdc;
444         int ret = -ENOMEM;
445
446         dout("%s:\n", __func__);
447         rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
448         if (!rbdc)
449                 goto out_opt;
450
451         kref_init(&rbdc->kref);
452         INIT_LIST_HEAD(&rbdc->node);
453
454         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
455
456         rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
457         if (IS_ERR(rbdc->client))
458                 goto out_mutex;
459         ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
460
461         ret = ceph_open_session(rbdc->client);
462         if (ret < 0)
463                 goto out_err;
464
465         spin_lock(&rbd_client_list_lock);
466         list_add_tail(&rbdc->node, &rbd_client_list);
467         spin_unlock(&rbd_client_list_lock);
468
469         mutex_unlock(&ctl_mutex);
470         dout("%s: rbdc %p\n", __func__, rbdc);
471
472         return rbdc;
473
474 out_err:
475         ceph_destroy_client(rbdc->client);
476 out_mutex:
477         mutex_unlock(&ctl_mutex);
478         kfree(rbdc);
479 out_opt:
480         if (ceph_opts)
481                 ceph_destroy_options(ceph_opts);
482         dout("%s: error %d\n", __func__, ret);
483
484         return ERR_PTR(ret);
485 }
486
487 /*
488  * Find a ceph client with specific addr and configuration.  If
489  * found, bump its reference count.
490  */
491 static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
492 {
493         struct rbd_client *client_node;
494         bool found = false;
495
496         if (ceph_opts->flags & CEPH_OPT_NOSHARE)
497                 return NULL;
498
499         spin_lock(&rbd_client_list_lock);
500         list_for_each_entry(client_node, &rbd_client_list, node) {
501                 if (!ceph_compare_options(ceph_opts, client_node->client)) {
502                         kref_get(&client_node->kref);
503                         found = true;
504                         break;
505                 }
506         }
507         spin_unlock(&rbd_client_list_lock);
508
509         return found ? client_node : NULL;
510 }
511
/*
 * mount options
 *
 * The enumerators are grouped by argument kind, with the Opt_last_*
 * values acting as group boundaries; parse_rbd_opts_token() uses
 * those boundaries to classify a token.  The int and string groups
 * are currently empty.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

/* Option spellings accepted by match_token(), ended by {-1, NULL} */
static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};

/* Parsed rbd-specific options; filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool	read_only;
};

#define RBD_READ_ONLY_DEFAULT	false
542
543 static int parse_rbd_opts_token(char *c, void *private)
544 {
545         struct rbd_options *rbd_opts = private;
546         substring_t argstr[MAX_OPT_ARGS];
547         int token, intval, ret;
548
549         token = match_token(c, rbd_opts_tokens, argstr);
550         if (token < 0)
551                 return -EINVAL;
552
553         if (token < Opt_last_int) {
554                 ret = match_int(&argstr[0], &intval);
555                 if (ret < 0) {
556                         pr_err("bad mount option arg (not int) "
557                                "at '%s'\n", c);
558                         return ret;
559                 }
560                 dout("got int token %d val %d\n", token, intval);
561         } else if (token > Opt_last_int && token < Opt_last_string) {
562                 dout("got string token %d val %s\n", token,
563                      argstr[0].from);
564         } else if (token > Opt_last_string && token < Opt_last_bool) {
565                 dout("got Boolean token %d\n", token);
566         } else {
567                 dout("got token %d\n", token);
568         }
569
570         switch (token) {
571         case Opt_read_only:
572                 rbd_opts->read_only = true;
573                 break;
574         case Opt_read_write:
575                 rbd_opts->read_only = false;
576                 break;
577         default:
578                 rbd_assert(false);
579                 break;
580         }
581         return 0;
582 }
583
/*
 * Return an rbd client for the given options: an existing one with a
 * matching configuration if rbd_client_find() has one (with a new
 * reference taken), otherwise a newly created one.  ceph_opts is
 * consumed either way.  The result of rbd_client_create() is passed
 * through unchanged, so it may be an ERR_PTR().
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* Using an existing client, so these options are not needed */
	ceph_destroy_options(ceph_opts);

	return rbdc;
}
600
601 /*
602  * Destroy ceph client
603  *
604  * Caller must hold rbd_client_list_lock.
605  */
606 static void rbd_client_release(struct kref *kref)
607 {
608         struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
609
610         dout("%s: rbdc %p\n", __func__, rbdc);
611         spin_lock(&rbd_client_list_lock);
612         list_del(&rbdc->node);
613         spin_unlock(&rbd_client_list_lock);
614
615         ceph_destroy_client(rbdc->client);
616         kfree(rbdc);
617 }
618
619 /*
620  * Drop reference to ceph client node. If it's not referenced anymore, release
621  * it.
622  */
623 static void rbd_put_client(struct rbd_client *rbdc)
624 {
625         if (rbdc)
626                 kref_put(&rbdc->kref, rbd_client_release);
627 }
628
629 static bool rbd_image_format_valid(u32 image_format)
630 {
631         return image_format == 1 || image_format == 2;
632 }
633
634 static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
635 {
636         size_t size;
637         u32 snap_count;
638
639         /* The header has to start with the magic rbd header text */
640         if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
641                 return false;
642
643         /* The bio layer requires at least sector-sized I/O */
644
645         if (ondisk->options.order < SECTOR_SHIFT)
646                 return false;
647
648         /* If we use u64 in a few spots we may be able to loosen this */
649
650         if (ondisk->options.order > 8 * sizeof (int) - 1)
651                 return false;
652
653         /*
654          * The size of a snapshot header has to fit in a size_t, and
655          * that limits the number of snapshots.
656          */
657         snap_count = le32_to_cpu(ondisk->snap_count);
658         size = SIZE_MAX - sizeof (struct ceph_snap_context);
659         if (snap_count > size / sizeof (__le64))
660                 return false;
661
662         /*
663          * Not only that, but the size of the entire the snapshot
664          * header must also be representable in a size_t.
665          */
666         size -= snap_count * sizeof (__le64);
667         if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
668                 return false;
669
670         return true;
671 }
672
673 /*
674  * Create a new header structure, translate header format from the on-disk
675  * header.
676  */
677 static int rbd_header_from_disk(struct rbd_image_header *header,
678                                  struct rbd_image_header_ondisk *ondisk)
679 {
680         u32 snap_count;
681         size_t len;
682         size_t size;
683         u32 i;
684
685         memset(header, 0, sizeof (*header));
686
687         snap_count = le32_to_cpu(ondisk->snap_count);
688
689         len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
690         header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
691         if (!header->object_prefix)
692                 return -ENOMEM;
693         memcpy(header->object_prefix, ondisk->object_prefix, len);
694         header->object_prefix[len] = '\0';
695
696         if (snap_count) {
697                 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
698
699                 /* Save a copy of the snapshot names */
700
701                 if (snap_names_len > (u64) SIZE_MAX)
702                         return -EIO;
703                 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
704                 if (!header->snap_names)
705                         goto out_err;
706                 /*
707                  * Note that rbd_dev_v1_header_read() guarantees
708                  * the ondisk buffer we're working with has
709                  * snap_names_len bytes beyond the end of the
710                  * snapshot id array, this memcpy() is safe.
711                  */
712                 memcpy(header->snap_names, &ondisk->snaps[snap_count],
713                         snap_names_len);
714
715                 /* Record each snapshot's size */
716
717                 size = snap_count * sizeof (*header->snap_sizes);
718                 header->snap_sizes = kmalloc(size, GFP_KERNEL);
719                 if (!header->snap_sizes)
720                         goto out_err;
721                 for (i = 0; i < snap_count; i++)
722                         header->snap_sizes[i] =
723                                 le64_to_cpu(ondisk->snaps[i].image_size);
724         } else {
725                 WARN_ON(ondisk->snap_names_len);
726                 header->snap_names = NULL;
727                 header->snap_sizes = NULL;
728         }
729
730         header->features = 0;   /* No features support in v1 images */
731         header->obj_order = ondisk->options.order;
732         header->crypt_type = ondisk->options.crypt_type;
733         header->comp_type = ondisk->options.comp_type;
734
735         /* Allocate and fill in the snapshot context */
736
737         header->image_size = le64_to_cpu(ondisk->image_size);
738         size = sizeof (struct ceph_snap_context);
739         size += snap_count * sizeof (header->snapc->snaps[0]);
740         header->snapc = kzalloc(size, GFP_KERNEL);
741         if (!header->snapc)
742                 goto out_err;
743
744         atomic_set(&header->snapc->nref, 1);
745         header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
746         header->snapc->num_snaps = snap_count;
747         for (i = 0; i < snap_count; i++)
748                 header->snapc->snaps[i] =
749                         le64_to_cpu(ondisk->snaps[i].id);
750
751         return 0;
752
753 out_err:
754         kfree(header->snap_sizes);
755         header->snap_sizes = NULL;
756         kfree(header->snap_names);
757         header->snap_names = NULL;
758         kfree(header->object_prefix);
759         header->object_prefix = NULL;
760
761         return -ENOMEM;
762 }
763
764 static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
765 {
766         struct rbd_snap *snap;
767
768         if (snap_id == CEPH_NOSNAP)
769                 return RBD_SNAP_HEAD_NAME;
770
771         list_for_each_entry(snap, &rbd_dev->snaps, node)
772                 if (snap_id == snap->id)
773                         return snap->name;
774
775         return NULL;
776 }
777
778 static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
779 {
780
781         struct rbd_snap *snap;
782
783         list_for_each_entry(snap, &rbd_dev->snaps, node) {
784                 if (!strcmp(snap_name, snap->name)) {
785                         rbd_dev->spec->snap_id = snap->id;
786                         rbd_dev->mapping.size = snap->size;
787                         rbd_dev->mapping.features = snap->features;
788
789                         return 0;
790                 }
791         }
792
793         return -ENOENT;
794 }
795
796 static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
797 {
798         int ret;
799
800         if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
801                     sizeof (RBD_SNAP_HEAD_NAME))) {
802                 rbd_dev->spec->snap_id = CEPH_NOSNAP;
803                 rbd_dev->mapping.size = rbd_dev->header.image_size;
804                 rbd_dev->mapping.features = rbd_dev->header.features;
805                 ret = 0;
806         } else {
807                 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
808                 if (ret < 0)
809                         goto done;
810                 rbd_dev->mapping.read_only = true;
811         }
812         set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
813
814 done:
815         return ret;
816 }
817
818 static void rbd_header_free(struct rbd_image_header *header)
819 {
820         kfree(header->object_prefix);
821         header->object_prefix = NULL;
822         kfree(header->snap_sizes);
823         header->snap_sizes = NULL;
824         kfree(header->snap_names);
825         header->snap_names = NULL;
826         ceph_put_snap_context(header->snapc);
827         header->snapc = NULL;
828 }
829
830 static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
831 {
832         char *name;
833         u64 segment;
834         int ret;
835
836         name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
837         if (!name)
838                 return NULL;
839         segment = offset >> rbd_dev->header.obj_order;
840         ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
841                         rbd_dev->header.object_prefix, segment);
842         if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
843                 pr_err("error formatting segment name for #%llu (%d)\n",
844                         segment, ret);
845                 kfree(name);
846                 name = NULL;
847         }
848
849         return name;
850 }
851
852 static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
853 {
854         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
855
856         return offset & (segment_size - 1);
857 }
858
859 static u64 rbd_segment_length(struct rbd_device *rbd_dev,
860                                 u64 offset, u64 length)
861 {
862         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
863
864         offset &= segment_size - 1;
865
866         rbd_assert(length <= U64_MAX - offset);
867         if (offset + length > segment_size)
868                 length = segment_size - offset;
869
870         return length;
871 }
872
873 /*
874  * returns the size of an object in the image
875  */
876 static u64 rbd_obj_bytes(struct rbd_image_header *header)
877 {
878         return 1 << header->obj_order;
879 }
880
881 /*
882  * bio helpers
883  */
884
885 static void bio_chain_put(struct bio *chain)
886 {
887         struct bio *tmp;
888
889         while (chain) {
890                 tmp = chain;
891                 chain = chain->bi_next;
892                 bio_put(tmp);
893         }
894 }
895
896 /*
897  * zeros a bio chain, starting at specific offset
898  */
899 static void zero_bio_chain(struct bio *chain, int start_ofs)
900 {
901         struct bio_vec *bv;
902         unsigned long flags;
903         void *buf;
904         int i;
905         int pos = 0;
906
907         while (chain) {
908                 bio_for_each_segment(bv, chain, i) {
909                         if (pos + bv->bv_len > start_ofs) {
910                                 int remainder = max(start_ofs - pos, 0);
911                                 buf = bvec_kmap_irq(bv, &flags);
912                                 memset(buf + remainder, 0,
913                                        bv->bv_len - remainder);
914                                 bvec_kunmap_irq(buf, &flags);
915                         }
916                         pos += bv->bv_len;
917                 }
918
919                 chain = chain->bi_next;
920         }
921 }
922
923 /*
924  * Clone a portion of a bio, starting at the given byte offset
925  * and continuing for the number of bytes indicated.
926  */
927 static struct bio *bio_clone_range(struct bio *bio_src,
928                                         unsigned int offset,
929                                         unsigned int len,
930                                         gfp_t gfpmask)
931 {
932         struct bio_vec *bv;
933         unsigned int resid;
934         unsigned short idx;
935         unsigned int voff;
936         unsigned short end_idx;
937         unsigned short vcnt;
938         struct bio *bio;
939
940         /* Handle the easy case for the caller */
941
942         if (!offset && len == bio_src->bi_size)
943                 return bio_clone(bio_src, gfpmask);
944
945         if (WARN_ON_ONCE(!len))
946                 return NULL;
947         if (WARN_ON_ONCE(len > bio_src->bi_size))
948                 return NULL;
949         if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
950                 return NULL;
951
952         /* Find first affected segment... */
953
954         resid = offset;
955         __bio_for_each_segment(bv, bio_src, idx, 0) {
956                 if (resid < bv->bv_len)
957                         break;
958                 resid -= bv->bv_len;
959         }
960         voff = resid;
961
962         /* ...and the last affected segment */
963
964         resid += len;
965         __bio_for_each_segment(bv, bio_src, end_idx, idx) {
966                 if (resid <= bv->bv_len)
967                         break;
968                 resid -= bv->bv_len;
969         }
970         vcnt = end_idx - idx + 1;
971
972         /* Build the clone */
973
974         bio = bio_alloc(gfpmask, (unsigned int) vcnt);
975         if (!bio)
976                 return NULL;    /* ENOMEM */
977
978         bio->bi_bdev = bio_src->bi_bdev;
979         bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
980         bio->bi_rw = bio_src->bi_rw;
981         bio->bi_flags |= 1 << BIO_CLONED;
982
983         /*
984          * Copy over our part of the bio_vec, then update the first
985          * and last (or only) entries.
986          */
987         memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
988                         vcnt * sizeof (struct bio_vec));
989         bio->bi_io_vec[0].bv_offset += voff;
990         if (vcnt > 1) {
991                 bio->bi_io_vec[0].bv_len -= voff;
992                 bio->bi_io_vec[vcnt - 1].bv_len = resid;
993         } else {
994                 bio->bi_io_vec[0].bv_len = len;
995         }
996
997         bio->bi_vcnt = vcnt;
998         bio->bi_size = len;
999         bio->bi_idx = 0;
1000
1001         return bio;
1002 }
1003
1004 /*
1005  * Clone a portion of a bio chain, starting at the given byte offset
1006  * into the first bio in the source chain and continuing for the
1007  * number of bytes indicated.  The result is another bio chain of
1008  * exactly the given length, or a null pointer on error.
1009  *
1010  * The bio_src and offset parameters are both in-out.  On entry they
1011  * refer to the first source bio and the offset into that bio where
1012  * the start of data to be cloned is located.
1013  *
1014  * On return, bio_src is updated to refer to the bio in the source
1015  * chain that contains first un-cloned byte, and *offset will
1016  * contain the offset of that byte within that bio.
1017  */
1018 static struct bio *bio_chain_clone_range(struct bio **bio_src,
1019                                         unsigned int *offset,
1020                                         unsigned int len,
1021                                         gfp_t gfpmask)
1022 {
1023         struct bio *bi = *bio_src;
1024         unsigned int off = *offset;
1025         struct bio *chain = NULL;
1026         struct bio **end;
1027
1028         /* Build up a chain of clone bios up to the limit */
1029
1030         if (!bi || off >= bi->bi_size || !len)
1031                 return NULL;            /* Nothing to clone */
1032
1033         end = &chain;
1034         while (len) {
1035                 unsigned int bi_size;
1036                 struct bio *bio;
1037
1038                 if (!bi) {
1039                         rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1040                         goto out_err;   /* EINVAL; ran out of bio's */
1041                 }
1042                 bi_size = min_t(unsigned int, bi->bi_size - off, len);
1043                 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1044                 if (!bio)
1045                         goto out_err;   /* ENOMEM */
1046
1047                 *end = bio;
1048                 end = &bio->bi_next;
1049
1050                 off += bi_size;
1051                 if (off == bi->bi_size) {
1052                         bi = bi->bi_next;
1053                         off = 0;
1054                 }
1055                 len -= bi_size;
1056         }
1057         *bio_src = bi;
1058         *offset = off;
1059
1060         return chain;
1061 out_err:
1062         bio_chain_put(chain);
1063
1064         return NULL;
1065 }
1066
1067 static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1068 {
1069         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1070                 atomic_read(&obj_request->kref.refcount));
1071         kref_get(&obj_request->kref);
1072 }
1073
1074 static void rbd_obj_request_destroy(struct kref *kref);
1075 static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1076 {
1077         rbd_assert(obj_request != NULL);
1078         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1079                 atomic_read(&obj_request->kref.refcount));
1080         kref_put(&obj_request->kref, rbd_obj_request_destroy);
1081 }
1082
1083 static void rbd_img_request_get(struct rbd_img_request *img_request)
1084 {
1085         dout("%s: img %p (was %d)\n", __func__, img_request,
1086                 atomic_read(&img_request->kref.refcount));
1087         kref_get(&img_request->kref);
1088 }
1089
1090 static void rbd_img_request_destroy(struct kref *kref);
1091 static void rbd_img_request_put(struct rbd_img_request *img_request)
1092 {
1093         rbd_assert(img_request != NULL);
1094         dout("%s: img %p (was %d)\n", __func__, img_request,
1095                 atomic_read(&img_request->kref.refcount));
1096         kref_put(&img_request->kref, rbd_img_request_destroy);
1097 }
1098
1099 static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1100                                         struct rbd_obj_request *obj_request)
1101 {
1102         rbd_assert(obj_request->img_request == NULL);
1103
1104         rbd_obj_request_get(obj_request);
1105         obj_request->img_request = img_request;
1106         obj_request->which = img_request->obj_request_count;
1107         rbd_assert(obj_request->which != BAD_WHICH);
1108         img_request->obj_request_count++;
1109         list_add_tail(&obj_request->links, &img_request->obj_requests);
1110         dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1111                 obj_request->which);
1112 }
1113
1114 static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1115                                         struct rbd_obj_request *obj_request)
1116 {
1117         rbd_assert(obj_request->which != BAD_WHICH);
1118
1119         dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1120                 obj_request->which);
1121         list_del(&obj_request->links);
1122         rbd_assert(img_request->obj_request_count > 0);
1123         img_request->obj_request_count--;
1124         rbd_assert(obj_request->which == img_request->obj_request_count);
1125         obj_request->which = BAD_WHICH;
1126         rbd_assert(obj_request->img_request == img_request);
1127         obj_request->img_request = NULL;
1128         obj_request->callback = NULL;
1129         rbd_obj_request_put(obj_request);
1130 }
1131
1132 static bool obj_request_type_valid(enum obj_request_type type)
1133 {
1134         switch (type) {
1135         case OBJ_REQUEST_NODATA:
1136         case OBJ_REQUEST_BIO:
1137         case OBJ_REQUEST_PAGES:
1138                 return true;
1139         default:
1140                 return false;
1141         }
1142 }
1143
1144 static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
1145 {
1146         struct ceph_osd_req_op *op;
1147         va_list args;
1148         size_t size;
1149
1150         op = kzalloc(sizeof (*op), GFP_NOIO);
1151         if (!op)
1152                 return NULL;
1153         op->op = opcode;
1154         va_start(args, opcode);
1155         switch (opcode) {
1156         case CEPH_OSD_OP_READ:
1157         case CEPH_OSD_OP_WRITE:
1158                 /* rbd_osd_req_op_create(READ, offset, length) */
1159                 /* rbd_osd_req_op_create(WRITE, offset, length) */
1160                 op->extent.offset = va_arg(args, u64);
1161                 op->extent.length = va_arg(args, u64);
1162                 if (opcode == CEPH_OSD_OP_WRITE)
1163                         op->payload_len = op->extent.length;
1164                 break;
1165         case CEPH_OSD_OP_STAT:
1166                 break;
1167         case CEPH_OSD_OP_CALL:
1168                 /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
1169                 op->cls.class_name = va_arg(args, char *);
1170                 size = strlen(op->cls.class_name);
1171                 rbd_assert(size <= (size_t) U8_MAX);
1172                 op->cls.class_len = size;
1173                 op->payload_len = size;
1174
1175                 op->cls.method_name = va_arg(args, char *);
1176                 size = strlen(op->cls.method_name);
1177                 rbd_assert(size <= (size_t) U8_MAX);
1178                 op->cls.method_len = size;
1179                 op->payload_len += size;
1180
1181                 op->cls.argc = 0;
1182                 op->cls.indata = va_arg(args, void *);
1183                 size = va_arg(args, size_t);
1184                 rbd_assert(size <= (size_t) U32_MAX);
1185                 op->cls.indata_len = (u32) size;
1186                 op->payload_len += size;
1187                 break;
1188         case CEPH_OSD_OP_NOTIFY_ACK:
1189         case CEPH_OSD_OP_WATCH:
1190                 /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
1191                 /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
1192                 op->watch.cookie = va_arg(args, u64);
1193                 op->watch.ver = va_arg(args, u64);
1194                 op->watch.ver = cpu_to_le64(op->watch.ver);
1195                 if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
1196                         op->watch.flag = (u8) 1;
1197                 break;
1198         default:
1199                 rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
1200                 kfree(op);
1201                 op = NULL;
1202                 break;
1203         }
1204         va_end(args);
1205
1206         return op;
1207 }
1208
/* Free an op allocated by rbd_osd_req_op_create() */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
1213
1214 static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1215                                 struct rbd_obj_request *obj_request)
1216 {
1217         dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
1218
1219         return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1220 }
1221
1222 static void rbd_img_request_complete(struct rbd_img_request *img_request)
1223 {
1224         dout("%s: img %p\n", __func__, img_request);
1225         if (img_request->callback)
1226                 img_request->callback(img_request);
1227         else
1228                 rbd_img_request_put(img_request);
1229 }
1230
1231 /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1232
1233 static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1234 {
1235         dout("%s: obj %p\n", __func__, obj_request);
1236
1237         return wait_for_completion_interruptible(&obj_request->completion);
1238 }
1239
1240 static void obj_request_done_init(struct rbd_obj_request *obj_request)
1241 {
1242         atomic_set(&obj_request->done, 0);
1243         smp_wmb();
1244 }
1245
1246 static void obj_request_done_set(struct rbd_obj_request *obj_request)
1247 {
1248         int done;
1249
1250         done = atomic_inc_return(&obj_request->done);
1251         if (done > 1) {
1252                 struct rbd_img_request *img_request = obj_request->img_request;
1253                 struct rbd_device *rbd_dev;
1254
1255                 rbd_dev = img_request ? img_request->rbd_dev : NULL;
1256                 rbd_warn(rbd_dev, "obj_request %p was already done\n",
1257                         obj_request);
1258         }
1259 }
1260
1261 static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1262 {
1263         smp_mb();
1264         return atomic_read(&obj_request->done) != 0;
1265 }
1266
1267 static void
1268 rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1269 {
1270         dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1271                 obj_request, obj_request->img_request, obj_request->result,
1272                 obj_request->xferred, obj_request->length);
1273         /*
1274          * ENOENT means a hole in the image.  We zero-fill the
1275          * entire length of the request.  A short read also implies
1276          * zero-fill to the end of the request.  Either way we
1277          * update the xferred count to indicate the whole request
1278          * was satisfied.
1279          */
1280         BUG_ON(obj_request->type != OBJ_REQUEST_BIO);
1281         if (obj_request->result == -ENOENT) {
1282                 zero_bio_chain(obj_request->bio_list, 0);
1283                 obj_request->result = 0;
1284                 obj_request->xferred = obj_request->length;
1285         } else if (obj_request->xferred < obj_request->length &&
1286                         !obj_request->result) {
1287                 zero_bio_chain(obj_request->bio_list, obj_request->xferred);
1288                 obj_request->xferred = obj_request->length;
1289         }
1290         obj_request_done_set(obj_request);
1291 }
1292
1293 static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1294 {
1295         dout("%s: obj %p cb %p\n", __func__, obj_request,
1296                 obj_request->callback);
1297         if (obj_request->callback)
1298                 obj_request->callback(obj_request);
1299         else
1300                 complete_all(&obj_request->completion);
1301 }
1302
/* Completion handler for ops needing no processing; just mark done */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1308
1309 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1310 {
1311         dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
1312                 obj_request->result, obj_request->xferred, obj_request->length);
1313         if (obj_request->img_request)
1314                 rbd_img_obj_request_read_callback(obj_request);
1315         else
1316                 obj_request_done_set(obj_request);
1317 }
1318
1319 static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1320 {
1321         dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1322                 obj_request->result, obj_request->length);
1323         /*
1324          * There is no such thing as a successful short write.
1325          * Our xferred value is the number of bytes transferred
1326          * back.  Set it to our originally-requested length.
1327          */
1328         obj_request->xferred = obj_request->length;
1329         obj_request_done_set(obj_request);
1330 }
1331
/*
 * Completion handler for a stat op.  A plain stat needs nothing
 * beyond being marked done.  More will be done here when the stat
 * is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1341
1342 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1343                                 struct ceph_msg *msg)
1344 {
1345         struct rbd_obj_request *obj_request = osd_req->r_priv;
1346         u16 opcode;
1347
1348         dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1349         rbd_assert(osd_req == obj_request->osd_req);
1350         rbd_assert(!!obj_request->img_request ^
1351                                 (obj_request->which == BAD_WHICH));
1352
1353         if (osd_req->r_result < 0)
1354                 obj_request->result = osd_req->r_result;
1355         obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1356
1357         WARN_ON(osd_req->r_num_ops != 1);       /* For now */
1358
1359         /*
1360          * We support a 64-bit length, but ultimately it has to be
1361          * passed to blk_end_request(), which takes an unsigned int.
1362          */
1363         obj_request->xferred = osd_req->r_reply_op_len[0];
1364         rbd_assert(obj_request->xferred < (u64) UINT_MAX);
1365         opcode = osd_req->r_request_ops[0].op;
1366         switch (opcode) {
1367         case CEPH_OSD_OP_READ:
1368                 rbd_osd_read_callback(obj_request);
1369                 break;
1370         case CEPH_OSD_OP_WRITE:
1371                 rbd_osd_write_callback(obj_request);
1372                 break;
1373         case CEPH_OSD_OP_STAT:
1374                 rbd_osd_stat_callback(obj_request);
1375                 break;
1376         case CEPH_OSD_OP_CALL:
1377         case CEPH_OSD_OP_NOTIFY_ACK:
1378         case CEPH_OSD_OP_WATCH:
1379                 rbd_osd_trivial_callback(obj_request);
1380                 break;
1381         default:
1382                 rbd_warn(NULL, "%s: unsupported op %hu\n",
1383                         obj_request->object_name, (unsigned short) opcode);
1384                 break;
1385         }
1386
1387         if (obj_request_done_test(obj_request))
1388                 rbd_obj_request_complete(obj_request);
1389 }
1390
1391 static struct ceph_osd_request *rbd_osd_req_create(
1392                                         struct rbd_device *rbd_dev,
1393                                         bool write_request,
1394                                         struct rbd_obj_request *obj_request,
1395                                         struct ceph_osd_req_op *op)
1396 {
1397         struct rbd_img_request *img_request = obj_request->img_request;
1398         struct ceph_snap_context *snapc = NULL;
1399         struct ceph_osd_client *osdc;
1400         struct ceph_osd_request *osd_req;
1401         struct ceph_osd_data *osd_data;
1402         struct timespec now;
1403         struct timespec *mtime;
1404         u64 snap_id = CEPH_NOSNAP;
1405         u64 offset = obj_request->offset;
1406         u64 length = obj_request->length;
1407
1408         if (img_request) {
1409                 rbd_assert(img_request->write_request == write_request);
1410                 if (img_request->write_request)
1411                         snapc = img_request->snapc;
1412                 else
1413                         snap_id = img_request->snap_id;
1414         }
1415
1416         /* Allocate and initialize the request, for the single op */
1417
1418         osdc = &rbd_dev->rbd_client->client->osdc;
1419         osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1420         if (!osd_req)
1421                 return NULL;    /* ENOMEM */
1422         osd_data = write_request ? &osd_req->r_data_out : &osd_req->r_data_in;
1423
1424         rbd_assert(obj_request_type_valid(obj_request->type));
1425         switch (obj_request->type) {
1426         case OBJ_REQUEST_NODATA:
1427                 break;          /* Nothing to do */
1428         case OBJ_REQUEST_BIO:
1429                 rbd_assert(obj_request->bio_list != NULL);
1430                 osd_data->type = CEPH_OSD_DATA_TYPE_BIO;
1431                 osd_data->bio = obj_request->bio_list;
1432                 break;
1433         case OBJ_REQUEST_PAGES:
1434                 osd_data->type = CEPH_OSD_DATA_TYPE_PAGES;
1435                 osd_data->pages = obj_request->pages;
1436                 osd_data->length = obj_request->length;
1437                 osd_data->alignment = offset & ~PAGE_MASK;
1438                 osd_data->pages_from_pool = false;
1439                 osd_data->own_pages = false;
1440                 break;
1441         }
1442
1443         if (write_request) {
1444                 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1445                 now = CURRENT_TIME;
1446                 mtime = &now;
1447         } else {
1448                 osd_req->r_flags = CEPH_OSD_FLAG_READ;
1449                 mtime = NULL;   /* not needed for reads */
1450                 offset = 0;     /* These are not used... */
1451                 length = 0;     /* ...for osd read requests */
1452         }
1453
1454         osd_req->r_callback = rbd_osd_req_callback;
1455         osd_req->r_priv = obj_request;
1456
1457         osd_req->r_oid_len = strlen(obj_request->object_name);
1458         rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1459         memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1460
1461         osd_req->r_file_layout = rbd_dev->layout;       /* struct */
1462
1463         /* osd_req will get its own reference to snapc (if non-null) */
1464
1465         ceph_osdc_build_request(osd_req, offset, 1, op,
1466                                 snapc, snap_id, mtime);
1467
1468         return osd_req;
1469 }
1470
/* Drop our reference to an osd request */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1475
1476 /* object_name is assumed to be a non-null pointer and NUL-terminated */
1477
1478 static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1479                                                 u64 offset, u64 length,
1480                                                 enum obj_request_type type)
1481 {
1482         struct rbd_obj_request *obj_request;
1483         size_t size;
1484         char *name;
1485
1486         rbd_assert(obj_request_type_valid(type));
1487
1488         size = strlen(object_name) + 1;
1489         obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1490         if (!obj_request)
1491                 return NULL;
1492
1493         name = (char *)(obj_request + 1);
1494         obj_request->object_name = memcpy(name, object_name, size);
1495         obj_request->offset = offset;
1496         obj_request->length = length;
1497         obj_request->which = BAD_WHICH;
1498         obj_request->type = type;
1499         INIT_LIST_HEAD(&obj_request->links);
1500         obj_request_done_init(obj_request);
1501         init_completion(&obj_request->completion);
1502         kref_init(&obj_request->kref);
1503
1504         dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1505                 offset, length, (int)type, obj_request);
1506
1507         return obj_request;
1508 }
1509
1510 static void rbd_obj_request_destroy(struct kref *kref)
1511 {
1512         struct rbd_obj_request *obj_request;
1513
1514         obj_request = container_of(kref, struct rbd_obj_request, kref);
1515
1516         dout("%s: obj %p\n", __func__, obj_request);
1517
1518         rbd_assert(obj_request->img_request == NULL);
1519         rbd_assert(obj_request->which == BAD_WHICH);
1520
1521         if (obj_request->osd_req)
1522                 rbd_osd_req_destroy(obj_request->osd_req);
1523
1524         rbd_assert(obj_request_type_valid(obj_request->type));
1525         switch (obj_request->type) {
1526         case OBJ_REQUEST_NODATA:
1527                 break;          /* Nothing to do */
1528         case OBJ_REQUEST_BIO:
1529                 if (obj_request->bio_list)
1530                         bio_chain_put(obj_request->bio_list);
1531                 break;
1532         case OBJ_REQUEST_PAGES:
1533                 if (obj_request->pages)
1534                         ceph_release_page_vector(obj_request->pages,
1535                                                 obj_request->page_count);
1536                 break;
1537         }
1538
1539         kfree(obj_request);
1540 }
1541
1542 /*
1543  * Caller is responsible for filling in the list of object requests
1544  * that comprises the image request, and the Linux request pointer
1545  * (if there is one).
1546  */
1547 static struct rbd_img_request *rbd_img_request_create(
1548                                         struct rbd_device *rbd_dev,
1549                                         u64 offset, u64 length,
1550                                         bool write_request)
1551 {
1552         struct rbd_img_request *img_request;
1553         struct ceph_snap_context *snapc = NULL;
1554
1555         img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1556         if (!img_request)
1557                 return NULL;
1558
1559         if (write_request) {
1560                 down_read(&rbd_dev->header_rwsem);
1561                 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1562                 up_read(&rbd_dev->header_rwsem);
1563                 if (WARN_ON(!snapc)) {
1564                         kfree(img_request);
1565                         return NULL;    /* Shouldn't happen */
1566                 }
1567         }
1568
1569         img_request->rq = NULL;
1570         img_request->rbd_dev = rbd_dev;
1571         img_request->offset = offset;
1572         img_request->length = length;
1573         img_request->write_request = write_request;
1574         if (write_request)
1575                 img_request->snapc = snapc;
1576         else
1577                 img_request->snap_id = rbd_dev->spec->snap_id;
1578         spin_lock_init(&img_request->completion_lock);
1579         img_request->next_completion = 0;
1580         img_request->callback = NULL;
1581         img_request->obj_request_count = 0;
1582         INIT_LIST_HEAD(&img_request->obj_requests);
1583         kref_init(&img_request->kref);
1584
1585         rbd_img_request_get(img_request);       /* Avoid a warning */
1586         rbd_img_request_put(img_request);       /* TEMPORARY */
1587
1588         dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
1589                 write_request ? "write" : "read", offset, length,
1590                 img_request);
1591
1592         return img_request;
1593 }
1594
1595 static void rbd_img_request_destroy(struct kref *kref)
1596 {
1597         struct rbd_img_request *img_request;
1598         struct rbd_obj_request *obj_request;
1599         struct rbd_obj_request *next_obj_request;
1600
1601         img_request = container_of(kref, struct rbd_img_request, kref);
1602
1603         dout("%s: img %p\n", __func__, img_request);
1604
1605         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1606                 rbd_img_obj_request_del(img_request, obj_request);
1607         rbd_assert(img_request->obj_request_count == 0);
1608
1609         if (img_request->write_request)
1610                 ceph_put_snap_context(img_request->snapc);
1611
1612         kfree(img_request);
1613 }
1614
1615 static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
1616                                         struct bio *bio_list)
1617 {
1618         struct rbd_device *rbd_dev = img_request->rbd_dev;
1619         struct rbd_obj_request *obj_request = NULL;
1620         struct rbd_obj_request *next_obj_request;
1621         unsigned int bio_offset;
1622         u64 image_offset;
1623         u64 resid;
1624         u16 opcode;
1625
1626         dout("%s: img %p bio %p\n", __func__, img_request, bio_list);
1627
1628         opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
1629                                               : CEPH_OSD_OP_READ;
1630         bio_offset = 0;
1631         image_offset = img_request->offset;
1632         rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
1633         resid = img_request->length;
1634         rbd_assert(resid > 0);
1635         while (resid) {
1636                 const char *object_name;
1637                 unsigned int clone_size;
1638                 struct ceph_osd_req_op *op;
1639                 u64 offset;
1640                 u64 length;
1641
1642                 object_name = rbd_segment_name(rbd_dev, image_offset);
1643                 if (!object_name)
1644                         goto out_unwind;
1645                 offset = rbd_segment_offset(rbd_dev, image_offset);
1646                 length = rbd_segment_length(rbd_dev, image_offset, resid);
1647                 obj_request = rbd_obj_request_create(object_name,
1648                                                 offset, length,
1649                                                 OBJ_REQUEST_BIO);
1650                 kfree(object_name);     /* object request has its own copy */
1651                 if (!obj_request)
1652                         goto out_unwind;
1653
1654                 rbd_assert(length <= (u64) UINT_MAX);
1655                 clone_size = (unsigned int) length;
1656                 obj_request->bio_list = bio_chain_clone_range(&bio_list,
1657                                                 &bio_offset, clone_size,
1658                                                 GFP_ATOMIC);
1659                 if (!obj_request->bio_list)
1660                         goto out_partial;
1661
1662                 /*
1663                  * Build up the op to use in building the osd
1664                  * request.  Note that the contents of the op are
1665                  * copied by rbd_osd_req_create().
1666                  */
1667                 op = rbd_osd_req_op_create(opcode, offset, length);
1668                 if (!op)
1669                         goto out_partial;
1670                 obj_request->osd_req = rbd_osd_req_create(rbd_dev,
1671                                                 img_request->write_request,
1672                                                 obj_request, op);
1673                 rbd_osd_req_op_destroy(op);
1674                 if (!obj_request->osd_req)
1675                         goto out_partial;
1676                 /* status and version are initially zero-filled */
1677
1678                 rbd_img_obj_request_add(img_request, obj_request);
1679
1680                 image_offset += length;
1681                 resid -= length;
1682         }
1683
1684         return 0;
1685
1686 out_partial:
1687         rbd_obj_request_put(obj_request);
1688 out_unwind:
1689         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1690                 rbd_obj_request_put(obj_request);
1691
1692         return -ENOMEM;
1693 }
1694
/*
 * Completion callback for an object request that is part of an image
 * request.
 *
 * Object requests are reported to the block layer in order of their
 * "which" index.  If the completed request is not the one recorded in
 * img_request->next_completion, nothing is reported now.  Otherwise
 * this walks forward from the completed request, ending the block
 * request for each consecutive object request that is already done,
 * and records where the walk stopped.
 *
 * Once blk_end_request() returns false the whole image request is
 * completed via rbd_img_request_complete().
 */
static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
        struct rbd_img_request *img_request;
        u32 which = obj_request->which;
        /* Stays true on the early-exit path so completion is not run */
        bool more = true;

        img_request = obj_request->img_request;

        dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
        rbd_assert(img_request != NULL);
        rbd_assert(img_request->rq != NULL);
        rbd_assert(img_request->obj_request_count > 0);
        rbd_assert(which != BAD_WHICH);
        rbd_assert(which < img_request->obj_request_count);
        rbd_assert(which >= img_request->next_completion);

        spin_lock_irq(&img_request->completion_lock);
        /* Not our turn yet; an earlier object request is still pending */
        if (which != img_request->next_completion)
                goto out;

        for_each_obj_request_from(img_request, obj_request) {
                unsigned int xferred;
                int result;

                rbd_assert(more);
                rbd_assert(which < img_request->obj_request_count);

                /* Stop at the first object request that hasn't finished */
                if (!obj_request_done_test(obj_request))
                        break;

                rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
                xferred = (unsigned int) obj_request->xferred;
                result = (int) obj_request->result;
                if (result)
                        rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
                                img_request->write_request ? "write" : "read",
                                result, xferred);

                /* Hand this object's portion back to the block layer */
                more = blk_end_request(img_request->rq, result, xferred);
                which++;
        }

        /* Exactly one holds: more work remains, or every object was ended */
        rbd_assert(more ^ (which == img_request->obj_request_count));
        img_request->next_completion = which;
out:
        spin_unlock_irq(&img_request->completion_lock);

        if (!more)
                rbd_img_request_complete(img_request);
}
1745
1746 static int rbd_img_request_submit(struct rbd_img_request *img_request)
1747 {
1748         struct rbd_device *rbd_dev = img_request->rbd_dev;
1749         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1750         struct rbd_obj_request *obj_request;
1751         struct rbd_obj_request *next_obj_request;
1752
1753         dout("%s: img %p\n", __func__, img_request);
1754         for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
1755                 int ret;
1756
1757                 obj_request->callback = rbd_img_obj_callback;
1758                 ret = rbd_obj_request_submit(osdc, obj_request);
1759                 if (ret)
1760                         return ret;
1761                 /*
1762                  * The image request has its own reference to each
1763                  * of its object requests, so we can safely drop the
1764                  * initial one here.
1765                  */
1766                 rbd_obj_request_put(obj_request);
1767         }
1768
1769         return 0;
1770 }
1771
1772 static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
1773                                    u64 ver, u64 notify_id)
1774 {
1775         struct rbd_obj_request *obj_request;
1776         struct ceph_osd_req_op *op;
1777         struct ceph_osd_client *osdc;
1778         int ret;
1779
1780         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1781                                                         OBJ_REQUEST_NODATA);
1782         if (!obj_request)
1783                 return -ENOMEM;
1784
1785         ret = -ENOMEM;
1786         op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
1787         if (!op)
1788                 goto out;
1789         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1790                                                 obj_request, op);
1791         rbd_osd_req_op_destroy(op);
1792         if (!obj_request->osd_req)
1793                 goto out;
1794
1795         osdc = &rbd_dev->rbd_client->client->osdc;
1796         obj_request->callback = rbd_obj_request_put;
1797         ret = rbd_obj_request_submit(osdc, obj_request);
1798 out:
1799         if (ret)
1800                 rbd_obj_request_put(obj_request);
1801
1802         return ret;
1803 }
1804
1805 static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1806 {
1807         struct rbd_device *rbd_dev = (struct rbd_device *)data;
1808         u64 hver;
1809         int rc;
1810
1811         if (!rbd_dev)
1812                 return;
1813
1814         dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
1815                 rbd_dev->header_name, (unsigned long long) notify_id,
1816                 (unsigned int) opcode);
1817         rc = rbd_dev_refresh(rbd_dev, &hver);
1818         if (rc)
1819                 rbd_warn(rbd_dev, "got notification but failed to "
1820                            " update snaps: %d\n", rc);
1821
1822         rbd_obj_notify_ack(rbd_dev, hver, notify_id);
1823 }
1824
/*
 * Request sync osd watch/unwatch.  The value of "start" determines
 * whether a watch request is being initiated or torn down.
 *
 * When starting, a watch event is created (with rbd_watch_cb() as its
 * callback) and a lingering CEPH_OSD_OP_WATCH request is submitted on
 * the header object; on success the object request is kept in
 * rbd_dev->watch_request.  When stopping, the lingering request is
 * unregistered, an unwatch is sent, and both the saved watch request
 * and the event are released.
 *
 * Returns 0 on success or a non-zero error; on any error the watch
 * event is cancelled.
 */
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
{
        struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        struct rbd_obj_request *obj_request;
        struct ceph_osd_req_op *op;
        int ret;

        /* Starting requires no existing watch; stopping requires one */
        rbd_assert(start ^ !!rbd_dev->watch_event);
        rbd_assert(start ^ !!rbd_dev->watch_request);

        if (start) {
                ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
                                                &rbd_dev->watch_event);
                if (ret < 0)
                        return ret;
                rbd_assert(rbd_dev->watch_event != NULL);
        }

        ret = -ENOMEM;
        obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
                                                        OBJ_REQUEST_NODATA);
        if (!obj_request)
                goto out_cancel;

        op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
                                rbd_dev->watch_event->cookie,
                                rbd_dev->header.obj_version, start);
        if (!op)
                goto out_cancel;
        obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
                                                        obj_request, op);
        /* The op is only needed while the osd request is being built */
        rbd_osd_req_op_destroy(op);
        if (!obj_request->osd_req)
                goto out_cancel;

        if (start)
                ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
        else
                ceph_osdc_unregister_linger_request(osdc,
                                        rbd_dev->watch_request->osd_req);
        ret = rbd_obj_request_submit(osdc, obj_request);
        if (ret)
                goto out_cancel;
        ret = rbd_obj_request_wait(obj_request);
        if (ret)
                goto out_cancel;
        ret = obj_request->result;
        if (ret)
                goto out_cancel;

        /*
         * A watch request is set to linger, so the underlying osd
         * request won't go away until we unregister it.  We retain
         * a pointer to the object request during that time (in
         * rbd_dev->watch_request), so we'll keep a reference to
         * it.  We'll drop that reference (below) after we've
         * unregistered it.
         */
        if (start) {
                rbd_dev->watch_request = obj_request;

                return 0;
        }

        /* We have successfully torn down the watch request */

        rbd_obj_request_put(rbd_dev->watch_request);
        rbd_dev->watch_request = NULL;
        /*
         * NOTE(review): a failed teardown jumps straight to out_cancel,
         * which clears watch_event but leaves watch_request set --
         * confirm callers tolerate that state.
         */
out_cancel:
        /* Cancel the event if we're tearing down, or on error */
        ceph_osdc_cancel_event(rbd_dev->watch_event);
        rbd_dev->watch_event = NULL;
        /* obj_request is NULL only if its allocation failed */
        if (obj_request)
                rbd_obj_request_put(obj_request);

        return ret;
}
1906
1907 /*
1908  * Synchronous osd object method call
1909  */
1910 static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
1911                              const char *object_name,
1912                              const char *class_name,
1913                              const char *method_name,
1914                              const char *outbound,
1915                              size_t outbound_size,
1916                              char *inbound,
1917                              size_t inbound_size,
1918                              u64 *version)
1919 {
1920         struct rbd_obj_request *obj_request;
1921         struct ceph_osd_client *osdc;
1922         struct ceph_osd_req_op *op;
1923         struct page **pages;
1924         u32 page_count;
1925         int ret;
1926
1927         /*
1928          * Method calls are ultimately read operations but they
1929          * don't involve object data (so no offset or length).
1930          * The result should placed into the inbound buffer
1931          * provided.  They also supply outbound data--parameters for
1932          * the object method.  Currently if this is present it will
1933          * be a snapshot id.
1934          */
1935         page_count = (u32) calc_pages_for(0, inbound_size);
1936         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
1937         if (IS_ERR(pages))
1938                 return PTR_ERR(pages);
1939
1940         ret = -ENOMEM;
1941         obj_request = rbd_obj_request_create(object_name, 0, 0,
1942                                                         OBJ_REQUEST_PAGES);
1943         if (!obj_request)
1944                 goto out;
1945
1946         obj_request->pages = pages;
1947         obj_request->page_count = page_count;
1948
1949         op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
1950                                         method_name, outbound, outbound_size);
1951         if (!op)
1952                 goto out;
1953         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1954                                                 obj_request, op);
1955         rbd_osd_req_op_destroy(op);
1956         if (!obj_request->osd_req)
1957                 goto out;
1958
1959         osdc = &rbd_dev->rbd_client->client->osdc;
1960         ret = rbd_obj_request_submit(osdc, obj_request);
1961         if (ret)
1962                 goto out;
1963         ret = rbd_obj_request_wait(obj_request);
1964         if (ret)
1965                 goto out;
1966
1967         ret = obj_request->result;
1968         if (ret < 0)
1969                 goto out;
1970         ret = 0;
1971         ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
1972         if (version)
1973                 *version = obj_request->version;
1974 out:
1975         if (obj_request)
1976                 rbd_obj_request_put(obj_request);
1977         else
1978                 ceph_release_page_vector(pages, page_count);
1979
1980         return ret;
1981 }
1982
/*
 * Block layer request function for an rbd device.
 *
 * Entered and exited with q->queue_lock held; the lock is dropped
 * while each request is turned into an image request and submitted,
 * then retaken before the next request is fetched.
 *
 * Non-FS and zero-length requests are ended immediately with status 0.
 * A request that fails validation, allocation or submission is ended
 * here with the negative error.  A successfully submitted request is
 * not ended here.
 */
static void rbd_request_fn(struct request_queue *q)
                __releases(q->queue_lock) __acquires(q->queue_lock)
{
        struct rbd_device *rbd_dev = q->queuedata;
        bool read_only = rbd_dev->mapping.read_only;
        struct request *rq;
        int result;

        while ((rq = blk_fetch_request(q))) {
                bool write_request = rq_data_dir(rq) == WRITE;
                struct rbd_img_request *img_request;
                u64 offset;
                u64 length;

                /* Ignore any non-FS requests that filter through. */

                if (rq->cmd_type != REQ_TYPE_FS) {
                        dout("%s: non-fs request type %d\n", __func__,
                                (int) rq->cmd_type);
                        __blk_end_request_all(rq, 0);
                        continue;
                }

                /* Ignore/skip any zero-length requests */

                /* Byte offset and length of the request within the image */
                offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
                length = (u64) blk_rq_bytes(rq);

                if (!length) {
                        dout("%s: zero-length request\n", __func__);
                        __blk_end_request_all(rq, 0);
                        continue;
                }

                /* The queue lock is not held while the request is built */
                spin_unlock_irq(q->queue_lock);

                /* Disallow writes to a read-only device */

                if (write_request) {
                        result = -EROFS;
                        if (read_only)
                                goto end_request;
                        rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
                }

                /*
                 * Quit early if the mapped snapshot no longer
                 * exists.  It's still possible the snapshot will
                 * have disappeared by the time our request arrives
                 * at the osd, but there's no sense in sending it if
                 * we already know.
                 */
                if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
                        dout("request for non-existent snapshot");
                        rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
                        result = -ENXIO;
                        goto end_request;
                }

                /* Reject a range whose end would wrap past U64_MAX */
                result = -EINVAL;
                if (WARN_ON(offset && length > U64_MAX - offset + 1))
                        goto end_request;       /* Shouldn't happen */

                result = -ENOMEM;
                img_request = rbd_img_request_create(rbd_dev, offset, length,
                                                        write_request);
                if (!img_request)
                        goto end_request;

                img_request->rq = rq;

                result = rbd_img_request_fill_bio(img_request, rq->bio);
                if (!result)
                        result = rbd_img_request_submit(img_request);
                /* Drop our reference if filling or submitting failed */
                if (result)
                        rbd_img_request_put(img_request);
end_request:
                /* Retake the lock before ending or fetching a request */
                spin_lock_irq(q->queue_lock);
                if (result < 0) {
                        rbd_warn(rbd_dev, "obj_request %s result %d\n",
                                write_request ? "write" : "read", result);
                        __blk_end_request_all(rq, result);
                }
        }
}
2068
2069 /*
2070  * a queue callback. Makes sure that we don't create a bio that spans across
2071  * multiple osd objects. One exception would be with a single page bios,
2072  * which we handle later at bio_chain_clone_range()
2073  */
2074 static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2075                           struct bio_vec *bvec)
2076 {
2077         struct rbd_device *rbd_dev = q->queuedata;
2078         sector_t sector_offset;
2079         sector_t sectors_per_obj;
2080         sector_t obj_sector_offset;
2081         int ret;
2082
2083         /*
2084          * Find how far into its rbd object the partition-relative
2085          * bio start sector is to offset relative to the enclosing
2086          * device.
2087          */
2088         sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2089         sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2090         obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2091
2092         /*
2093          * Compute the number of bytes from that offset to the end
2094          * of the object.  Account for what's already used by the bio.
2095          */
2096         ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2097         if (ret > bmd->bi_size)
2098                 ret -= bmd->bi_size;
2099         else
2100                 ret = 0;
2101
2102         /*
2103          * Don't send back more than was asked for.  And if the bio
2104          * was empty, let the whole thing through because:  "Note
2105          * that a block device *must* allow a single page to be
2106          * added to an empty bio."
2107          */
2108         rbd_assert(bvec->bv_len <= PAGE_SIZE);
2109         if (ret > (int) bvec->bv_len || !bmd->bi_size)
2110                 ret = (int) bvec->bv_len;
2111
2112         return ret;
2113 }
2114
2115 static void rbd_free_disk(struct rbd_device *rbd_dev)
2116 {
2117         struct gendisk *disk = rbd_dev->disk;
2118
2119         if (!disk)
2120                 return;
2121
2122         if (disk->flags & GENHD_FL_UP)
2123                 del_gendisk(disk);
2124         if (disk->queue)
2125                 blk_cleanup_queue(disk->queue);
2126         put_disk(disk);
2127 }
2128
2129 static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2130                                 const char *object_name,
2131                                 u64 offset, u64 length,
2132                                 char *buf, u64 *version)
2133
2134 {
2135         struct ceph_osd_req_op *op;
2136         struct rbd_obj_request *obj_request;
2137         struct ceph_osd_client *osdc;
2138         struct page **pages = NULL;
2139         u32 page_count;
2140         size_t size;
2141         int ret;
2142
2143         page_count = (u32) calc_pages_for(offset, length);
2144         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2145         if (IS_ERR(pages))
2146                 ret = PTR_ERR(pages);
2147
2148         ret = -ENOMEM;
2149         obj_request = rbd_obj_request_create(object_name, offset, length,
2150                                                         OBJ_REQUEST_PAGES);
2151         if (!obj_request)
2152                 goto out;
2153
2154         obj_request->pages = pages;
2155         obj_request->page_count = page_count;
2156
2157         op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
2158         if (!op)
2159                 goto out;
2160         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2161                                                 obj_request, op);
2162         rbd_osd_req_op_destroy(op);
2163         if (!obj_request->osd_req)
2164                 goto out;
2165
2166         osdc = &rbd_dev->rbd_client->client->osdc;
2167         ret = rbd_obj_request_submit(osdc, obj_request);
2168         if (ret)
2169                 goto out;
2170         ret = rbd_obj_request_wait(obj_request);
2171         if (ret)
2172                 goto out;
2173
2174         ret = obj_request->result;
2175         if (ret < 0)
2176                 goto out;
2177
2178         rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2179         size = (size_t) obj_request->xferred;
2180         ceph_copy_from_page_vector(pages, buf, 0, size);
2181         rbd_assert(size <= (size_t) INT_MAX);
2182         ret = (int) size;
2183         if (version)
2184                 *version = obj_request->version;
2185 out:
2186         if (obj_request)
2187                 rbd_obj_request_put(obj_request);
2188         else
2189                 ceph_release_page_vector(pages, page_count);
2190
2191         return ret;
2192 }
2193
2194 /*
2195  * Read the complete header for the given rbd device.
2196  *
2197  * Returns a pointer to a dynamically-allocated buffer containing
2198  * the complete and validated header.  Caller can pass the address
2199  * of a variable that will be filled in with the version of the
2200  * header object at the time it was read.
2201  *
2202  * Returns a pointer-coded errno if a failure occurs.
2203  */
2204 static struct rbd_image_header_ondisk *
2205 rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
2206 {
2207         struct rbd_image_header_ondisk *ondisk = NULL;
2208         u32 snap_count = 0;
2209         u64 names_size = 0;
2210         u32 want_count;
2211         int ret;
2212
2213         /*
2214          * The complete header will include an array of its 64-bit
2215          * snapshot ids, followed by the names of those snapshots as
2216          * a contiguous block of NUL-terminated strings.  Note that
2217          * the number of snapshots could change by the time we read
2218          * it in, in which case we re-read it.
2219          */
2220         do {
2221                 size_t size;
2222
2223                 kfree(ondisk);
2224
2225                 size = sizeof (*ondisk);
2226                 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2227                 size += names_size;
2228                 ondisk = kmalloc(size, GFP_KERNEL);
2229                 if (!ondisk)
2230                         return ERR_PTR(-ENOMEM);
2231
2232                 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
2233                                        0, size,
2234                                        (char *) ondisk, version);
2235                 if (ret < 0)
2236                         goto out_err;
2237                 if (WARN_ON((size_t) ret < size)) {
2238                         ret = -ENXIO;
2239                         rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2240                                 size, ret);
2241                         goto out_err;
2242                 }
2243                 if (!rbd_dev_ondisk_valid(ondisk)) {
2244                         ret = -ENXIO;
2245                         rbd_warn(rbd_dev, "invalid header");
2246                         goto out_err;
2247                 }
2248
2249                 names_size = le64_to_cpu(ondisk->snap_names_len);
2250                 want_count = snap_count;
2251                 snap_count = le32_to_cpu(ondisk->snap_count);
2252         } while (snap_count != want_count);
2253
2254         return ondisk;
2255
2256 out_err:
2257         kfree(ondisk);
2258
2259         return ERR_PTR(ret);
2260 }
2261
2262 /*
2263  * reload the ondisk the header
2264  */
2265 static int rbd_read_header(struct rbd_device *rbd_dev,
2266                            struct rbd_image_header *header)
2267 {
2268         struct rbd_image_header_ondisk *ondisk;
2269         u64 ver = 0;
2270         int ret;
2271
2272         ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2273         if (IS_ERR(ondisk))
2274                 return PTR_ERR(ondisk);
2275         ret = rbd_header_from_disk(header, ondisk);
2276         if (ret >= 0)
2277                 header->obj_version = ver;
2278         kfree(ondisk);
2279
2280         return ret;
2281 }
2282
2283 static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
2284 {
2285         struct rbd_snap *snap;
2286         struct rbd_snap *next;
2287
2288         list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
2289                 rbd_remove_snap_dev(snap);
2290 }
2291
2292 static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2293 {
2294         sector_t size;
2295
2296         if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
2297                 return;
2298
2299         size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2300         dout("setting size to %llu sectors", (unsigned long long) size);
2301         rbd_dev->mapping.size = (u64) size;
2302         set_capacity(rbd_dev->disk, size);
2303 }
2304
/*
 * Re-read the header of a format 1 image and install the new contents
 * in rbd_dev->header.
 *
 * The image size, snapshot context, snapshot names and sizes and
 * object version are replaced under header_rwsem, then the device's
 * snapshot list is updated and registered.  If "hver" is non-NULL it
 * receives the new header object version; it is left untouched when
 * reading the header fails.
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
        int ret;
        struct rbd_image_header h;

        ret = rbd_read_header(rbd_dev, &h);
        if (ret < 0)
                return ret;

        down_write(&rbd_dev->header_rwsem);

        /* Update image size, and check for resize of mapped image */
        rbd_dev->header.image_size = h.image_size;
        rbd_update_mapping_size(rbd_dev);

        /* rbd_dev->header.object_prefix shouldn't change */
        /* Release the old snapshot data before the new data is installed */
        kfree(rbd_dev->header.snap_sizes);
        kfree(rbd_dev->header.snap_names);
        /* osd requests may still refer to snapc */
        ceph_put_snap_context(rbd_dev->header.snapc);

        if (hver)
                *hver = h.obj_version;
        rbd_dev->header.obj_version = h.obj_version;
        rbd_dev->header.image_size = h.image_size;
        /* These pointers from h are now held by rbd_dev->header */
        rbd_dev->header.snapc = h.snapc;
        rbd_dev->header.snap_names = h.snap_names;
        rbd_dev->header.snap_sizes = h.snap_sizes;
        /* Free the extra copy of the object prefix */
        WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
        kfree(h.object_prefix);

        ret = rbd_dev_snaps_update(rbd_dev);
        if (!ret)
                ret = rbd_dev_snaps_register(rbd_dev);

        up_write(&rbd_dev->header_rwsem);

        return ret;
}
2348
2349 static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
2350 {
2351         int ret;
2352
2353         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
2354         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2355         if (rbd_dev->image_format == 1)
2356                 ret = rbd_dev_v1_refresh(rbd_dev, hver);
2357         else
2358                 ret = rbd_dev_v2_refresh(rbd_dev, hver);
2359         mutex_unlock(&ctl_mutex);
2360
2361         return ret;
2362 }
2363
2364 static int rbd_init_disk(struct rbd_device *rbd_dev)
2365 {
2366         struct gendisk *disk;
2367         struct request_queue *q;
2368         u64 segment_size;
2369
2370         /* create gendisk info */
2371         disk = alloc_disk(RBD_MINORS_PER_MAJOR);
2372         if (!disk)
2373                 return -ENOMEM;
2374
2375         snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
2376                  rbd_dev->dev_id);
2377         disk->major = rbd_dev->major;
2378         disk->first_minor = 0;
2379         disk->fops = &rbd_bd_ops;
2380         disk->private_data = rbd_dev;
2381
2382         q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
2383         if (!q)
2384                 goto out_disk;
2385
2386         /* We use the default size, but let's be explicit about it. */
2387         blk_queue_physical_block_size(q, SECTOR_SIZE);
2388
2389         /* set io sizes to object size */
2390         segment_size = rbd_obj_bytes(&rbd_dev->header);
2391         blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
2392         blk_queue_max_segment_size(q, segment_size);
2393         blk_queue_io_min(q, segment_size);
2394         blk_queue_io_opt(q, segment_size);
2395
2396         blk_queue_merge_bvec(q, rbd_merge_bvec);
2397         disk->queue = q;
2398
2399         q->queuedata = rbd_dev;
2400
2401         rbd_dev->disk = disk;
2402
2403         set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
2404
2405         return 0;
2406 out_disk:
2407         put_disk(disk);
2408
2409         return -ENOMEM;
2410 }
2411
2412 /*
2413   sysfs
2414 */
2415
2416 static struct rbd_device *dev_to_rbd_dev(struct device *dev)
2417 {
2418         return container_of(dev, struct rbd_device, dev);
2419 }
2420
2421 static ssize_t rbd_size_show(struct device *dev,
2422                              struct device_attribute *attr, char *buf)
2423 {
2424         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2425         sector_t size;
2426
2427         down_read(&rbd_dev->header_rwsem);
2428         size = get_capacity(rbd_dev->disk);
2429         up_read(&rbd_dev->header_rwsem);
2430
2431         return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
2432 }
2433
2434 /*
2435  * Note this shows the features for whatever's mapped, which is not
2436  * necessarily the base image.
2437  */
2438 static ssize_t rbd_features_show(struct device *dev,
2439                              struct device_attribute *attr, char *buf)
2440 {
2441         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2442
2443         return sprintf(buf, "0x%016llx\n",
2444                         (unsigned long long) rbd_dev->mapping.features);
2445 }
2446
2447 static ssize_t rbd_major_show(struct device *dev,
2448                               struct device_attribute *attr, char *buf)
2449 {
2450         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2451
2452         return sprintf(buf, "%d\n", rbd_dev->major);
2453 }
2454
2455 static ssize_t rbd_client_id_show(struct device *dev,
2456                                   struct device_attribute *attr, char *buf)
2457 {
2458         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2459
2460         return sprintf(buf, "client%lld\n",
2461                         ceph_client_id(rbd_dev->rbd_client->client));
2462 }
2463
2464 static ssize_t rbd_pool_show(struct device *dev,
2465                              struct device_attribute *attr, char *buf)
2466 {
2467         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2468
2469         return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
2470 }
2471
2472 static ssize_t rbd_pool_id_show(struct device *dev,
2473                              struct device_attribute *attr, char *buf)
2474 {
2475         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2476
2477         return sprintf(buf, "%llu\n",
2478                 (unsigned long long) rbd_dev->spec->pool_id);
2479 }
2480
2481 static ssize_t rbd_name_show(struct device *dev,
2482                              struct device_attribute *attr, char *buf)
2483 {
2484         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2485
2486         if (rbd_dev->spec->image_name)
2487                 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2488
2489         return sprintf(buf, "(unknown)\n");
2490 }
2491
2492 static ssize_t rbd_image_id_show(struct device *dev,
2493                              struct device_attribute *attr, char *buf)
2494 {
2495         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2496
2497         return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
2498 }
2499
2500 /*
2501  * Shows the name of the currently-mapped snapshot (or
2502  * RBD_SNAP_HEAD_NAME for the base image).
2503  */
2504 static ssize_t rbd_snap_show(struct device *dev,
2505                              struct device_attribute *attr,
2506                              char *buf)
2507 {
2508         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2509
2510         return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
2511 }
2512
2513 /*
2514  * For an rbd v2 image, shows the pool id, image id, and snapshot id
2515  * for the parent image.  If there is no parent, simply shows
2516  * "(no parent image)".
2517  */
2518 static ssize_t rbd_parent_show(struct device *dev,
2519                              struct device_attribute *attr,
2520                              char *buf)
2521 {
2522         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2523         struct rbd_spec *spec = rbd_dev->parent_spec;
2524         int count;
2525         char *bufp = buf;
2526
2527         if (!spec)
2528                 return sprintf(buf, "(no parent image)\n");
2529
2530         count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2531                         (unsigned long long) spec->pool_id, spec->pool_name);
2532         if (count < 0)
2533                 return count;
2534         bufp += count;
2535
2536         count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2537                         spec->image_name ? spec->image_name : "(unknown)");
2538         if (count < 0)
2539                 return count;
2540         bufp += count;
2541
2542         count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2543                         (unsigned long long) spec->snap_id, spec->snap_name);
2544         if (count < 0)
2545                 return count;
2546         bufp += count;
2547
2548         count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2549         if (count < 0)
2550                 return count;
2551         bufp += count;
2552
2553         return (ssize_t) (bufp - buf);
2554 }
2555
2556 static ssize_t rbd_image_refresh(struct device *dev,
2557                                  struct device_attribute *attr,
2558                                  const char *buf,
2559                                  size_t size)
2560 {
2561         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2562         int ret;
2563
2564         ret = rbd_dev_refresh(rbd_dev, NULL);
2565
2566         return ret < 0 ? ret : size;
2567 }
2568
/*
 * sysfs attributes of a mapped rbd device.  Every attribute is
 * read-only (S_IRUGO) with a show handler, except "refresh", which
 * is owner-writable only (S_IWUSR) and has just a store handler.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

/* NULL-terminated list of the attributes declared above */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

/* The single (unnamed) attribute group holding rbd_attrs */
static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

/* NULL-terminated group list referenced by rbd_device_type */
static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
2604
/*
 * Release callback named by rbd_device_type; it does nothing.
 * NOTE(review): rbd_bus_add_dev() also sets dev->release to
 * rbd_dev_release on the device itself -- presumably that is where
 * the real teardown happens; confirm.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

/*
 * Device type assigned to mapped rbd devices in rbd_bus_add_dev();
 * it supplies the type name and the sysfs attribute groups.
 */
static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2614
2615
2616 /*
2617   sysfs - snapshots
2618 */
2619
2620 static ssize_t rbd_snap_size_show(struct device *dev,
2621                                   struct device_attribute *attr,
2622                                   char *buf)
2623 {
2624         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2625
2626         return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2627 }
2628
2629 static ssize_t rbd_snap_id_show(struct device *dev,
2630                                 struct device_attribute *attr,
2631                                 char *buf)
2632 {
2633         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2634
2635         return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2636 }
2637
2638 static ssize_t rbd_snap_features_show(struct device *dev,
2639                                 struct device_attribute *attr,
2640                                 char *buf)
2641 {
2642         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2643
2644         return sprintf(buf, "0x%016llx\n",
2645                         (unsigned long long) snap->features);
2646 }
2647
/* Read-only sysfs attributes exposed by each snapshot device */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

/* NULL-terminated list of the snapshot attributes declared above */
static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

/* The single (unnamed) attribute group holding rbd_snap_attrs */
static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
2662
2663 static void rbd_snap_dev_release(struct device *dev)
2664 {
2665         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2666         kfree(snap->name);
2667         kfree(snap);
2668 }
2669
/* NULL-terminated group list referenced by rbd_snap_device_type */
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

/*
 * Device type assigned to snapshot devices in rbd_register_snap_dev().
 * rbd_snap_registered() also compares a snapshot's dev.type against
 * the address of this object.
 */
static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2679
2680 static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2681 {
2682         kref_get(&spec->kref);
2683
2684         return spec;
2685 }
2686
2687 static void rbd_spec_free(struct kref *kref);
2688 static void rbd_spec_put(struct rbd_spec *spec)
2689 {
2690         if (spec)
2691                 kref_put(&spec->kref, rbd_spec_free);
2692 }
2693
2694 static struct rbd_spec *rbd_spec_alloc(void)
2695 {
2696         struct rbd_spec *spec;
2697
2698         spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2699         if (!spec)
2700                 return NULL;
2701         kref_init(&spec->kref);
2702
2703         rbd_spec_put(rbd_spec_get(spec));       /* TEMPORARY */
2704
2705         return spec;
2706 }
2707
2708 static void rbd_spec_free(struct kref *kref)
2709 {
2710         struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2711
2712         kfree(spec->pool_name);
2713         kfree(spec->image_id);
2714         kfree(spec->image_name);
2715         kfree(spec->snap_name);
2716         kfree(spec);
2717 }
2718
2719 static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2720                                 struct rbd_spec *spec)
2721 {
2722         struct rbd_device *rbd_dev;
2723
2724         rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2725         if (!rbd_dev)
2726                 return NULL;
2727
2728         spin_lock_init(&rbd_dev->lock);
2729         rbd_dev->flags = 0;
2730         INIT_LIST_HEAD(&rbd_dev->node);
2731         INIT_LIST_HEAD(&rbd_dev->snaps);
2732         init_rwsem(&rbd_dev->header_rwsem);
2733
2734         rbd_dev->spec = spec;
2735         rbd_dev->rbd_client = rbdc;
2736
2737         /* Initialize the layout used for all rbd requests */
2738
2739         rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2740         rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
2741         rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2742         rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
2743
2744         return rbd_dev;
2745 }
2746
2747 static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2748 {
2749         rbd_spec_put(rbd_dev->parent_spec);
2750         kfree(rbd_dev->header_name);
2751         rbd_put_client(rbd_dev->rbd_client);
2752         rbd_spec_put(rbd_dev->spec);
2753         kfree(rbd_dev);
2754 }
2755
/*
 * Report whether a snapshot's device has been set up for
 * registration, judged by its device type pointing at
 * rbd_snap_device_type (set in rbd_register_snap_dev()).
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	/*
	 * "!ret ^ reg" parses as "(!ret) ^ reg", which is true exactly
	 * when ret and reg agree: the type marker and the driver
	 * core's view of registration must match.
	 */
	rbd_assert(!ret ^ reg);

	return ret;
}
2765
2766 static void rbd_remove_snap_dev(struct rbd_snap *snap)
2767 {
2768         list_del(&snap->node);
2769         if (device_is_registered(&snap->dev))
2770                 device_unregister(&snap->dev);
2771 }
2772
2773 static int rbd_register_snap_dev(struct rbd_snap *snap,
2774                                   struct device *parent)
2775 {
2776         struct device *dev = &snap->dev;
2777         int ret;
2778
2779         dev->type = &rbd_snap_device_type;
2780         dev->parent = parent;
2781         dev->release = rbd_snap_dev_release;
2782         dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
2783         dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2784
2785         ret = device_register(dev);
2786
2787         return ret;
2788 }
2789
2790 static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2791                                                 const char *snap_name,
2792                                                 u64 snap_id, u64 snap_size,
2793                                                 u64 snap_features)
2794 {
2795         struct rbd_snap *snap;
2796         int ret;
2797
2798         snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2799         if (!snap)
2800                 return ERR_PTR(-ENOMEM);
2801
2802         ret = -ENOMEM;
2803         snap->name = kstrdup(snap_name, GFP_KERNEL);
2804         if (!snap->name)
2805                 goto err;
2806
2807         snap->id = snap_id;
2808         snap->size = snap_size;
2809         snap->features = snap_features;
2810
2811         return snap;
2812
2813 err:
2814         kfree(snap->name);
2815         kfree(snap);
2816
2817         return ERR_PTR(ret);
2818 }
2819
2820 static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2821                 u64 *snap_size, u64 *snap_features)
2822 {
2823         char *snap_name;
2824
2825         rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2826
2827         *snap_size = rbd_dev->header.snap_sizes[which];
2828         *snap_features = 0;     /* No features for v1 */
2829
2830         /* Skip over names until we find the one we are looking for */
2831
2832         snap_name = rbd_dev->header.snap_names;
2833         while (which--)
2834                 snap_name += strlen(snap_name) + 1;
2835
2836         return snap_name;
2837 }
2838
2839 /*
2840  * Get the size and object order for an image snapshot, or if
2841  * snap_id is CEPH_NOSNAP, gets this information for the base
2842  * image.
2843  */
2844 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2845                                 u8 *order, u64 *snap_size)
2846 {
2847         __le64 snapid = cpu_to_le64(snap_id);
2848         int ret;
2849         struct {
2850                 u8 order;
2851                 __le64 size;
2852         } __attribute__ ((packed)) size_buf = { 0 };
2853
2854         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2855                                 "rbd", "get_size",
2856                                 (char *) &snapid, sizeof (snapid),
2857                                 (char *) &size_buf, sizeof (size_buf), NULL);
2858         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2859         if (ret < 0)
2860                 return ret;
2861
2862         *order = size_buf.order;
2863         *snap_size = le64_to_cpu(size_buf.size);
2864
2865         dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
2866                 (unsigned long long) snap_id, (unsigned int) *order,
2867                 (unsigned long long) *snap_size);
2868
2869         return 0;
2870 }
2871
2872 static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2873 {
2874         return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2875                                         &rbd_dev->header.obj_order,
2876                                         &rbd_dev->header.image_size);
2877 }
2878
2879 static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2880 {
2881         void *reply_buf;
2882         int ret;
2883         void *p;
2884
2885         reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2886         if (!reply_buf)
2887                 return -ENOMEM;
2888
2889         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2890                                 "rbd", "get_object_prefix",
2891                                 NULL, 0,
2892                                 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
2893         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2894         if (ret < 0)
2895                 goto out;
2896
2897         p = reply_buf;
2898         rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2899                                                 p + RBD_OBJ_PREFIX_LEN_MAX,
2900                                                 NULL, GFP_NOIO);
2901
2902         if (IS_ERR(rbd_dev->header.object_prefix)) {
2903                 ret = PTR_ERR(rbd_dev->header.object_prefix);
2904                 rbd_dev->header.object_prefix = NULL;
2905         } else {
2906                 dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
2907         }
2908
2909 out:
2910         kfree(reply_buf);
2911
2912         return ret;
2913 }
2914
2915 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2916                 u64 *snap_features)
2917 {
2918         __le64 snapid = cpu_to_le64(snap_id);
2919         struct {
2920                 __le64 features;
2921                 __le64 incompat;
2922         } features_buf = { 0 };
2923         u64 incompat;
2924         int ret;
2925
2926         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2927                                 "rbd", "get_features",
2928                                 (char *) &snapid, sizeof (snapid),
2929                                 (char *) &features_buf, sizeof (features_buf),
2930                                 NULL);
2931         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2932         if (ret < 0)
2933                 return ret;
2934
2935         incompat = le64_to_cpu(features_buf.incompat);
2936         if (incompat & ~RBD_FEATURES_ALL)
2937                 return -ENXIO;
2938
2939         *snap_features = le64_to_cpu(features_buf.features);
2940
2941         dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2942                 (unsigned long long) snap_id,
2943                 (unsigned long long) *snap_features,
2944                 (unsigned long long) le64_to_cpu(features_buf.incompat));
2945
2946         return 0;
2947 }
2948
2949 static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2950 {
2951         return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2952                                                 &rbd_dev->header.features);
2953 }
2954
2955 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2956 {
2957         struct rbd_spec *parent_spec;
2958         size_t size;
2959         void *reply_buf = NULL;
2960         __le64 snapid;
2961         void *p;
2962         void *end;
2963         char *image_id;
2964         u64 overlap;
2965         int ret;
2966
2967         parent_spec = rbd_spec_alloc();
2968         if (!parent_spec)
2969                 return -ENOMEM;
2970
2971         size = sizeof (__le64) +                                /* pool_id */
2972                 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +        /* image_id */
2973                 sizeof (__le64) +                               /* snap_id */
2974                 sizeof (__le64);                                /* overlap */
2975         reply_buf = kmalloc(size, GFP_KERNEL);
2976         if (!reply_buf) {
2977                 ret = -ENOMEM;
2978                 goto out_err;
2979         }
2980
2981         snapid = cpu_to_le64(CEPH_NOSNAP);
2982         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2983                                 "rbd", "get_parent",
2984                                 (char *) &snapid, sizeof (snapid),
2985                                 (char *) reply_buf, size, NULL);
2986         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2987         if (ret < 0)
2988                 goto out_err;
2989
2990         ret = -ERANGE;
2991         p = reply_buf;
2992         end = (char *) reply_buf + size;
2993         ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
2994         if (parent_spec->pool_id == CEPH_NOPOOL)
2995                 goto out;       /* No parent?  No problem. */
2996
2997         /* The ceph file layout needs to fit pool id in 32 bits */
2998
2999         ret = -EIO;
3000         if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
3001                 goto out;
3002
3003         image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3004         if (IS_ERR(image_id)) {
3005                 ret = PTR_ERR(image_id);
3006                 goto out_err;
3007         }
3008         parent_spec->image_id = image_id;
3009         ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
3010         ceph_decode_64_safe(&p, end, overlap, out_err);
3011
3012         rbd_dev->parent_overlap = overlap;
3013         rbd_dev->parent_spec = parent_spec;
3014         parent_spec = NULL;     /* rbd_dev now owns this */
3015 out:
3016         ret = 0;
3017 out_err:
3018         kfree(reply_buf);
3019         rbd_spec_put(parent_spec);
3020
3021         return ret;
3022 }
3023
3024 static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3025 {
3026         size_t image_id_size;
3027         char *image_id;
3028         void *p;
3029         void *end;
3030         size_t size;
3031         void *reply_buf = NULL;
3032         size_t len = 0;
3033         char *image_name = NULL;
3034         int ret;
3035
3036         rbd_assert(!rbd_dev->spec->image_name);
3037
3038         len = strlen(rbd_dev->spec->image_id);
3039         image_id_size = sizeof (__le32) + len;
3040         image_id = kmalloc(image_id_size, GFP_KERNEL);
3041         if (!image_id)
3042                 return NULL;
3043
3044         p = image_id;
3045         end = (char *) image_id + image_id_size;
3046         ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
3047
3048         size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
3049         reply_buf = kmalloc(size, GFP_KERNEL);
3050         if (!reply_buf)
3051                 goto out;
3052
3053         ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
3054                                 "rbd", "dir_get_name",
3055                                 image_id, image_id_size,
3056                                 (char *) reply_buf, size, NULL);
3057         if (ret < 0)
3058                 goto out;
3059         p = reply_buf;
3060         end = (char *) reply_buf + size;
3061         image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
3062         if (IS_ERR(image_name))
3063                 image_name = NULL;
3064         else
3065                 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
3066 out:
3067         kfree(reply_buf);
3068         kfree(image_id);
3069
3070         return image_name;
3071 }
3072
3073 /*
3074  * When a parent image gets probed, we only have the pool, image,
3075  * and snapshot ids but not the names of any of them.  This call
3076  * is made later to fill in those names.  It has to be done after
3077  * rbd_dev_snaps_update() has completed because some of the
3078  * information (in particular, snapshot name) is not available
3079  * until then.
3080  */
3081 static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
3082 {
3083         struct ceph_osd_client *osdc;
3084         const char *name;
3085         void *reply_buf = NULL;
3086         int ret;
3087
3088         if (rbd_dev->spec->pool_name)
3089                 return 0;       /* Already have the names */
3090
3091         /* Look up the pool name */
3092
3093         osdc = &rbd_dev->rbd_client->client->osdc;
3094         name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
3095         if (!name) {
3096                 rbd_warn(rbd_dev, "there is no pool with id %llu",
3097                         rbd_dev->spec->pool_id);        /* Really a BUG() */
3098                 return -EIO;
3099         }
3100
3101         rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3102         if (!rbd_dev->spec->pool_name)
3103                 return -ENOMEM;
3104
3105         /* Fetch the image name; tolerate failure here */
3106
3107         name = rbd_dev_image_name(rbd_dev);
3108         if (name)
3109                 rbd_dev->spec->image_name = (char *) name;
3110         else
3111                 rbd_warn(rbd_dev, "unable to get image name");
3112
3113         /* Look up the snapshot name. */
3114
3115         name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3116         if (!name) {
3117                 rbd_warn(rbd_dev, "no snapshot with id %llu",
3118                         rbd_dev->spec->snap_id);        /* Really a BUG() */
3119                 ret = -EIO;
3120                 goto out_err;
3121         }
3122         rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
3123         if(!rbd_dev->spec->snap_name)
3124                 goto out_err;
3125
3126         return 0;
3127 out_err:
3128         kfree(reply_buf);
3129         kfree(rbd_dev->spec->pool_name);
3130         rbd_dev->spec->pool_name = NULL;
3131
3132         return ret;
3133 }
3134
3135 static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
3136 {
3137         size_t size;
3138         int ret;
3139         void *reply_buf;
3140         void *p;
3141         void *end;
3142         u64 seq;
3143         u32 snap_count;
3144         struct ceph_snap_context *snapc;
3145         u32 i;
3146
3147         /*
3148          * We'll need room for the seq value (maximum snapshot id),
3149          * snapshot count, and array of that many snapshot ids.
3150          * For now we have a fixed upper limit on the number we're
3151          * prepared to receive.
3152          */
3153         size = sizeof (__le64) + sizeof (__le32) +
3154                         RBD_MAX_SNAP_COUNT * sizeof (__le64);
3155         reply_buf = kzalloc(size, GFP_KERNEL);
3156         if (!reply_buf)
3157                 return -ENOMEM;
3158
3159         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3160                                 "rbd", "get_snapcontext",
3161                                 NULL, 0,
3162                                 reply_buf, size, ver);
3163         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3164         if (ret < 0)
3165                 goto out;
3166
3167         ret = -ERANGE;
3168         p = reply_buf;
3169         end = (char *) reply_buf + size;
3170         ceph_decode_64_safe(&p, end, seq, out);
3171         ceph_decode_32_safe(&p, end, snap_count, out);
3172
3173         /*
3174          * Make sure the reported number of snapshot ids wouldn't go
3175          * beyond the end of our buffer.  But before checking that,
3176          * make sure the computed size of the snapshot context we
3177          * allocate is representable in a size_t.
3178          */
3179         if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3180                                  / sizeof (u64)) {
3181                 ret = -EINVAL;
3182                 goto out;
3183         }
3184         if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3185                 goto out;
3186
3187         size = sizeof (struct ceph_snap_context) +
3188                                 snap_count * sizeof (snapc->snaps[0]);
3189         snapc = kmalloc(size, GFP_KERNEL);
3190         if (!snapc) {
3191                 ret = -ENOMEM;
3192                 goto out;
3193         }
3194
3195         atomic_set(&snapc->nref, 1);
3196         snapc->seq = seq;
3197         snapc->num_snaps = snap_count;
3198         for (i = 0; i < snap_count; i++)
3199                 snapc->snaps[i] = ceph_decode_64(&p);
3200
3201         rbd_dev->header.snapc = snapc;
3202
3203         dout("  snap context seq = %llu, snap_count = %u\n",
3204                 (unsigned long long) seq, (unsigned int) snap_count);
3205
3206 out:
3207         kfree(reply_buf);
3208
3209         return 0;
3210 }
3211
3212 static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3213 {
3214         size_t size;
3215         void *reply_buf;
3216         __le64 snap_id;
3217         int ret;
3218         void *p;
3219         void *end;
3220         char *snap_name;
3221
3222         size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3223         reply_buf = kmalloc(size, GFP_KERNEL);
3224         if (!reply_buf)
3225                 return ERR_PTR(-ENOMEM);
3226
3227         snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
3228         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3229                                 "rbd", "get_snapshot_name",
3230                                 (char *) &snap_id, sizeof (snap_id),
3231                                 reply_buf, size, NULL);
3232         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3233         if (ret < 0)
3234                 goto out;
3235
3236         p = reply_buf;
3237         end = (char *) reply_buf + size;
3238         snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3239         if (IS_ERR(snap_name)) {
3240                 ret = PTR_ERR(snap_name);
3241                 goto out;
3242         } else {
3243                 dout("  snap_id 0x%016llx snap_name = %s\n",
3244                         (unsigned long long) le64_to_cpu(snap_id), snap_name);
3245         }
3246         kfree(reply_buf);
3247
3248         return snap_name;
3249 out:
3250         kfree(reply_buf);
3251
3252         return ERR_PTR(ret);
3253 }
3254
3255 static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3256                 u64 *snap_size, u64 *snap_features)
3257 {
3258         u64 snap_id;
3259         u8 order;
3260         int ret;
3261
3262         snap_id = rbd_dev->header.snapc->snaps[which];
3263         ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3264         if (ret)
3265                 return ERR_PTR(ret);
3266         ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3267         if (ret)
3268                 return ERR_PTR(ret);
3269
3270         return rbd_dev_v2_snap_name(rbd_dev, which);
3271 }
3272
3273 static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3274                 u64 *snap_size, u64 *snap_features)
3275 {
3276         if (rbd_dev->image_format == 1)
3277                 return rbd_dev_v1_snap_info(rbd_dev, which,
3278                                         snap_size, snap_features);
3279         if (rbd_dev->image_format == 2)
3280                 return rbd_dev_v2_snap_info(rbd_dev, which,
3281                                         snap_size, snap_features);
3282         return ERR_PTR(-EINVAL);
3283 }
3284
3285 static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3286 {
3287         int ret;
3288         __u8 obj_order;
3289
3290         down_write(&rbd_dev->header_rwsem);
3291
3292         /* Grab old order first, to see if it changes */
3293
3294         obj_order = rbd_dev->header.obj_order,
3295         ret = rbd_dev_v2_image_size(rbd_dev);
3296         if (ret)
3297                 goto out;
3298         if (rbd_dev->header.obj_order != obj_order) {
3299                 ret = -EIO;
3300                 goto out;
3301         }
3302         rbd_update_mapping_size(rbd_dev);
3303
3304         ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3305         dout("rbd_dev_v2_snap_context returned %d\n", ret);
3306         if (ret)
3307                 goto out;
3308         ret = rbd_dev_snaps_update(rbd_dev);
3309         dout("rbd_dev_snaps_update returned %d\n", ret);
3310         if (ret)
3311                 goto out;
3312         ret = rbd_dev_snaps_register(rbd_dev);
3313         dout("rbd_dev_snaps_register returned %d\n", ret);
3314 out:
3315         up_write(&rbd_dev->header_rwsem);
3316
3317         return ret;
3318 }
3319
3320 /*
3321  * Scan the rbd device's current snapshot list and compare it to the
3322  * newly-received snapshot context.  Remove any existing snapshots
3323  * not present in the new snapshot context.  Add a new snapshot for
3324  * any snaphots in the snapshot context not in the current list.
3325  * And verify there are no changes to snapshots we already know
3326  * about.
3327  *
3328  * Assumes the snapshots in the snapshot context are sorted by
3329  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
3330  * are also maintained in that order.)
3331  */
3332 static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
3333 {
3334         struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3335         const u32 snap_count = snapc->num_snaps;
3336         struct list_head *head = &rbd_dev->snaps;
3337         struct list_head *links = head->next;
3338         u32 index = 0;
3339
3340         dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
3341         while (index < snap_count || links != head) {
3342                 u64 snap_id;
3343                 struct rbd_snap *snap;
3344                 char *snap_name;
3345                 u64 snap_size = 0;
3346                 u64 snap_features = 0;
3347
3348                 snap_id = index < snap_count ? snapc->snaps[index]
3349                                              : CEPH_NOSNAP;
3350                 snap = links != head ? list_entry(links, struct rbd_snap, node)
3351                                      : NULL;
3352                 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
3353
3354                 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
3355                         struct list_head *next = links->next;
3356
3357                         /*
3358                          * A previously-existing snapshot is not in
3359                          * the new snap context.
3360                          *
3361                          * If the now missing snapshot is the one the
3362                          * image is mapped to, clear its exists flag
3363                          * so we can avoid sending any more requests
3364                          * to it.
3365                          */
3366                         if (rbd_dev->spec->snap_id == snap->id)
3367                                 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3368                         rbd_remove_snap_dev(snap);
3369                         dout("%ssnap id %llu has been removed\n",
3370                                 rbd_dev->spec->snap_id == snap->id ?
3371                                                         "mapped " : "",
3372                                 (unsigned long long) snap->id);
3373
3374                         /* Done with this list entry; advance */
3375
3376                         links = next;
3377                         continue;
3378                 }
3379
3380                 snap_name = rbd_dev_snap_info(rbd_dev, index,
3381                                         &snap_size, &snap_features);
3382                 if (IS_ERR(snap_name))
3383                         return PTR_ERR(snap_name);
3384
3385                 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
3386                         (unsigned long long) snap_id);
3387                 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
3388                         struct rbd_snap *new_snap;
3389
3390                         /* We haven't seen this snapshot before */
3391
3392                         new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
3393                                         snap_id, snap_size, snap_features);
3394                         if (IS_ERR(new_snap)) {
3395                                 int err = PTR_ERR(new_snap);
3396
3397                                 dout("  failed to add dev, error %d\n", err);
3398
3399                                 return err;
3400                         }
3401
3402                         /* New goes before existing, or at end of list */
3403
3404                         dout("  added dev%s\n", snap ? "" : " at end\n");
3405                         if (snap)
3406                                 list_add_tail(&new_snap->node, &snap->node);
3407                         else
3408                                 list_add_tail(&new_snap->node, head);
3409                 } else {
3410                         /* Already have this one */
3411
3412                         dout("  already present\n");
3413
3414                         rbd_assert(snap->size == snap_size);
3415                         rbd_assert(!strcmp(snap->name, snap_name));
3416                         rbd_assert(snap->features == snap_features);
3417
3418                         /* Done with this list entry; advance */
3419
3420                         links = links->next;
3421                 }
3422
3423                 /* Advance to the next entry in the snapshot context */
3424
3425                 index++;
3426         }
3427         dout("%s: done\n", __func__);
3428
3429         return 0;
3430 }
3431
3432 /*
3433  * Scan the list of snapshots and register the devices for any that
3434  * have not already been registered.
3435  */
3436 static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3437 {
3438         struct rbd_snap *snap;
3439         int ret = 0;
3440
3441         dout("%s:\n", __func__);
3442         if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3443                 return -EIO;
3444
3445         list_for_each_entry(snap, &rbd_dev->snaps, node) {
3446                 if (!rbd_snap_registered(snap)) {
3447                         ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3448                         if (ret < 0)
3449                                 break;
3450                 }
3451         }
3452         dout("%s: returning %d\n", __func__, ret);
3453
3454         return ret;
3455 }
3456
3457 static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3458 {
3459         struct device *dev;
3460         int ret;
3461
3462         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3463
3464         dev = &rbd_dev->dev;
3465         dev->bus = &rbd_bus_type;
3466         dev->type = &rbd_device_type;
3467         dev->parent = &rbd_root_dev;
3468         dev->release = rbd_dev_release;
3469         dev_set_name(dev, "%d", rbd_dev->dev_id);
3470         ret = device_register(dev);
3471
3472         mutex_unlock(&ctl_mutex);
3473
3474         return ret;
3475 }
3476
3477 static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3478 {
3479         device_unregister(&rbd_dev->dev);
3480 }
3481
3482 static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
3483
3484 /*
3485  * Get a unique rbd identifier for the given new rbd_dev, and add
3486  * the rbd_dev to the global list.  The minimum rbd id is 1.
3487  */
3488 static void rbd_dev_id_get(struct rbd_device *rbd_dev)
3489 {
3490         rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
3491
3492         spin_lock(&rbd_dev_list_lock);
3493         list_add_tail(&rbd_dev->node, &rbd_dev_list);
3494         spin_unlock(&rbd_dev_list_lock);
3495         dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3496                 (unsigned long long) rbd_dev->dev_id);
3497 }
3498
3499 /*
3500  * Remove an rbd_dev from the global list, and record that its
3501  * identifier is no longer in use.
3502  */
3503 static void rbd_dev_id_put(struct rbd_device *rbd_dev)
3504 {
3505         struct list_head *tmp;
3506         int rbd_id = rbd_dev->dev_id;
3507         int max_id;
3508
3509         rbd_assert(rbd_id > 0);
3510
3511         dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3512                 (unsigned long long) rbd_dev->dev_id);
3513         spin_lock(&rbd_dev_list_lock);
3514         list_del_init(&rbd_dev->node);
3515
3516         /*
3517          * If the id being "put" is not the current maximum, there
3518          * is nothing special we need to do.
3519          */
3520         if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
3521                 spin_unlock(&rbd_dev_list_lock);
3522                 return;
3523         }
3524
3525         /*
3526          * We need to update the current maximum id.  Search the
3527          * list to find out what it is.  We're more likely to find
3528          * the maximum at the end, so search the list backward.
3529          */
3530         max_id = 0;
3531         list_for_each_prev(tmp, &rbd_dev_list) {
3532                 struct rbd_device *rbd_dev;
3533
3534                 rbd_dev = list_entry(tmp, struct rbd_device, node);
3535                 if (rbd_dev->dev_id > max_id)
3536                         max_id = rbd_dev->dev_id;
3537         }
3538         spin_unlock(&rbd_dev_list_lock);
3539
3540         /*
3541          * The max id could have been updated by rbd_dev_id_get(), in
3542          * which case it now accurately reflects the new maximum.
3543          * Be careful not to overwrite the maximum value in that
3544          * case.
3545          */
3546         atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3547         dout("  max dev id has been reset\n");
3548 }
3549
/*
 * Advance *buf past any leading white space, leaving it pointing at
 * the first character of the next token (or at the terminating '\0'
 * when there is no further token).  Returns the length of that
 * token, i.e. the number of consecutive non-white-space characters
 * starting at the updated *buf.  The string at *buf must be
 * '\0'-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * Exactly the characters for which isspace() is nonzero in
	 * the "C" and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* Skip to token start */
	*buf = start;

	return strcspn(start, whitespace);	/* Length of the token */
}
3568
/*
 * Locate the next token in the '\0'-terminated string at *buf and,
 * provided it fits, copy it into the caller's token buffer with a
 * terminating '\0' appended.
 *
 * The return value is the length of the token found, not counting
 * the '\0': 0 means there was no token, and a value >= token_size
 * means the token (plus terminator) did not fit, in which case the
 * token buffer is left untouched.
 *
 * In every case, including the "did not fit" case, *buf is moved to
 * just past the end of the token that was found.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);
	const char *start = *buf;

	*buf = start + len;
	if (len >= token_size)
		return len;		/* Too big for the token buffer */

	memcpy(token, start, len);
	token[len] = '\0';

	return len;
}
3598
3599 /*
3600  * Finds the next token in *buf, dynamically allocates a buffer big
3601  * enough to hold a copy of it, and copies the token into the new
3602  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
3603  * that a duplicate buffer is created even for a zero-length token.
3604  *
3605  * Returns a pointer to the newly-allocated duplicate, or a null
3606  * pointer if memory for the duplicate was not available.  If
3607  * the lenp argument is a non-null pointer, the length of the token
3608  * (not including the '\0') is returned in *lenp.
3609  *
3610  * If successful, the *buf pointer will be updated to point beyond
3611  * the end of the found token.
3612  *
3613  * Note: uses GFP_KERNEL for allocation.
3614  */
3615 static inline char *dup_token(const char **buf, size_t *lenp)
3616 {
3617         char *dup;
3618         size_t len;
3619
3620         len = next_token(buf);
3621         dup = kmemdup(*buf, len + 1, GFP_KERNEL);
3622         if (!dup)
3623                 return NULL;
3624         *(dup + len) = '\0';
3625         *buf += len;
3626
3627         if (lenp)
3628                 *lenp = len;
3629
3630         return dup;
3631 }
3632
3633 /*
3634  * Parse the options provided for an "rbd add" (i.e., rbd image
3635  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
3636  * and the data written is passed here via a NUL-terminated buffer.
3637  * Returns 0 if successful or an error code otherwise.
3638  *
3639  * The information extracted from these options is recorded in
3640  * the other parameters which return dynamically-allocated
3641  * structures:
3642  *  ceph_opts
3643  *      The address of a pointer that will refer to a ceph options
3644  *      structure.  Caller must release the returned pointer using
3645  *      ceph_destroy_options() when it is no longer needed.
3646  *  rbd_opts
3647  *      Address of an rbd options pointer.  Fully initialized by
3648  *      this function; caller must release with kfree().
3649  *  spec
3650  *      Address of an rbd image specification pointer.  Fully
3651  *      initialized by this function based on parsed options.
3652  *      Caller must release with rbd_spec_put().
3653  *
3654  * The options passed take this form:
3655  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3656  * where:
3657  *  <mon_addrs>
3658  *      A comma-separated list of one or more monitor addresses.
3659  *      A monitor address is an ip address, optionally followed
3660  *      by a port number (separated by a colon).
3661  *        I.e.:  ip1[:port1][,ip2[:port2]...]
3662  *  <options>
3663  *      A comma-separated list of ceph and/or rbd options.
3664  *  <pool_name>
3665  *      The name of the rados pool containing the rbd image.
3666  *  <image_name>
3667  *      The name of the image in that pool to map.
3668  *  <snap_id>
3669  *      An optional snapshot id.  If provided, the mapping will
3670  *      present data from the image at the time that snapshot was
3671  *      created.  The image head is used if no snapshot id is
3672  *      provided.  Snapshot mappings are always read-only.
3673  */
3674 static int rbd_add_parse_args(const char *buf,
3675                                 struct ceph_options **ceph_opts,
3676                                 struct rbd_options **opts,
3677                                 struct rbd_spec **rbd_spec)
3678 {
3679         size_t len;
3680         char *options;
3681         const char *mon_addrs;
3682         size_t mon_addrs_size;
3683         struct rbd_spec *spec = NULL;
3684         struct rbd_options *rbd_opts = NULL;
3685         struct ceph_options *copts;
3686         int ret;
3687
3688         /* The first four tokens are required */
3689
3690         len = next_token(&buf);
3691         if (!len) {
3692                 rbd_warn(NULL, "no monitor address(es) provided");
3693                 return -EINVAL;
3694         }
3695         mon_addrs = buf;
3696         mon_addrs_size = len + 1;
3697         buf += len;
3698
3699         ret = -EINVAL;
3700         options = dup_token(&buf, NULL);
3701         if (!options)
3702                 return -ENOMEM;
3703         if (!*options) {
3704                 rbd_warn(NULL, "no options provided");
3705                 goto out_err;
3706         }
3707
3708         spec = rbd_spec_alloc();
3709         if (!spec)
3710                 goto out_mem;
3711
3712         spec->pool_name = dup_token(&buf, NULL);
3713         if (!spec->pool_name)
3714                 goto out_mem;
3715         if (!*spec->pool_name) {
3716                 rbd_warn(NULL, "no pool name provided");
3717                 goto out_err;
3718         }
3719
3720         spec->image_name = dup_token(&buf, NULL);
3721         if (!spec->image_name)
3722                 goto out_mem;
3723         if (!*spec->image_name) {
3724                 rbd_warn(NULL, "no image name provided");
3725                 goto out_err;
3726         }
3727
3728         /*
3729          * Snapshot name is optional; default is to use "-"
3730          * (indicating the head/no snapshot).
3731          */
3732         len = next_token(&buf);
3733         if (!len) {
3734                 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
3735                 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
3736         } else if (len > RBD_MAX_SNAP_NAME_LEN) {
3737                 ret = -ENAMETOOLONG;
3738                 goto out_err;
3739         }
3740         spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
3741         if (!spec->snap_name)
3742                 goto out_mem;
3743         *(spec->snap_name + len) = '\0';
3744
3745         /* Initialize all rbd options to the defaults */
3746
3747         rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
3748         if (!rbd_opts)
3749                 goto out_mem;
3750
3751         rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
3752
3753         copts = ceph_parse_options(options, mon_addrs,
3754                                         mon_addrs + mon_addrs_size - 1,
3755                                         parse_rbd_opts_token, rbd_opts);
3756         if (IS_ERR(copts)) {
3757                 ret = PTR_ERR(copts);
3758                 goto out_err;
3759         }
3760         kfree(options);
3761
3762         *ceph_opts = copts;
3763         *opts = rbd_opts;
3764         *rbd_spec = spec;
3765
3766         return 0;
3767 out_mem:
3768         ret = -ENOMEM;
3769 out_err:
3770         kfree(rbd_opts);
3771         rbd_spec_put(spec);
3772         kfree(options);
3773
3774         return ret;
3775 }
3776
3777 /*
3778  * An rbd format 2 image has a unique identifier, distinct from the
3779  * name given to it by the user.  Internally, that identifier is
3780  * what's used to specify the names of objects related to the image.
3781  *
3782  * A special "rbd id" object is used to map an rbd image name to its
3783  * id.  If that object doesn't exist, then there is no v2 rbd image
3784  * with the supplied name.
3785  *
3786  * This function will record the given rbd_dev's image_id field if
3787  * it can be determined, and in that case will return 0.  If any
3788  * errors occur a negative errno will be returned and the rbd_dev's
3789  * image_id field will be unchanged (and should be NULL).
3790  */
3791 static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3792 {
3793         int ret;
3794         size_t size;
3795         char *object_name;
3796         void *response;
3797         void *p;
3798
3799         /*
3800          * When probing a parent image, the image id is already
3801          * known (and the image name likely is not).  There's no
3802          * need to fetch the image id again in this case.
3803          */
3804         if (rbd_dev->spec->image_id)
3805                 return 0;
3806
3807         /*
3808          * First, see if the format 2 image id file exists, and if
3809          * so, get the image's persistent id from it.
3810          */
3811         size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
3812         object_name = kmalloc(size, GFP_NOIO);
3813         if (!object_name)
3814                 return -ENOMEM;
3815         sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
3816         dout("rbd id object name is %s\n", object_name);
3817
3818         /* Response will be an encoded string, which includes a length */
3819
3820         size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3821         response = kzalloc(size, GFP_NOIO);
3822         if (!response) {
3823                 ret = -ENOMEM;
3824                 goto out;
3825         }
3826
3827         ret = rbd_obj_method_sync(rbd_dev, object_name,
3828                                 "rbd", "get_id",
3829                                 NULL, 0,
3830                                 response, RBD_IMAGE_ID_LEN_MAX, NULL);
3831         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3832         if (ret < 0)
3833                 goto out;
3834
3835         p = response;
3836         rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
3837                                                 p + RBD_IMAGE_ID_LEN_MAX,
3838                                                 NULL, GFP_NOIO);
3839         if (IS_ERR(rbd_dev->spec->image_id)) {
3840                 ret = PTR_ERR(rbd_dev->spec->image_id);
3841                 rbd_dev->spec->image_id = NULL;
3842         } else {
3843                 dout("image_id is %s\n", rbd_dev->spec->image_id);
3844         }
3845 out:
3846         kfree(response);
3847         kfree(object_name);
3848
3849         return ret;
3850 }
3851
3852 static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3853 {
3854         int ret;
3855         size_t size;
3856
3857         /* Version 1 images have no id; empty string is used */
3858
3859         rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3860         if (!rbd_dev->spec->image_id)
3861                 return -ENOMEM;
3862
3863         /* Record the header object name for this rbd image. */
3864
3865         size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
3866         rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3867         if (!rbd_dev->header_name) {
3868                 ret = -ENOMEM;
3869                 goto out_err;
3870         }
3871         sprintf(rbd_dev->header_name, "%s%s",
3872                 rbd_dev->spec->image_name, RBD_SUFFIX);
3873
3874         /* Populate rbd image metadata */
3875
3876         ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3877         if (ret < 0)
3878                 goto out_err;
3879
3880         /* Version 1 images have no parent (no layering) */
3881
3882         rbd_dev->parent_spec = NULL;
3883         rbd_dev->parent_overlap = 0;
3884
3885         rbd_dev->image_format = 1;
3886
3887         dout("discovered version 1 image, header name is %s\n",
3888                 rbd_dev->header_name);
3889
3890         return 0;
3891
3892 out_err:
3893         kfree(rbd_dev->header_name);
3894         rbd_dev->header_name = NULL;
3895         kfree(rbd_dev->spec->image_id);
3896         rbd_dev->spec->image_id = NULL;
3897
3898         return ret;
3899 }
3900
3901 static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3902 {
3903         size_t size;
3904         int ret;
3905         u64 ver = 0;
3906
3907         /*
3908          * Image id was filled in by the caller.  Record the header
3909          * object name for this rbd image.
3910          */
3911         size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
3912         rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3913         if (!rbd_dev->header_name)
3914                 return -ENOMEM;
3915         sprintf(rbd_dev->header_name, "%s%s",
3916                         RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
3917
3918         /* Get the size and object order for the image */
3919
3920         ret = rbd_dev_v2_image_size(rbd_dev);
3921         if (ret < 0)
3922                 goto out_err;
3923
3924         /* Get the object prefix (a.k.a. block_name) for the image */
3925
3926         ret = rbd_dev_v2_object_prefix(rbd_dev);
3927         if (ret < 0)
3928                 goto out_err;
3929
3930         /* Get the and check features for the image */
3931
3932         ret = rbd_dev_v2_features(rbd_dev);
3933         if (ret < 0)
3934                 goto out_err;
3935
3936         /* If the image supports layering, get the parent info */
3937
3938         if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
3939                 ret = rbd_dev_v2_parent_info(rbd_dev);
3940                 if (ret < 0)
3941                         goto out_err;
3942         }
3943
3944         /* crypto and compression type aren't (yet) supported for v2 images */
3945
3946         rbd_dev->header.crypt_type = 0;
3947         rbd_dev->header.comp_type = 0;
3948
3949         /* Get the snapshot context, plus the header version */
3950
3951         ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
3952         if (ret)
3953                 goto out_err;
3954         rbd_dev->header.obj_version = ver;
3955
3956         rbd_dev->image_format = 2;
3957
3958         dout("discovered version 2 image, header name is %s\n",
3959                 rbd_dev->header_name);
3960
3961         return 0;
3962 out_err:
3963         rbd_dev->parent_overlap = 0;
3964         rbd_spec_put(rbd_dev->parent_spec);
3965         rbd_dev->parent_spec = NULL;
3966         kfree(rbd_dev->header_name);
3967         rbd_dev->header_name = NULL;
3968         kfree(rbd_dev->header.object_prefix);
3969         rbd_dev->header.object_prefix = NULL;
3970
3971         return ret;
3972 }
3973
3974 static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
3975 {
3976         int ret;
3977
3978         /* no need to lock here, as rbd_dev is not registered yet */
3979         ret = rbd_dev_snaps_update(rbd_dev);
3980         if (ret)
3981                 return ret;
3982
3983         ret = rbd_dev_probe_update_spec(rbd_dev);
3984         if (ret)
3985                 goto err_out_snaps;
3986
3987         ret = rbd_dev_set_mapping(rbd_dev);
3988         if (ret)
3989                 goto err_out_snaps;
3990
3991         /* generate unique id: find highest unique id, add one */
3992         rbd_dev_id_get(rbd_dev);
3993
3994         /* Fill in the device name, now that we have its id. */
3995         BUILD_BUG_ON(DEV_NAME_LEN
3996                         < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
3997         sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
3998
3999         /* Get our block major device number. */
4000
4001         ret = register_blkdev(0, rbd_dev->name);
4002         if (ret < 0)
4003                 goto err_out_id;
4004         rbd_dev->major = ret;
4005
4006         /* Set up the blkdev mapping. */
4007
4008         ret = rbd_init_disk(rbd_dev);
4009         if (ret)
4010                 goto err_out_blkdev;
4011
4012         ret = rbd_bus_add_dev(rbd_dev);
4013         if (ret)
4014                 goto err_out_disk;
4015
4016         /*
4017          * At this point cleanup in the event of an error is the job
4018          * of the sysfs code (initiated by rbd_bus_del_dev()).
4019          */
4020         down_write(&rbd_dev->header_rwsem);
4021         ret = rbd_dev_snaps_register(rbd_dev);
4022         up_write(&rbd_dev->header_rwsem);
4023         if (ret)
4024                 goto err_out_bus;
4025
4026         ret = rbd_dev_header_watch_sync(rbd_dev, 1);
4027         if (ret)
4028                 goto err_out_bus;
4029
4030         /* Everything's ready.  Announce the disk to the world. */
4031
4032         add_disk(rbd_dev->disk);
4033
4034         pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
4035                 (unsigned long long) rbd_dev->mapping.size);
4036
4037         return ret;
4038 err_out_bus:
4039         /* this will also clean up rest of rbd_dev stuff */
4040
4041         rbd_bus_del_dev(rbd_dev);
4042
4043         return ret;
4044 err_out_disk:
4045         rbd_free_disk(rbd_dev);
4046 err_out_blkdev:
4047         unregister_blkdev(rbd_dev->major, rbd_dev->name);
4048 err_out_id:
4049         rbd_dev_id_put(rbd_dev);
4050 err_out_snaps:
4051         rbd_remove_all_snaps(rbd_dev);
4052
4053         return ret;
4054 }
4055
4056 /*
4057  * Probe for the existence of the header object for the given rbd
4058  * device.  For format 2 images this includes determining the image
4059  * id.
4060  */
4061 static int rbd_dev_probe(struct rbd_device *rbd_dev)
4062 {
4063         int ret;
4064
4065         /*
4066          * Get the id from the image id object.  If it's not a
4067          * format 2 image, we'll get ENOENT back, and we'll assume
4068          * it's a format 1 image.
4069          */
4070         ret = rbd_dev_image_id(rbd_dev);
4071         if (ret)
4072                 ret = rbd_dev_v1_probe(rbd_dev);
4073         else
4074                 ret = rbd_dev_v2_probe(rbd_dev);
4075         if (ret) {
4076                 dout("probe failed, returning %d\n", ret);
4077
4078                 return ret;
4079         }
4080
4081         ret = rbd_dev_probe_finish(rbd_dev);
4082         if (ret)
4083                 rbd_header_free(&rbd_dev->header);
4084
4085         return ret;
4086 }
4087
4088 static ssize_t rbd_add(struct bus_type *bus,
4089                        const char *buf,
4090                        size_t count)
4091 {
4092         struct rbd_device *rbd_dev = NULL;
4093         struct ceph_options *ceph_opts = NULL;
4094         struct rbd_options *rbd_opts = NULL;
4095         struct rbd_spec *spec = NULL;
4096         struct rbd_client *rbdc;
4097         struct ceph_osd_client *osdc;
4098         int rc = -ENOMEM;
4099
4100         if (!try_module_get(THIS_MODULE))
4101                 return -ENODEV;
4102
4103         /* parse add command */
4104         rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4105         if (rc < 0)
4106                 goto err_out_module;
4107
4108         rbdc = rbd_get_client(ceph_opts);
4109         if (IS_ERR(rbdc)) {
4110                 rc = PTR_ERR(rbdc);
4111                 goto err_out_args;
4112         }
4113         ceph_opts = NULL;       /* rbd_dev client now owns this */
4114
4115         /* pick the pool */
4116         osdc = &rbdc->client->osdc;
4117         rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4118         if (rc < 0)
4119                 goto err_out_client;
4120         spec->pool_id = (u64) rc;
4121
4122         /* The ceph file layout needs to fit pool id in 32 bits */
4123
4124         if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
4125                 rc = -EIO;
4126                 goto err_out_client;
4127         }
4128
4129         rbd_dev = rbd_dev_create(rbdc, spec);
4130         if (!rbd_dev)
4131                 goto err_out_client;
4132         rbdc = NULL;            /* rbd_dev now owns this */
4133         spec = NULL;            /* rbd_dev now owns this */
4134
4135         rbd_dev->mapping.read_only = rbd_opts->read_only;
4136         kfree(rbd_opts);
4137         rbd_opts = NULL;        /* done with this */
4138
4139         rc = rbd_dev_probe(rbd_dev);
4140         if (rc < 0)
4141                 goto err_out_rbd_dev;
4142
4143         return count;
4144 err_out_rbd_dev:
4145         rbd_dev_destroy(rbd_dev);
4146 err_out_client:
4147         rbd_put_client(rbdc);
4148 err_out_args:
4149         if (ceph_opts)
4150                 ceph_destroy_options(ceph_opts);
4151         kfree(rbd_opts);
4152         rbd_spec_put(spec);
4153 err_out_module:
4154         module_put(THIS_MODULE);
4155
4156         dout("Error adding device %s\n", buf);
4157
4158         return (ssize_t) rc;
4159 }
4160
4161 static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4162 {
4163         struct list_head *tmp;
4164         struct rbd_device *rbd_dev;
4165
4166         spin_lock(&rbd_dev_list_lock);
4167         list_for_each(tmp, &rbd_dev_list) {
4168                 rbd_dev = list_entry(tmp, struct rbd_device, node);
4169                 if (rbd_dev->dev_id == dev_id) {
4170                         spin_unlock(&rbd_dev_list_lock);
4171                         return rbd_dev;
4172                 }
4173         }
4174         spin_unlock(&rbd_dev_list_lock);
4175         return NULL;
4176 }
4177
4178 static void rbd_dev_release(struct device *dev)
4179 {
4180         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4181
4182         if (rbd_dev->watch_event)
4183                 rbd_dev_header_watch_sync(rbd_dev, 0);
4184
4185         /* clean up and free blkdev */
4186         rbd_free_disk(rbd_dev);
4187         unregister_blkdev(rbd_dev->major, rbd_dev->name);
4188
4189         /* release allocated disk header fields */
4190         rbd_header_free(&rbd_dev->header);
4191
4192         /* done with the id, and with the rbd_dev */
4193         rbd_dev_id_put(rbd_dev);
4194         rbd_assert(rbd_dev->rbd_client != NULL);
4195         rbd_dev_destroy(rbd_dev);
4196
4197         /* release module ref */
4198         module_put(THIS_MODULE);
4199 }
4200
4201 static ssize_t rbd_remove(struct bus_type *bus,
4202                           const char *buf,
4203                           size_t count)
4204 {
4205         struct rbd_device *rbd_dev = NULL;
4206         int target_id, rc;
4207         unsigned long ul;
4208         int ret = count;
4209
4210         rc = strict_strtoul(buf, 10, &ul);
4211         if (rc)
4212                 return rc;
4213
4214         /* convert to int; abort if we lost anything in the conversion */
4215         target_id = (int) ul;
4216         if (target_id != ul)
4217                 return -EINVAL;
4218
4219         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4220
4221         rbd_dev = __rbd_get_dev(target_id);
4222         if (!rbd_dev) {
4223                 ret = -ENOENT;
4224                 goto done;
4225         }
4226
4227         spin_lock_irq(&rbd_dev->lock);
4228         if (rbd_dev->open_count)
4229                 ret = -EBUSY;
4230         else
4231                 set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4232         spin_unlock_irq(&rbd_dev->lock);
4233         if (ret < 0)
4234                 goto done;
4235
4236         rbd_remove_all_snaps(rbd_dev);
4237         rbd_bus_del_dev(rbd_dev);
4238
4239 done:
4240         mutex_unlock(&ctl_mutex);
4241
4242         return ret;
4243 }
4244
4245 /*
4246  * create control files in sysfs
4247  * /sys/bus/rbd/...
4248  */
4249 static int rbd_sysfs_init(void)
4250 {
4251         int ret;
4252
4253         ret = device_register(&rbd_root_dev);
4254         if (ret < 0)
4255                 return ret;
4256
4257         ret = bus_register(&rbd_bus_type);
4258         if (ret < 0)
4259                 device_unregister(&rbd_root_dev);
4260
4261         return ret;
4262 }
4263
4264 static void rbd_sysfs_cleanup(void)
4265 {
4266         bus_unregister(&rbd_bus_type);
4267         device_unregister(&rbd_root_dev);
4268 }
4269
4270 static int __init rbd_init(void)
4271 {
4272         int rc;
4273
4274         if (!libceph_compatible(NULL)) {
4275                 rbd_warn(NULL, "libceph incompatibility (quitting)");
4276
4277                 return -EINVAL;
4278         }
4279         rc = rbd_sysfs_init();
4280         if (rc)
4281                 return rc;
4282         pr_info("loaded " RBD_DRV_NAME_LONG "\n");
4283         return 0;
4284 }
4285
4286 static void __exit rbd_exit(void)
4287 {
4288         rbd_sysfs_cleanup();
4289 }
4290
/* Hook rbd_init()/rbd_exit() up as the module's entry and exit points */
module_init(rbd_init);
module_exit(rbd_exit);

/* Module metadata */
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");