Pileus Git - ~andy/linux/blob - drivers/block/rbd.c

   1
   2 /*
   3    rbd.c -- Export ceph rados objects as a Linux block device
   4
   5
   6    based on drivers/block/osdblk.c:
   7
   8    Copyright 2009 Red Hat, Inc.
   9
  10    This program is free software; you can redistribute it and/or modify
  11    it under the terms of the GNU General Public License as published by
  12    the Free Software Foundation.
  13
  14    This program is distributed in the hope that it will be useful,
  15    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17    GNU General Public License for more details.
  18
  19    You should have received a copy of the GNU General Public License
  20    along with this program; see the file COPYING.  If not, write to
  21    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  22
  23
  24
  25    For usage instructions, please refer to:
  26
  27                  Documentation/ABI/testing/sysfs-bus-rbd
  28
  29  */
  30
  31 #include <linux/ceph/libceph.h>
  32 #include <linux/ceph/osd_client.h>
  33 #include <linux/ceph/mon_client.h>
  34 #include <linux/ceph/decode.h>
  35 #include <linux/parser.h>
  36 #include <linux/bsearch.h>
  37
  38 #include <linux/kernel.h>
  39 #include <linux/device.h>
  40 #include <linux/module.h>
  41 #include <linux/fs.h>
  42 #include <linux/blkdev.h>
  43 #include <linux/slab.h>
  44
  45 #include "rbd_types.h"
  46
  /* Defining RBD_DEBUG turns rbd_assert() into a real check. */
  #define RBD_DEBUG

  /*
   * Block I/O is counted in sectors.  The blk, bio and genhd layers
   * each interpret a sector in their own context, but everywhere the
   * default is 512 bytes.  Naming the shift and the size is only
   * slightly more meaningful than writing the bare numbers, but it
   * is more meaningful.
   */
  #define SECTOR_SHIFT    9
  #define SECTOR_SIZE     (1ULL << SECTOR_SHIFT)
  57
  58 /*
  59  * Increment the given counter and return its updated value.
  60  * If the counter is already 0 it will not be incremented.
  61  * If the counter is already at its maximum value returns
  62  * -EINVAL without updating it.
  63  */
  64 static int atomic_inc_return_safe(atomic_t *v)
  65 {
  66         unsigned int counter;
  67
  68         counter = (unsigned int)__atomic_add_unless(v, 1, 0);
  69         if (counter <= (unsigned int)INT_MAX)
  70                 return (int)counter;
  71
  72         atomic_dec(v);
  73
  74         return -EINVAL;
  75 }
  76
  77 /* Decrement the counter.  Return the resulting value, or -EINVAL */
  78 static int atomic_dec_return_safe(atomic_t *v)
  79 {
  80         int counter;
  81
  82         counter = atomic_dec_return(v);
  83         if (counter >= 0)
  84                 return counter;
  85
  86         atomic_inc(v);
  87
  88         return -EINVAL;
  89 }
  90
  /* Driver name, short and long forms */
  #define RBD_DRV_NAME            "rbd"
  #define RBD_DRV_NAME_LONG       "rbd (rados block device)"

  #define RBD_MINORS_PER_MAJOR    256     /* max minors per blkdev */

  /* The prefix length (without its NUL) is taken out of NAME_MAX */
  #define RBD_SNAP_DEV_NAME_PREFIX        "snap_"
  #define RBD_MAX_SNAP_NAME_LEN \
          (NAME_MAX - (sizeof(RBD_SNAP_DEV_NAME_PREFIX) - 1))

  #define RBD_MAX_SNAP_COUNT      510     /* allows max snapc to fit in 4KB */

  #define RBD_SNAP_HEAD_NAME      "-"

  #define BAD_SNAP_INDEX          U32_MAX /* invalid index into snap array */

  /* Sized so that one page can hold an image name sent by an OSD */
  #define RBD_IMAGE_NAME_LEN_MAX  (PAGE_SIZE - sizeof(__le32) - 1)
  #define RBD_IMAGE_ID_LEN_MAX    64

  #define RBD_OBJ_PREFIX_LEN_MAX  64

  /* Image feature bits */
  #define RBD_FEATURE_LAYERING    (1 << 0)
  #define RBD_FEATURE_STRIPINGV2  (1 << 1)
  #define RBD_FEATURES_ALL \
          (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

  /* The subset of features this client implementation supports */
  #define RBD_FEATURES_SUPPORTED  (RBD_FEATURES_ALL)

  /*
   * rbd devices are named "rbd#": RBD_DRV_NAME followed by a unique
   * integer id.  MAX_INT_FORMAT_WIDTH is used to make sure
   * DEV_NAME_LEN is large enough for every possible device name.
   */
  #define DEV_NAME_LEN            32
  #define MAX_INT_FORMAT_WIDTH    ((5 * sizeof(int)) / 2 + 1)
 131
 132 /*
 133  * block device image metadata (in-memory version)
 134  */
 135 struct rbd_image_header {
 136         /* These six fields never change for a given rbd image */
 137         char *object_prefix;
 138         __u8 obj_order;
 139         __u8 crypt_type;
 140         __u8 comp_type;
 141         u64 stripe_unit;
 142         u64 stripe_count;
 143         u64 features;           /* Might be changeable someday? */
 144
 145         /* The remaining fields need to be updated occasionally */
 146         u64 image_size;
 147         struct ceph_snap_context *snapc;
 148         char *snap_names;       /* format 1 only */
 149         u64 *snap_sizes;        /* format 1 only */
 150 };
 151
 152 /*
 153  * An rbd image specification.
 154  *
 155  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 156  * identify an image.  Each rbd_dev structure includes a pointer to
 157  * an rbd_spec structure that encapsulates this identity.
 158  *
 159  * Each of the id's in an rbd_spec has an associated name.  For a
 160  * user-mapped image, the names are supplied and the id's associated
 161  * with them are looked up.  For a layered image, a parent image is
 162  * defined by the tuple, and the names are looked up.
 163  *
 164  * An rbd_dev structure contains a parent_spec pointer which is
 165  * non-null if the image it represents is a child in a layered
 166  * image.  This pointer will refer to the rbd_spec structure used
 167  * by the parent rbd_dev for its own identity (i.e., the structure
 168  * is shared between the parent and child).
 169  *
 170  * Since these structures are populated once, during the discovery
 171  * phase of image construction, they are effectively immutable so
 172  * we make no effort to synchronize access to them.
 173  *
 174  * Note that code herein does not assume the image name is known (it
 175  * could be a null pointer).
 176  */
 177 struct rbd_spec {
 178         u64             pool_id;
 179         const char      *pool_name;
 180
 181         const char      *image_id;
 182         const char      *image_name;
 183
 184         u64             snap_id;
 185         const char      *snap_name;
 186
 187         struct kref     kref;
 188 };
 189
 190 /*
 191  * an instance of the client.  multiple devices may share an rbd client.
 192  */
 193 struct rbd_client {
 194         struct ceph_client      *client;
 195         struct kref             kref;
 196         struct list_head        node;
 197 };
 198
 /* Callback signatures for image requests and object requests */
 struct rbd_img_request;
 typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

 struct rbd_obj_request;
 typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

 /* "which" value of an object request with no slot in an image request */
 #define BAD_WHICH       U32_MAX         /* Good which or bad which, which? */
 206
 /* The kind of data, if any, that an object request carries */
 enum obj_request_type {
         OBJ_REQUEST_NODATA,
         OBJ_REQUEST_BIO,
         OBJ_REQUEST_PAGES
 };
 210
 /* Flag bit numbers used with an object request */
 enum obj_req_flags {
         OBJ_REQ_DONE,           /* completion flag: not done = 0, done = 1 */
         OBJ_REQ_IMG_DATA,       /* object usage: standalone = 0, image = 1 */
         OBJ_REQ_KNOWN,          /* EXISTS flag valid: no = 0, yes = 1 */
         OBJ_REQ_EXISTS,         /* target exists: no = 0, yes = 1 */
 };
 217
 218 struct rbd_obj_request {
 219         const char              *object_name;
 220         u64                     offset;         /* object start byte */
 221         u64                     length;         /* bytes from offset */
 222         unsigned long           flags;
 223
 224         /*
 225          * An object request associated with an image will have its
 226          * img_data flag set; a standalone object request will not.
 227          *
 228          * A standalone object request will have which == BAD_WHICH
 229          * and a null obj_request pointer.
 230          *
 231          * An object request initiated in support of a layered image
 232          * object (to check for its existence before a write) will
 233          * have which == BAD_WHICH and a non-null obj_request pointer.
 234          *
 235          * Finally, an object request for rbd image data will have
 236          * which != BAD_WHICH, and will have a non-null img_request
 237          * pointer.  The value of which will be in the range
 238          * 0..(img_request->obj_request_count-1).
 239          */
 240         union {
 241                 struct rbd_obj_request  *obj_request;   /* STAT op */
 242                 struct {
 243                         struct rbd_img_request  *img_request;
 244                         u64                     img_offset;
 245                         /* links for img_request->obj_requests list */
 246                         struct list_head        links;
 247                 };
 248         };
 249         u32                     which;          /* posn image request list */
 250
 251         enum obj_request_type   type;
 252         union {
 253                 struct bio      *bio_list;
 254                 struct {
 255                         struct page     **pages;
 256                         u32             page_count;
 257                 };
 258         };
 259         struct page             **copyup_pages;
 260         u32                     copyup_page_count;
 261
 262         struct ceph_osd_request *osd_req;
 263
 264         u64                     xferred;        /* bytes transferred */
 265         int                     result;
 266
 267         rbd_obj_callback_t      callback;
 268         struct completion       completion;
 269
 270         struct kref             kref;
 271 };
 272
 /* Flag bit numbers used with an image request */
 enum img_req_flags {
         IMG_REQ_WRITE,          /* I/O direction: read = 0, write = 1 */
         IMG_REQ_CHILD,          /* initiator: block = 0, child image = 1 */
         IMG_REQ_LAYERED,        /* ENOENT handling: normal = 0, layered = 1 */
 };
 278
 279 struct rbd_img_request {
 280         struct rbd_device       *rbd_dev;
 281         u64                     offset; /* starting image byte offset */
 282         u64                     length; /* byte count from offset */
 283         unsigned long           flags;
 284         union {
 285                 u64                     snap_id;        /* for reads */
 286                 struct ceph_snap_context *snapc;        /* for writes */
 287         };
 288         union {
 289                 struct request          *rq;            /* block request */
 290                 struct rbd_obj_request  *obj_request;   /* obj req initiator */
 291         };
 292         struct page             **copyup_pages;
 293         u32                     copyup_page_count;
 294         spinlock_t              completion_lock;/* protects next_completion */
 295         u32                     next_completion;
 296         rbd_img_callback_t      callback;
 297         u64                     xferred;/* aggregate bytes transferred */
 298         int                     result; /* first nonzero obj_request result */
 299
 300         u32                     obj_request_count;
 301         struct list_head        obj_requests;   /* rbd_obj_request structs */
 302
 303         struct kref             kref;
 304 };
 305
 /*
  * Iterators over the object requests linked on an image request's
  * obj_requests list.  Note that the _safe variant is built on
  * list_for_each_entry_safe_reverse(), i.e. it walks the list
  * backwards.
  */
 #define for_each_obj_request(ireq, oreq)                                \
         list_for_each_entry(oreq, &(ireq)->obj_requests, links)
 #define for_each_obj_request_from(ireq, oreq)                           \
         list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
 #define for_each_obj_request_safe(ireq, oreq, n)                        \
         list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
 312
 313 struct rbd_mapping {
 314         u64                     size;
 315         u64                     features;
 316         bool                    read_only;
 317 };
 318
 319 /*
 320  * a single device
 321  */
 322 struct rbd_device {
 323         int                     dev_id;         /* blkdev unique id */
 324
 325         int                     major;          /* blkdev assigned major */
 326         struct gendisk          *disk;          /* blkdev's gendisk and rq */
 327
 328         u32                     image_format;   /* Either 1 or 2 */
 329         struct rbd_client       *rbd_client;
 330
 331         char                    name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
 332
 333         spinlock_t              lock;           /* queue, flags, open_count */
 334
 335         struct rbd_image_header header;
 336         unsigned long           flags;          /* possibly lock protected */
 337         struct rbd_spec         *spec;
 338
 339         char                    *header_name;
 340
 341         struct ceph_file_layout layout;
 342
 343         struct ceph_osd_event   *watch_event;
 344         struct rbd_obj_request  *watch_request;
 345
 346         struct rbd_spec         *parent_spec;
 347         u64                     parent_overlap;
 348         atomic_t                parent_ref;
 349         struct rbd_device       *parent;
 350
 351         /* protects updating the header */
 352         struct rw_semaphore     header_rwsem;
 353
 354         struct rbd_mapping      mapping;
 355
 356         struct list_head        node;
 357
 358         /* sysfs related */
 359         struct device           dev;
 360         unsigned long           open_count;     /* protected by lock */
 361 };
 362
 /*
  * Bit numbers for rbd_dev->flags.  Where atomic access is needed,
  * rbd_dev->lock provides the protection.
  *
  * At present only the "removing" flag, which is coupled with the
  * "open_count" field, needs atomic access.
  */
 enum rbd_dev_flags {
         RBD_DEV_FLAG_EXISTS,    /* mapped snapshot has not been deleted */
         RBD_DEV_FLAG_REMOVING,  /* this mapping is being removed */
 };
 374
 /* Serializes open/close/setup/teardown */
 static DEFINE_MUTEX(ctl_mutex);

 /* All rbd devices, and the lock for that list */
 static LIST_HEAD(rbd_dev_list);
 static DEFINE_SPINLOCK(rbd_dev_list_lock);

 /* All rbd clients, and the lock for that list */
 static LIST_HEAD(rbd_client_list);
 static DEFINE_SPINLOCK(rbd_client_list_lock);

 /* Slab caches for structures that are allocated frequently */
 static struct kmem_cache        *rbd_img_request_cache;
 static struct kmem_cache        *rbd_obj_request_cache;
 static struct kmem_cache        *rbd_segment_name_cache;
 388
 389 static int rbd_img_request_submit(struct rbd_img_request *img_request);
 390
 391 static void rbd_dev_device_release(struct device *dev);
 392
 393 static ssize_t rbd_add(struct bus_type *bus, const char *buf,
 394                        size_t count);
 395 static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
 396                           size_t count);
 397 static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
 398 static void rbd_spec_put(struct rbd_spec *spec);
 399
 400 static struct bus_attribute rbd_bus_attrs[] = {
 401         __ATTR(add, S_IWUSR, NULL, rbd_add),
 402         __ATTR(remove, S_IWUSR, NULL, rbd_remove),
 403         __ATTR_NULL
 404 };
 405
 406 static struct bus_type rbd_bus_type = {
 407         .name           = "rbd",
 408         .bus_attrs      = rbd_bus_attrs,
 409 };
 410
 /*
  * Release callback for rbd_root_dev.  Intentionally empty: the
  * function does nothing with the device it is handed.
  */
 static void rbd_root_dev_release(struct device *dev)
 {
 }
 414
 415 static struct device rbd_root_dev = {
 416         .init_name =    "rbd",
 417         .release =      rbd_root_dev_release,
 418 };
 419
 420 static __printf(2, 3)
 421 void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
 422 {
 423         struct va_format vaf;
 424         va_list args;
 425
 426         va_start(args, fmt);
 427         vaf.fmt = fmt;
 428         vaf.va = &args;
 429
 430         if (!rbd_dev)
 431                 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
 432         else if (rbd_dev->disk)
 433                 printk(KERN_WARNING "%s: %s: %pV\n",
 434                         RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
 435         else if (rbd_dev->spec && rbd_dev->spec->image_name)
 436                 printk(KERN_WARNING "%s: image %s: %pV\n",
 437                         RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
 438         else if (rbd_dev->spec && rbd_dev->spec->image_id)
 439                 printk(KERN_WARNING "%s: id %s: %pV\n",
 440                         RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
 441         else    /* punt */
 442                 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
 443                         RBD_DRV_NAME, rbd_dev, &vaf);
 444         va_end(args);
 445 }
 446
 /*
  * rbd_assert(expr): when RBD_DEBUG is defined, log an "Assertion
  * failure" message naming the function, line and expression, then
  * BUG(), if expr evaluates false.  Without RBD_DEBUG the macro
  * expands to a no-op and expr is not evaluated at all, so expr must
  * not have side effects the code relies on.
  *
  * The checking variant is wrapped in do { } while (0) so that it is
  * a single statement: it needs its trailing semicolon just like the
  * no-op variant, and it cannot capture an "else" that follows it.
  */
 #ifdef RBD_DEBUG
 #define rbd_assert(expr)                                                \
         do {                                                            \
                 if (unlikely(!(expr))) {                                \
                         printk(KERN_ERR "\nAssertion failure in %s() "  \
                                                 "at line %d:\n\n"       \
                                         "\trbd_assert(%s);\n\n",        \
                                         __func__, __LINE__, #expr);     \
                         BUG();                                          \
                 }                                                       \
         } while (0)
 #else /* !RBD_DEBUG */
 #  define rbd_assert(expr)      ((void) 0)
 #endif /* !RBD_DEBUG */
 459
 460 static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
 461 static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
 462 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
 463
 464 static int rbd_dev_refresh(struct rbd_device *rbd_dev);
 465 static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
 466 static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev);
 467 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
 468                                         u64 snap_id);
 469 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 470                                 u8 *order, u64 *snap_size);
 471 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 472                 u64 *snap_features);
 473 static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
 474
 475 static int rbd_open(struct block_device *bdev, fmode_t mode)
 476 {
 477         struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
 478         bool removing = false;
 479
 480         if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
 481                 return -EROFS;
 482
 483         spin_lock_irq(&rbd_dev->lock);
 484         if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
 485                 removing = true;
 486         else
 487                 rbd_dev->open_count++;
 488         spin_unlock_irq(&rbd_dev->lock);
 489         if (removing)
 490                 return -ENOENT;
 491
 492         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 493         (void) get_device(&rbd_dev->dev);
 494         set_device_ro(bdev, rbd_dev->mapping.read_only);
 495         mutex_unlock(&ctl_mutex);
 496
 497         return 0;
 498 }
 499
 500 static int rbd_release(struct gendisk *disk, fmode_t mode)
 501 {
 502         struct rbd_device *rbd_dev = disk->private_data;
 503         unsigned long open_count_before;
 504
 505         spin_lock_irq(&rbd_dev->lock);
 506         open_count_before = rbd_dev->open_count--;
 507         spin_unlock_irq(&rbd_dev->lock);
 508         rbd_assert(open_count_before > 0);
 509
 510         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 511         put_device(&rbd_dev->dev);
 512         mutex_unlock(&ctl_mutex);
 513
 514         return 0;
 515 }
 516
 517 static const struct block_device_operations rbd_bd_ops = {
 518         .owner                  = THIS_MODULE,
 519         .open                   = rbd_open,
 520         .release                = rbd_release,
 521 };
 522
 523 /*
 524  * Initialize an rbd client instance.
 525  * We own *ceph_opts.
 526  */
 527 static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
 528 {
 529         struct rbd_client *rbdc;
 530         int ret = -ENOMEM;
 531
 532         dout("%s:\n", __func__);
 533         rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
 534         if (!rbdc)
 535                 goto out_opt;
 536
 537         kref_init(&rbdc->kref);
 538         INIT_LIST_HEAD(&rbdc->node);
 539
 540         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 541
 542         rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
 543         if (IS_ERR(rbdc->client))
 544                 goto out_mutex;
 545         ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
 546
 547         ret = ceph_open_session(rbdc->client);
 548         if (ret < 0)
 549                 goto out_err;
 550
 551         spin_lock(&rbd_client_list_lock);
 552         list_add_tail(&rbdc->node, &rbd_client_list);
 553         spin_unlock(&rbd_client_list_lock);
 554
 555         mutex_unlock(&ctl_mutex);
 556         dout("%s: rbdc %p\n", __func__, rbdc);
 557
 558         return rbdc;
 559
 560 out_err:
 561         ceph_destroy_client(rbdc->client);
 562 out_mutex:
 563         mutex_unlock(&ctl_mutex);
 564         kfree(rbdc);
 565 out_opt:
 566         if (ceph_opts)
 567                 ceph_destroy_options(ceph_opts);
 568         dout("%s: error %d\n", __func__, ret);
 569
 570         return ERR_PTR(ret);
 571 }
 572
 573 static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
 574 {
 575         kref_get(&rbdc->kref);
 576
 577         return rbdc;
 578 }
 579
 580 /*
 581  * Find a ceph client with specific addr and configuration.  If
 582  * found, bump its reference count.
 583  */
 584 static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
 585 {
 586         struct rbd_client *client_node;
 587         bool found = false;
 588
 589         if (ceph_opts->flags & CEPH_OPT_NOSHARE)
 590                 return NULL;
 591
 592         spin_lock(&rbd_client_list_lock);
 593         list_for_each_entry(client_node, &rbd_client_list, node) {
 594                 if (!ceph_compare_options(ceph_opts, client_node->client)) {
 595                         __rbd_get_client(client_node);
 596
 597                         found = true;
 598                         break;
 599                 }
 600         }
 601         spin_unlock(&rbd_client_list_lock);
 602
 603         return found ? client_node : NULL;
 604 }
 605
 /*
  * Mount option tokens.  The Opt_last_* entries separate the integer,
  * string and Boolean groups; there are currently no integer or
  * string options.
  */
 enum {
         /* (no int args) */
         Opt_last_int,
         /* (no string args) */
         Opt_last_string,
         /* Boolean args */
         Opt_read_only,
         Opt_read_write,
         Opt_last_bool,
 };
 619
 620 static match_table_t rbd_opts_tokens = {
 621         /* int args above */
 622         /* string args above */
 623         {Opt_read_only, "read_only"},
 624         {Opt_read_only, "ro"},          /* Alternate spelling */
 625         {Opt_read_write, "read_write"},
 626         {Opt_read_write, "rw"},         /* Alternate spelling */
 627         /* Boolean args above */
 628         {-1, NULL}
 629 };
 630
 /* rbd-specific mapping options */
 struct rbd_options {
         bool    read_only;
 };

 /* Mappings are read-write unless asked otherwise */
 #define RBD_READ_ONLY_DEFAULT   false
 636
 637 static int parse_rbd_opts_token(char *c, void *private)
 638 {
 639         struct rbd_options *rbd_opts = private;
 640         substring_t argstr[MAX_OPT_ARGS];
 641         int token, intval, ret;
 642
 643         token = match_token(c, rbd_opts_tokens, argstr);
 644         if (token < 0)
 645                 return -EINVAL;
 646
 647         if (token < Opt_last_int) {
 648                 ret = match_int(&argstr[0], &intval);
 649                 if (ret < 0) {
 650                         pr_err("bad mount option arg (not int) "
 651                                "at '%s'\n", c);
 652                         return ret;
 653                 }
 654                 dout("got int token %d val %d\n", token, intval);
 655         } else if (token > Opt_last_int && token < Opt_last_string) {
 656                 dout("got string token %d val %s\n", token,
 657                      argstr[0].from);
 658         } else if (token > Opt_last_string && token < Opt_last_bool) {
 659                 dout("got Boolean token %d\n", token);
 660         } else {
 661                 dout("got token %d\n", token);
 662         }
 663
 664         switch (token) {
 665         case Opt_read_only:
 666                 rbd_opts->read_only = true;
 667                 break;
 668         case Opt_read_write:
 669                 rbd_opts->read_only = false;
 670                 break;
 671         default:
 672                 rbd_assert(false);
 673                 break;
 674         }
 675         return 0;
 676 }
 677
 /*
  * Return a client for the given options: an existing one found by
  * rbd_client_find() if there is one, otherwise whatever
  * rbd_client_create() returns.  When an existing client is used the
  * passed-in options are destroyed here.
  */
 static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
 {
         struct rbd_client *existing = rbd_client_find(ceph_opts);

         if (!existing)
                 return rbd_client_create(ceph_opts);

         /* using an existing client */
         ceph_destroy_options(ceph_opts);

         return existing;
 }
 694
 695 /*
 696  * Destroy ceph client
 697  *
 698  * Caller must hold rbd_client_list_lock.
 699  */
 700 static void rbd_client_release(struct kref *kref)
 701 {
 702         struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
 703
 704         dout("%s: rbdc %p\n", __func__, rbdc);
 705         spin_lock(&rbd_client_list_lock);
 706         list_del(&rbdc->node);
 707         spin_unlock(&rbd_client_list_lock);
 708
 709         ceph_destroy_client(rbdc->client);
 710         kfree(rbdc);
 711 }
 712
 713 /*
 714  * Drop reference to ceph client node. If it's not referenced anymore, release
 715  * it.
 716  */
 717 static void rbd_put_client(struct rbd_client *rbdc)
 718 {
 719         if (rbdc)
 720                 kref_put(&rbdc->kref, rbd_client_release);
 721 }
 722
 723 static bool rbd_image_format_valid(u32 image_format)
 724 {
 725         return image_format == 1 || image_format == 2;
 726 }
 727
 728 static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
 729 {
 730         size_t size;
 731         u32 snap_count;
 732
 733         /* The header has to start with the magic rbd header text */
 734         if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
 735                 return false;
 736
 737         /* The bio layer requires at least sector-sized I/O */
 738
 739         if (ondisk->options.order < SECTOR_SHIFT)
 740                 return false;
 741
 742         /* If we use u64 in a few spots we may be able to loosen this */
 743
 744         if (ondisk->options.order > 8 * sizeof (int) - 1)
 745                 return false;
 746
 747         /*
 748          * The size of a snapshot header has to fit in a size_t, and
 749          * that limits the number of snapshots.
 750          */
 751         snap_count = le32_to_cpu(ondisk->snap_count);
 752         size = SIZE_MAX - sizeof (struct ceph_snap_context);
 753         if (snap_count > size / sizeof (__le64))
 754                 return false;
 755
 756         /*
 757          * Not only that, but the size of the entire the snapshot
 758          * header must also be representable in a size_t.
 759          */
 760         size -= snap_count * sizeof (__le64);
 761         if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
 762                 return false;
 763
 764         return true;
 765 }
 766
 767 /*
 768  * Fill an rbd image header with information from the given format 1
 769  * on-disk header.
 770  */
 771 static int rbd_header_from_disk(struct rbd_device *rbd_dev,
 772                                  struct rbd_image_header_ondisk *ondisk)
 773 {
 774         struct rbd_image_header *header = &rbd_dev->header;
 775         bool first_time = header->object_prefix == NULL;
 776         struct ceph_snap_context *snapc;
 777         char *object_prefix = NULL;
 778         char *snap_names = NULL;
 779         u64 *snap_sizes = NULL;
 780         u32 snap_count;
 781         size_t size;
 782         int ret = -ENOMEM;
 783         u32 i;
 784
 785         /* Allocate this now to avoid having to handle failure below */
 786
 787         if (first_time) {
 788                 size_t len;
 789
 790                 len = strnlen(ondisk->object_prefix,
 791                                 sizeof (ondisk->object_prefix));
 792                 object_prefix = kmalloc(len + 1, GFP_KERNEL);
 793                 if (!object_prefix)
 794                         return -ENOMEM;
 795                 memcpy(object_prefix, ondisk->object_prefix, len);
 796                 object_prefix[len] = '\0';
 797         }
 798
 799         /* Allocate the snapshot context and fill it in */
 800
 801         snap_count = le32_to_cpu(ondisk->snap_count);
 802         snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
 803         if (!snapc)
 804                 goto out_err;
 805         snapc->seq = le64_to_cpu(ondisk->snap_seq);
 806         if (snap_count) {
 807                 struct rbd_image_snap_ondisk *snaps;
 808                 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
 809
 810                 /* We'll keep a copy of the snapshot names... */
 811
 812                 if (snap_names_len > (u64)SIZE_MAX)
 813                         goto out_2big;
 814                 snap_names = kmalloc(snap_names_len, GFP_KERNEL);
 815                 if (!snap_names)
 816                         goto out_err;
 817
 818                 /* ...as well as the array of their sizes. */
 819
 820                 size = snap_count * sizeof (*header->snap_sizes);
 821                 snap_sizes = kmalloc(size, GFP_KERNEL);
 822                 if (!snap_sizes)
 823                         goto out_err;
 824
 825                 /*
 826                  * Copy the names, and fill in each snapshot's id
 827                  * and size.
 828                  *
 829                  * Note that rbd_dev_v1_header_info() guarantees the
 830                  * ondisk buffer we're working with has
 831                  * snap_names_len bytes beyond the end of the
 832                  * snapshot id array, this memcpy() is safe.
 833                  */
 834                 memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
 835                 snaps = ondisk->snaps;
 836                 for (i = 0; i < snap_count; i++) {
 837                         snapc->snaps[i] = le64_to_cpu(snaps[i].id);
 838                         snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
 839                 }
 840         }
 841
 842         /* We won't fail any more, fill in the header */
 843
 844         down_write(&rbd_dev->header_rwsem);
 845         if (first_time) {
 846                 header->object_prefix = object_prefix;
 847                 header->obj_order = ondisk->options.order;
 848                 header->crypt_type = ondisk->options.crypt_type;
 849                 header->comp_type = ondisk->options.comp_type;
 850                 /* The rest aren't used for format 1 images */
 851                 header->stripe_unit = 0;
 852                 header->stripe_count = 0;
 853                 header->features = 0;
 854         } else {
 855                 ceph_put_snap_context(header->snapc);
 856                 kfree(header->snap_names);
 857                 kfree(header->snap_sizes);
 858         }
 859
 860         /* The remaining fields always get updated (when we refresh) */
 861
 862         header->image_size = le64_to_cpu(ondisk->image_size);
 863         header->snapc = snapc;
 864         header->snap_names = snap_names;
 865         header->snap_sizes = snap_sizes;
 866
 867         /* Make sure mapping size is consistent with header info */
 868
 869         if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time)
 870                 if (rbd_dev->mapping.size != header->image_size)
 871                         rbd_dev->mapping.size = header->image_size;
 872
 873         up_write(&rbd_dev->header_rwsem);
 874
 875         return 0;
 876 out_2big:
 877         ret = -EIO;
 878 out_err:
 879         kfree(snap_sizes);
 880         kfree(snap_names);
 881         ceph_put_snap_context(snapc);
 882         kfree(object_prefix);
 883
 884         return ret;
 885 }
 886
 887 static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
 888 {
 889         const char *snap_name;
 890
 891         rbd_assert(which < rbd_dev->header.snapc->num_snaps);
 892
 893         /* Skip over names until we find the one we are looking for */
 894
 895         snap_name = rbd_dev->header.snap_names;
 896         while (which--)
 897                 snap_name += strlen(snap_name) + 1;
 898
 899         return kstrdup(snap_name, GFP_KERNEL);
 900 }
 901
 902 /*
 903  * Snapshot id comparison function for use with qsort()/bsearch().
 904  * Note that result is for snapshots in *descending* order.
 905  */
 906 static int snapid_compare_reverse(const void *s1, const void *s2)
 907 {
 908         u64 snap_id1 = *(u64 *)s1;
 909         u64 snap_id2 = *(u64 *)s2;
 910
 911         if (snap_id1 < snap_id2)
 912                 return 1;
 913         return snap_id1 == snap_id2 ? 0 : -1;
 914 }
 915
 916 /*
 917  * Search a snapshot context to see if the given snapshot id is
 918  * present.
 919  *
 920  * Returns the position of the snapshot id in the array if it's found,
 921  * or BAD_SNAP_INDEX otherwise.
 922  *
 923  * Note: The snapshot array is in kept sorted (by the osd) in
 924  * reverse order, highest snapshot id first.
 925  */
 926 static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
 927 {
 928         struct ceph_snap_context *snapc = rbd_dev->header.snapc;
 929         u64 *found;
 930
 931         found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
 932                                 sizeof (snap_id), snapid_compare_reverse);
 933
 934         return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
 935 }
 936
 937 static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
 938                                         u64 snap_id)
 939 {
 940         u32 which;
 941
 942         which = rbd_dev_snap_index(rbd_dev, snap_id);
 943         if (which == BAD_SNAP_INDEX)
 944                 return NULL;
 945
 946         return _rbd_dev_v1_snap_name(rbd_dev, which);
 947 }
 948
 949 static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
 950 {
 951         if (snap_id == CEPH_NOSNAP)
 952                 return RBD_SNAP_HEAD_NAME;
 953
 954         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 955         if (rbd_dev->image_format == 1)
 956                 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
 957
 958         return rbd_dev_v2_snap_name(rbd_dev, snap_id);
 959 }
 960
 961 static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 962                                 u64 *snap_size)
 963 {
 964         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 965         if (snap_id == CEPH_NOSNAP) {
 966                 *snap_size = rbd_dev->header.image_size;
 967         } else if (rbd_dev->image_format == 1) {
 968                 u32 which;
 969
 970                 which = rbd_dev_snap_index(rbd_dev, snap_id);
 971                 if (which == BAD_SNAP_INDEX)
 972                         return -ENOENT;
 973
 974                 *snap_size = rbd_dev->header.snap_sizes[which];
 975         } else {
 976                 u64 size = 0;
 977                 int ret;
 978
 979                 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
 980                 if (ret)
 981                         return ret;
 982
 983                 *snap_size = size;
 984         }
 985         return 0;
 986 }
 987
 988 static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 989                         u64 *snap_features)
 990 {
 991         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 992         if (snap_id == CEPH_NOSNAP) {
 993                 *snap_features = rbd_dev->header.features;
 994         } else if (rbd_dev->image_format == 1) {
 995                 *snap_features = 0;     /* No features for format 1 */
 996         } else {
 997                 u64 features = 0;
 998                 int ret;
 999
1000                 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
1001                 if (ret)
1002                         return ret;
1003
1004                 *snap_features = features;
1005         }
1006         return 0;
1007 }
1008
1009 static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1010 {
1011         u64 snap_id = rbd_dev->spec->snap_id;
1012         u64 size = 0;
1013         u64 features = 0;
1014         int ret;
1015
1016         ret = rbd_snap_size(rbd_dev, snap_id, &size);
1017         if (ret)
1018                 return ret;
1019         ret = rbd_snap_features(rbd_dev, snap_id, &features);
1020         if (ret)
1021                 return ret;
1022
1023         rbd_dev->mapping.size = size;
1024         rbd_dev->mapping.features = features;
1025
1026         return 0;
1027 }
1028
1029 static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1030 {
1031         rbd_dev->mapping.size = 0;
1032         rbd_dev->mapping.features = 0;
1033 }
1034
1035 static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
1036 {
1037         char *name;
1038         u64 segment;
1039         int ret;
1040
1041         name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
1042         if (!name)
1043                 return NULL;
1044         segment = offset >> rbd_dev->header.obj_order;
1045         ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
1046                         rbd_dev->header.object_prefix, segment);
1047         if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
1048                 pr_err("error formatting segment name for #%llu (%d)\n",
1049                         segment, ret);
1050                 kfree(name);
1051                 name = NULL;
1052         }
1053
1054         return name;
1055 }
1056
1057 static void rbd_segment_name_free(const char *name)
1058 {
1059         /* The explicit cast here is needed to drop the const qualifier */
1060
1061         kmem_cache_free(rbd_segment_name_cache, (void *)name);
1062 }
1063
1064 static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
1065 {
1066         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1067
1068         return offset & (segment_size - 1);
1069 }
1070
1071 static u64 rbd_segment_length(struct rbd_device *rbd_dev,
1072                                 u64 offset, u64 length)
1073 {
1074         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1075
1076         offset &= segment_size - 1;
1077
1078         rbd_assert(length <= U64_MAX - offset);
1079         if (offset + length > segment_size)
1080                 length = segment_size - offset;
1081
1082         return length;
1083 }
1084
1085 /*
1086  * returns the size of an object in the image
1087  */
1088 static u64 rbd_obj_bytes(struct rbd_image_header *header)
1089 {
1090         return 1 << header->obj_order;
1091 }
1092
1093 /*
1094  * bio helpers
1095  */
1096
1097 static void bio_chain_put(struct bio *chain)
1098 {
1099         struct bio *tmp;
1100
1101         while (chain) {
1102                 tmp = chain;
1103                 chain = chain->bi_next;
1104                 bio_put(tmp);
1105         }
1106 }
1107
1108 /*
1109  * zeros a bio chain, starting at specific offset
1110  */
1111 static void zero_bio_chain(struct bio *chain, int start_ofs)
1112 {
1113         struct bio_vec *bv;
1114         unsigned long flags;
1115         void *buf;
1116         int i;
1117         int pos = 0;
1118
1119         while (chain) {
1120                 bio_for_each_segment(bv, chain, i) {
1121                         if (pos + bv->bv_len > start_ofs) {
1122                                 int remainder = max(start_ofs - pos, 0);
1123                                 buf = bvec_kmap_irq(bv, &flags);
1124                                 memset(buf + remainder, 0,
1125                                        bv->bv_len - remainder);
1126                                 bvec_kunmap_irq(buf, &flags);
1127                         }
1128                         pos += bv->bv_len;
1129                 }
1130
1131                 chain = chain->bi_next;
1132         }
1133 }
1134
1135 /*
1136  * similar to zero_bio_chain(), zeros data defined by a page array,
1137  * starting at the given byte offset from the start of the array and
1138  * continuing up to the given end offset.  The pages array is
1139  * assumed to be big enough to hold all bytes up to the end.
1140  */
1141 static void zero_pages(struct page **pages, u64 offset, u64 end)
1142 {
1143         struct page **page = &pages[offset >> PAGE_SHIFT];
1144
1145         rbd_assert(end > offset);
1146         rbd_assert(end - offset <= (u64)SIZE_MAX);
1147         while (offset < end) {
1148                 size_t page_offset;
1149                 size_t length;
1150                 unsigned long flags;
1151                 void *kaddr;
1152
1153                 page_offset = (size_t)(offset & ~PAGE_MASK);
1154                 length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
1155                 local_irq_save(flags);
1156                 kaddr = kmap_atomic(*page);
1157                 memset(kaddr + page_offset, 0, length);
1158                 kunmap_atomic(kaddr);
1159                 local_irq_restore(flags);
1160
1161                 offset += length;
1162                 page++;
1163         }
1164 }
1165
1166 /*
1167  * Clone a portion of a bio, starting at the given byte offset
1168  * and continuing for the number of bytes indicated.
1169  */
1170 static struct bio *bio_clone_range(struct bio *bio_src,
1171                                         unsigned int offset,
1172                                         unsigned int len,
1173                                         gfp_t gfpmask)
1174 {
1175         struct bio_vec *bv;
1176         unsigned int resid;
1177         unsigned short idx;
1178         unsigned int voff;
1179         unsigned short end_idx;
1180         unsigned short vcnt;
1181         struct bio *bio;
1182
1183         /* Handle the easy case for the caller */
1184
1185         if (!offset && len == bio_src->bi_size)
1186                 return bio_clone(bio_src, gfpmask);
1187
1188         if (WARN_ON_ONCE(!len))
1189                 return NULL;
1190         if (WARN_ON_ONCE(len > bio_src->bi_size))
1191                 return NULL;
1192         if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
1193                 return NULL;
1194
1195         /* Find first affected segment... */
1196
1197         resid = offset;
1198         __bio_for_each_segment(bv, bio_src, idx, 0) {
1199                 if (resid < bv->bv_len)
1200                         break;
1201                 resid -= bv->bv_len;
1202         }
1203         voff = resid;
1204
1205         /* ...and the last affected segment */
1206
1207         resid += len;
1208         __bio_for_each_segment(bv, bio_src, end_idx, idx) {
1209                 if (resid <= bv->bv_len)
1210                         break;
1211                 resid -= bv->bv_len;
1212         }
1213         vcnt = end_idx - idx + 1;
1214
1215         /* Build the clone */
1216
1217         bio = bio_alloc(gfpmask, (unsigned int) vcnt);
1218         if (!bio)
1219                 return NULL;    /* ENOMEM */
1220
1221         bio->bi_bdev = bio_src->bi_bdev;
1222         bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
1223         bio->bi_rw = bio_src->bi_rw;
1224         bio->bi_flags |= 1 << BIO_CLONED;
1225
1226         /*
1227          * Copy over our part of the bio_vec, then update the first
1228          * and last (or only) entries.
1229          */
1230         memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
1231                         vcnt * sizeof (struct bio_vec));
1232         bio->bi_io_vec[0].bv_offset += voff;
1233         if (vcnt > 1) {
1234                 bio->bi_io_vec[0].bv_len -= voff;
1235                 bio->bi_io_vec[vcnt - 1].bv_len = resid;
1236         } else {
1237                 bio->bi_io_vec[0].bv_len = len;
1238         }
1239
1240         bio->bi_vcnt = vcnt;
1241         bio->bi_size = len;
1242         bio->bi_idx = 0;
1243
1244         return bio;
1245 }
1246
1247 /*
1248  * Clone a portion of a bio chain, starting at the given byte offset
1249  * into the first bio in the source chain and continuing for the
1250  * number of bytes indicated.  The result is another bio chain of
1251  * exactly the given length, or a null pointer on error.
1252  *
1253  * The bio_src and offset parameters are both in-out.  On entry they
1254  * refer to the first source bio and the offset into that bio where
1255  * the start of data to be cloned is located.
1256  *
1257  * On return, bio_src is updated to refer to the bio in the source
1258  * chain that contains first un-cloned byte, and *offset will
1259  * contain the offset of that byte within that bio.
1260  */
1261 static struct bio *bio_chain_clone_range(struct bio **bio_src,
1262                                         unsigned int *offset,
1263                                         unsigned int len,
1264                                         gfp_t gfpmask)
1265 {
1266         struct bio *bi = *bio_src;
1267         unsigned int off = *offset;
1268         struct bio *chain = NULL;
1269         struct bio **end;
1270
1271         /* Build up a chain of clone bios up to the limit */
1272
1273         if (!bi || off >= bi->bi_size || !len)
1274                 return NULL;            /* Nothing to clone */
1275
1276         end = &chain;
1277         while (len) {
1278                 unsigned int bi_size;
1279                 struct bio *bio;
1280
1281                 if (!bi) {
1282                         rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1283                         goto out_err;   /* EINVAL; ran out of bio's */
1284                 }
1285                 bi_size = min_t(unsigned int, bi->bi_size - off, len);
1286                 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1287                 if (!bio)
1288                         goto out_err;   /* ENOMEM */
1289
1290                 *end = bio;
1291                 end = &bio->bi_next;
1292
1293                 off += bi_size;
1294                 if (off == bi->bi_size) {
1295                         bi = bi->bi_next;
1296                         off = 0;
1297                 }
1298                 len -= bi_size;
1299         }
1300         *bio_src = bi;
1301         *offset = off;
1302
1303         return chain;
1304 out_err:
1305         bio_chain_put(chain);
1306
1307         return NULL;
1308 }
1309
1310 /*
1311  * The default/initial value for all object request flags is 0.  For
1312  * each flag, once its value is set to 1 it is never reset to 0
1313  * again.
1314  */
1315 static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
1316 {
1317         if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
1318                 struct rbd_device *rbd_dev;
1319
1320                 rbd_dev = obj_request->img_request->rbd_dev;
1321                 rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
1322                         obj_request);
1323         }
1324 }
1325
1326 static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
1327 {
1328         smp_mb();
1329         return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
1330 }
1331
1332 static void obj_request_done_set(struct rbd_obj_request *obj_request)
1333 {
1334         if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
1335                 struct rbd_device *rbd_dev = NULL;
1336
1337                 if (obj_request_img_data_test(obj_request))
1338                         rbd_dev = obj_request->img_request->rbd_dev;
1339                 rbd_warn(rbd_dev, "obj_request %p already marked done\n",
1340                         obj_request);
1341         }
1342 }
1343
1344 static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1345 {
1346         smp_mb();
1347         return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
1348 }
1349
/*
 * This sets the KNOWN flag after (possibly) setting the EXISTS
 * flag.  The latter is set based on the "exists" value provided.
 *
 * Note that for our purposes once an object exists it never goes
 * away again.  It's possible that the responses from two existence
 * checks are separated by the creation of the target object, and
 * the first ("doesn't exist") response arrives *after* the second
 * ("does exist").  Because this function only ever sets EXISTS and
 * never clears it, the late "doesn't exist" response has no effect
 * on that flag.
 */
static void obj_request_existence_set(struct rbd_obj_request *obj_request,
                                bool exists)
{
        if (exists)
                set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
        /* KNOWN is set second, so EXISTS is already in place by then */
        set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
        smp_mb();
}
1368
1369 static bool obj_request_known_test(struct rbd_obj_request *obj_request)
1370 {
1371         smp_mb();
1372         return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
1373 }
1374
1375 static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
1376 {
1377         smp_mb();
1378         return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
1379 }
1380
1381 static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1382 {
1383         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1384                 atomic_read(&obj_request->kref.refcount));
1385         kref_get(&obj_request->kref);
1386 }
1387
1388 static void rbd_obj_request_destroy(struct kref *kref);
1389 static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1390 {
1391         rbd_assert(obj_request != NULL);
1392         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1393                 atomic_read(&obj_request->kref.refcount));
1394         kref_put(&obj_request->kref, rbd_obj_request_destroy);
1395 }
1396
1397 static bool img_request_child_test(struct rbd_img_request *img_request);
1398 static void rbd_parent_request_destroy(struct kref *kref);
1399 static void rbd_img_request_destroy(struct kref *kref);
1400 static void rbd_img_request_put(struct rbd_img_request *img_request)
1401 {
1402         rbd_assert(img_request != NULL);
1403         dout("%s: img %p (was %d)\n", __func__, img_request,
1404                 atomic_read(&img_request->kref.refcount));
1405         if (img_request_child_test(img_request))
1406                 kref_put(&img_request->kref, rbd_parent_request_destroy);
1407         else
1408                 kref_put(&img_request->kref, rbd_img_request_destroy);
1409 }
1410
/*
 * Attach an object request to an image request: record the owning
 * image request, assign the object its position ("which") within
 * the image request, mark it as image data, and append it to the
 * image request's list of object requests.  The object request must
 * not already belong to an image request.
 */
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
                                        struct rbd_obj_request *obj_request)
{
        rbd_assert(obj_request->img_request == NULL);

        /* Image request now owns object's original reference */
        obj_request->img_request = img_request;
        /* Position is the number of object requests added so far */
        obj_request->which = img_request->obj_request_count;
        rbd_assert(!obj_request_img_data_test(obj_request));
        obj_request_img_data_set(obj_request);
        rbd_assert(obj_request->which != BAD_WHICH);
        img_request->obj_request_count++;
        list_add_tail(&obj_request->links, &img_request->obj_requests);
        dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
                obj_request->which);
}
1427
/*
 * Detach an object request from its image request: unlink it from
 * the image request's list, decrement the object request count,
 * clear the object's position, owner and callback, and drop a
 * reference on the object request.
 */
static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
                                        struct rbd_obj_request *obj_request)
{
        rbd_assert(obj_request->which != BAD_WHICH);

        dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
                obj_request->which);
        list_del(&obj_request->links);
        rbd_assert(img_request->obj_request_count > 0);
        img_request->obj_request_count--;
        /* The assertion only holds when the newest object goes first */
        rbd_assert(obj_request->which == img_request->obj_request_count);
        obj_request->which = BAD_WHICH;
        rbd_assert(obj_request_img_data_test(obj_request));
        rbd_assert(obj_request->img_request == img_request);
        obj_request->img_request = NULL;
        obj_request->callback = NULL;
        rbd_obj_request_put(obj_request);
}
1446
1447 static bool obj_request_type_valid(enum obj_request_type type)
1448 {
1449         switch (type) {
1450         case OBJ_REQUEST_NODATA:
1451         case OBJ_REQUEST_BIO:
1452         case OBJ_REQUEST_PAGES:
1453                 return true;
1454         default:
1455                 return false;
1456         }
1457 }
1458
1459 static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1460                                 struct rbd_obj_request *obj_request)
1461 {
1462         dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
1463
1464         return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1465 }
1466
1467 static void rbd_img_request_complete(struct rbd_img_request *img_request)
1468 {
1469
1470         dout("%s: img %p\n", __func__, img_request);
1471
1472         /*
1473          * If no error occurred, compute the aggregate transfer
1474          * count for the image request.  We could instead use
1475          * atomic64_cmpxchg() to update it as each object request
1476          * completes; not clear which way is better off hand.
1477          */
1478         if (!img_request->result) {
1479                 struct rbd_obj_request *obj_request;
1480                 u64 xferred = 0;
1481
1482                 for_each_obj_request(img_request, obj_request)
1483                         xferred += obj_request->xferred;
1484                 img_request->xferred = xferred;
1485         }
1486
1487         if (img_request->callback)
1488                 img_request->callback(img_request);
1489         else
1490                 rbd_img_request_put(img_request);
1491 }
1492
1493 /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1494
1495 static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1496 {
1497         dout("%s: obj %p\n", __func__, obj_request);
1498
1499         return wait_for_completion_interruptible(&obj_request->completion);
1500 }
1501
/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never changes thereafter.
 */

/* Set IMG_REQ_WRITE on an image request, followed by a full barrier. */
static void img_request_write_set(struct rbd_img_request *img_request)
{
        set_bit(IMG_REQ_WRITE, &img_request->flags);
        smp_mb();
}
1512
1513 static bool img_request_write_test(struct rbd_img_request *img_request)
1514 {
1515         smp_mb();
1516         return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
1517 }
1518
/* Set IMG_REQ_CHILD on an image request, followed by a full barrier. */
static void img_request_child_set(struct rbd_img_request *img_request)
{
        set_bit(IMG_REQ_CHILD, &img_request->flags);
        smp_mb();
}
1524
/* Clear IMG_REQ_CHILD on an image request, followed by a full barrier. */
static void img_request_child_clear(struct rbd_img_request *img_request)
{
        clear_bit(IMG_REQ_CHILD, &img_request->flags);
        smp_mb();
}
1530
1531 static bool img_request_child_test(struct rbd_img_request *img_request)
1532 {
1533         smp_mb();
1534         return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
1535 }
1536
/* Set IMG_REQ_LAYERED on an image request, followed by a full barrier. */
static void img_request_layered_set(struct rbd_img_request *img_request)
{
        set_bit(IMG_REQ_LAYERED, &img_request->flags);
        smp_mb();
}
1542
/* Clear IMG_REQ_LAYERED on an image request, followed by a full barrier. */
static void img_request_layered_clear(struct rbd_img_request *img_request)
{
        clear_bit(IMG_REQ_LAYERED, &img_request->flags);
        smp_mb();
}
1548
1549 static bool img_request_layered_test(struct rbd_img_request *img_request)
1550 {
1551         smp_mb();
1552         return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1553 }
1554
1555 static void
1556 rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1557 {
1558         u64 xferred = obj_request->xferred;
1559         u64 length = obj_request->length;
1560
1561         dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1562                 obj_request, obj_request->img_request, obj_request->result,
1563                 xferred, length);
1564         /*
1565          * ENOENT means a hole in the image.  We zero-fill the
1566          * entire length of the request.  A short read also implies
1567          * zero-fill to the end of the request.  Either way we
1568          * update the xferred count to indicate the whole request
1569          * was satisfied.
1570          */
1571         rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
1572         if (obj_request->result == -ENOENT) {
1573                 if (obj_request->type == OBJ_REQUEST_BIO)
1574                         zero_bio_chain(obj_request->bio_list, 0);
1575                 else
1576                         zero_pages(obj_request->pages, 0, length);
1577                 obj_request->result = 0;
1578                 obj_request->xferred = length;
1579         } else if (xferred < length && !obj_request->result) {
1580                 if (obj_request->type == OBJ_REQUEST_BIO)
1581                         zero_bio_chain(obj_request->bio_list, xferred);
1582                 else
1583                         zero_pages(obj_request->pages, xferred, length);
1584                 obj_request->xferred = length;
1585         }
1586         obj_request_done_set(obj_request);
1587 }
1588
1589 static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1590 {
1591         dout("%s: obj %p cb %p\n", __func__, obj_request,
1592                 obj_request->callback);
1593         if (obj_request->callback)
1594                 obj_request->callback(obj_request);
1595         else
1596                 complete_all(&obj_request->completion);
1597 }
1598
/*
 * Completion handling for ops that need no post-processing: just
 * mark the object request done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
        dout("%s: obj %p\n", __func__, obj_request);
        obj_request_done_set(obj_request);
}
1604
1605 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1606 {
1607         struct rbd_img_request *img_request = NULL;
1608         struct rbd_device *rbd_dev = NULL;
1609         bool layered = false;
1610
1611         if (obj_request_img_data_test(obj_request)) {
1612                 img_request = obj_request->img_request;
1613                 layered = img_request && img_request_layered_test(img_request);
1614                 rbd_dev = img_request->rbd_dev;
1615         }
1616
1617         dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1618                 obj_request, img_request, obj_request->result,
1619                 obj_request->xferred, obj_request->length);
1620         if (layered && obj_request->result == -ENOENT &&
1621                         obj_request->img_offset < rbd_dev->parent_overlap)
1622                 rbd_img_parent_read(obj_request);
1623         else if (img_request)
1624                 rbd_img_obj_request_read_callback(obj_request);
1625         else
1626                 obj_request_done_set(obj_request);
1627 }
1628
1629 static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1630 {
1631         dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1632                 obj_request->result, obj_request->length);
1633         /*
1634          * There is no such thing as a successful short write.  Set
1635          * it to our originally-requested length.
1636          */
1637         obj_request->xferred = obj_request->length;
1638         obj_request_done_set(obj_request);
1639 }
1640
/*
 * For a simple stat call there's nothing to do beyond marking the
 * object request done.  We'll do more if this is part of a write
 * sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
        dout("%s: obj %p\n", __func__, obj_request);
        obj_request_done_set(obj_request);
}
1650
1651 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1652                                 struct ceph_msg *msg)
1653 {
1654         struct rbd_obj_request *obj_request = osd_req->r_priv;
1655         u16 opcode;
1656
1657         dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1658         rbd_assert(osd_req == obj_request->osd_req);
1659         if (obj_request_img_data_test(obj_request)) {
1660                 rbd_assert(obj_request->img_request);
1661                 rbd_assert(obj_request->which != BAD_WHICH);
1662         } else {
1663                 rbd_assert(obj_request->which == BAD_WHICH);
1664         }
1665
1666         if (osd_req->r_result < 0)
1667                 obj_request->result = osd_req->r_result;
1668
1669         BUG_ON(osd_req->r_num_ops > 2);
1670
1671         /*
1672          * We support a 64-bit length, but ultimately it has to be
1673          * passed to blk_end_request(), which takes an unsigned int.
1674          */
1675         obj_request->xferred = osd_req->r_reply_op_len[0];
1676         rbd_assert(obj_request->xferred < (u64)UINT_MAX);
1677         opcode = osd_req->r_ops[0].op;
1678         switch (opcode) {
1679         case CEPH_OSD_OP_READ:
1680                 rbd_osd_read_callback(obj_request);
1681                 break;
1682         case CEPH_OSD_OP_WRITE:
1683                 rbd_osd_write_callback(obj_request);
1684                 break;
1685         case CEPH_OSD_OP_STAT:
1686                 rbd_osd_stat_callback(obj_request);
1687                 break;
1688         case CEPH_OSD_OP_CALL:
1689         case CEPH_OSD_OP_NOTIFY_ACK:
1690         case CEPH_OSD_OP_WATCH:
1691                 rbd_osd_trivial_callback(obj_request);
1692                 break;
1693         default:
1694                 rbd_warn(NULL, "%s: unsupported op %hu\n",
1695                         obj_request->object_name, (unsigned short) opcode);
1696                 break;
1697         }
1698
1699         if (obj_request_done_test(obj_request))
1700                 rbd_obj_request_complete(obj_request);
1701 }
1702
1703 static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1704 {
1705         struct rbd_img_request *img_request = obj_request->img_request;
1706         struct ceph_osd_request *osd_req = obj_request->osd_req;
1707         u64 snap_id;
1708
1709         rbd_assert(osd_req != NULL);
1710
1711         snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
1712         ceph_osdc_build_request(osd_req, obj_request->offset,
1713                         NULL, snap_id, NULL);
1714 }
1715
1716 static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1717 {
1718         struct rbd_img_request *img_request = obj_request->img_request;
1719         struct ceph_osd_request *osd_req = obj_request->osd_req;
1720         struct ceph_snap_context *snapc;
1721         struct timespec mtime = CURRENT_TIME;
1722
1723         rbd_assert(osd_req != NULL);
1724
1725         snapc = img_request ? img_request->snapc : NULL;
1726         ceph_osdc_build_request(osd_req, obj_request->offset,
1727                         snapc, CEPH_NOSNAP, &mtime);
1728 }
1729
1730 static struct ceph_osd_request *rbd_osd_req_create(
1731                                         struct rbd_device *rbd_dev,
1732                                         bool write_request,
1733                                         struct rbd_obj_request *obj_request)
1734 {
1735         struct ceph_snap_context *snapc = NULL;
1736         struct ceph_osd_client *osdc;
1737         struct ceph_osd_request *osd_req;
1738
1739         if (obj_request_img_data_test(obj_request)) {
1740                 struct rbd_img_request *img_request = obj_request->img_request;
1741
1742                 rbd_assert(write_request ==
1743                                 img_request_write_test(img_request));
1744                 if (write_request)
1745                         snapc = img_request->snapc;
1746         }
1747
1748         /* Allocate and initialize the request, for the single op */
1749
1750         osdc = &rbd_dev->rbd_client->client->osdc;
1751         osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1752         if (!osd_req)
1753                 return NULL;    /* ENOMEM */
1754
1755         if (write_request)
1756                 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1757         else
1758                 osd_req->r_flags = CEPH_OSD_FLAG_READ;
1759
1760         osd_req->r_callback = rbd_osd_req_callback;
1761         osd_req->r_priv = obj_request;
1762
1763         osd_req->r_oid_len = strlen(obj_request->object_name);
1764         rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1765         memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1766
1767         osd_req->r_file_layout = rbd_dev->layout;       /* struct */
1768
1769         return osd_req;
1770 }
1771
1772 /*
1773  * Create a copyup osd request based on the information in the
1774  * object request supplied.  A copyup request has two osd ops,
1775  * a copyup method call, and a "normal" write request.
1776  */
1777 static struct ceph_osd_request *
1778 rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1779 {
1780         struct rbd_img_request *img_request;
1781         struct ceph_snap_context *snapc;
1782         struct rbd_device *rbd_dev;
1783         struct ceph_osd_client *osdc;
1784         struct ceph_osd_request *osd_req;
1785
1786         rbd_assert(obj_request_img_data_test(obj_request));
1787         img_request = obj_request->img_request;
1788         rbd_assert(img_request);
1789         rbd_assert(img_request_write_test(img_request));
1790
1791         /* Allocate and initialize the request, for the two ops */
1792
1793         snapc = img_request->snapc;
1794         rbd_dev = img_request->rbd_dev;
1795         osdc = &rbd_dev->rbd_client->client->osdc;
1796         osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
1797         if (!osd_req)
1798                 return NULL;    /* ENOMEM */
1799
1800         osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1801         osd_req->r_callback = rbd_osd_req_callback;
1802         osd_req->r_priv = obj_request;
1803
1804         osd_req->r_oid_len = strlen(obj_request->object_name);
1805         rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1806         memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1807
1808         osd_req->r_file_layout = rbd_dev->layout;       /* struct */
1809
1810         return osd_req;
1811 }
1812
1813
/* Drop a reference on an osd request via ceph_osdc_put_request() */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1818
1819 /* object_name is assumed to be a non-null pointer and NUL-terminated */
1820
1821 static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1822                                                 u64 offset, u64 length,
1823                                                 enum obj_request_type type)
1824 {
1825         struct rbd_obj_request *obj_request;
1826         size_t size;
1827         char *name;
1828
1829         rbd_assert(obj_request_type_valid(type));
1830
1831         size = strlen(object_name) + 1;
1832         name = kmalloc(size, GFP_KERNEL);
1833         if (!name)
1834                 return NULL;
1835
1836         obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
1837         if (!obj_request) {
1838                 kfree(name);
1839                 return NULL;
1840         }
1841
1842         obj_request->object_name = memcpy(name, object_name, size);
1843         obj_request->offset = offset;
1844         obj_request->length = length;
1845         obj_request->flags = 0;
1846         obj_request->which = BAD_WHICH;
1847         obj_request->type = type;
1848         INIT_LIST_HEAD(&obj_request->links);
1849         init_completion(&obj_request->completion);
1850         kref_init(&obj_request->kref);
1851
1852         dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1853                 offset, length, (int)type, obj_request);
1854
1855         return obj_request;
1856 }
1857
1858 static void rbd_obj_request_destroy(struct kref *kref)
1859 {
1860         struct rbd_obj_request *obj_request;
1861
1862         obj_request = container_of(kref, struct rbd_obj_request, kref);
1863
1864         dout("%s: obj %p\n", __func__, obj_request);
1865
1866         rbd_assert(obj_request->img_request == NULL);
1867         rbd_assert(obj_request->which == BAD_WHICH);
1868
1869         if (obj_request->osd_req)
1870                 rbd_osd_req_destroy(obj_request->osd_req);
1871
1872         rbd_assert(obj_request_type_valid(obj_request->type));
1873         switch (obj_request->type) {
1874         case OBJ_REQUEST_NODATA:
1875                 break;          /* Nothing to do */
1876         case OBJ_REQUEST_BIO:
1877                 if (obj_request->bio_list)
1878                         bio_chain_put(obj_request->bio_list);
1879                 break;
1880         case OBJ_REQUEST_PAGES:
1881                 if (obj_request->pages)
1882                         ceph_release_page_vector(obj_request->pages,
1883                                                 obj_request->page_count);
1884                 break;
1885         }
1886
1887         kfree(obj_request->object_name);
1888         obj_request->object_name = NULL;
1889         kmem_cache_free(rbd_obj_request_cache, obj_request);
1890 }
1891
1892 /* It's OK to call this for a device with no parent */
1893
1894 static void rbd_spec_put(struct rbd_spec *spec);
1895 static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1896 {
1897         rbd_dev_remove_parent(rbd_dev);
1898         rbd_spec_put(rbd_dev->parent_spec);
1899         rbd_dev->parent_spec = NULL;
1900         rbd_dev->parent_overlap = 0;
1901 }
1902
1903 /*
1904  * Parent image reference counting is used to determine when an
1905  * image's parent fields can be safely torn down--after there are no
1906  * more in-flight requests to the parent image.  When the last
1907  * reference is dropped, cleaning them up is safe.
1908  */
1909 static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1910 {
1911         int counter;
1912
1913         if (!rbd_dev->parent_spec)
1914                 return;
1915
1916         counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1917         if (counter > 0)
1918                 return;
1919
1920         /* Last reference; clean up parent data structures */
1921
1922         if (!counter)
1923                 rbd_dev_unparent(rbd_dev);
1924         else
1925                 rbd_warn(rbd_dev, "parent reference underflow\n");
1926 }
1927
1928 /*
1929  * If an image has a non-zero parent overlap, get a reference to its
1930  * parent.
1931  *
1932  * We must get the reference before checking for the overlap to
1933  * coordinate properly with zeroing the parent overlap in
1934  * rbd_dev_v2_parent_info() when an image gets flattened.  We
1935  * drop it again if there is no overlap.
1936  *
1937  * Returns true if the rbd device has a parent with a non-zero
1938  * overlap and a reference for it was successfully taken, or
1939  * false otherwise.
1940  */
1941 static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1942 {
1943         int counter;
1944
1945         if (!rbd_dev->parent_spec)
1946                 return false;
1947
1948         counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1949         if (counter > 0 && rbd_dev->parent_overlap)
1950                 return true;
1951
1952         /* Image was flattened, but parent is not yet torn down */
1953
1954         if (counter < 0)
1955                 rbd_warn(rbd_dev, "parent reference overflow\n");
1956
1957         return false;
1958 }
1959
1960 /*
1961  * Caller is responsible for filling in the list of object requests
1962  * that comprises the image request, and the Linux request pointer
1963  * (if there is one).
1964  */
1965 static struct rbd_img_request *rbd_img_request_create(
1966                                         struct rbd_device *rbd_dev,
1967                                         u64 offset, u64 length,
1968                                         bool write_request)
1969 {
1970         struct rbd_img_request *img_request;
1971
1972         img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
1973         if (!img_request)
1974                 return NULL;
1975
1976         if (write_request) {
1977                 down_read(&rbd_dev->header_rwsem);
1978                 ceph_get_snap_context(rbd_dev->header.snapc);
1979                 up_read(&rbd_dev->header_rwsem);
1980         }
1981
1982         img_request->rq = NULL;
1983         img_request->rbd_dev = rbd_dev;
1984         img_request->offset = offset;
1985         img_request->length = length;
1986         img_request->flags = 0;
1987         if (write_request) {
1988                 img_request_write_set(img_request);
1989                 img_request->snapc = rbd_dev->header.snapc;
1990         } else {
1991                 img_request->snap_id = rbd_dev->spec->snap_id;
1992         }
1993         if (rbd_dev_parent_get(rbd_dev))
1994                 img_request_layered_set(img_request);
1995         spin_lock_init(&img_request->completion_lock);
1996         img_request->next_completion = 0;
1997         img_request->callback = NULL;
1998         img_request->result = 0;
1999         img_request->obj_request_count = 0;
2000         INIT_LIST_HEAD(&img_request->obj_requests);
2001         kref_init(&img_request->kref);
2002
2003         dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
2004                 write_request ? "write" : "read", offset, length,
2005                 img_request);
2006
2007         return img_request;
2008 }
2009
2010 static void rbd_img_request_destroy(struct kref *kref)
2011 {
2012         struct rbd_img_request *img_request;
2013         struct rbd_obj_request *obj_request;
2014         struct rbd_obj_request *next_obj_request;
2015
2016         img_request = container_of(kref, struct rbd_img_request, kref);
2017
2018         dout("%s: img %p\n", __func__, img_request);
2019
2020         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2021                 rbd_img_obj_request_del(img_request, obj_request);
2022         rbd_assert(img_request->obj_request_count == 0);
2023
2024         if (img_request_layered_test(img_request)) {
2025                 img_request_layered_clear(img_request);
2026                 rbd_dev_parent_put(img_request->rbd_dev);
2027         }
2028
2029         if (img_request_write_test(img_request))
2030                 ceph_put_snap_context(img_request->snapc);
2031
2032         kmem_cache_free(rbd_img_request_cache, img_request);
2033 }
2034
2035 static struct rbd_img_request *rbd_parent_request_create(
2036                                         struct rbd_obj_request *obj_request,
2037                                         u64 img_offset, u64 length)
2038 {
2039         struct rbd_img_request *parent_request;
2040         struct rbd_device *rbd_dev;
2041
2042         rbd_assert(obj_request->img_request);
2043         rbd_dev = obj_request->img_request->rbd_dev;
2044
2045         parent_request = rbd_img_request_create(rbd_dev->parent,
2046                                                 img_offset, length, false);
2047         if (!parent_request)
2048                 return NULL;
2049
2050         img_request_child_set(parent_request);
2051         rbd_obj_request_get(obj_request);
2052         parent_request->obj_request = obj_request;
2053
2054         return parent_request;
2055 }
2056
2057 static void rbd_parent_request_destroy(struct kref *kref)
2058 {
2059         struct rbd_img_request *parent_request;
2060         struct rbd_obj_request *orig_request;
2061
2062         parent_request = container_of(kref, struct rbd_img_request, kref);
2063         orig_request = parent_request->obj_request;
2064
2065         parent_request->obj_request = NULL;
2066         rbd_obj_request_put(orig_request);
2067         img_request_child_clear(parent_request);
2068
2069         rbd_img_request_destroy(kref);
2070 }
2071
2072 static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
2073 {
2074         struct rbd_img_request *img_request;
2075         unsigned int xferred;
2076         int result;
2077         bool more;
2078
2079         rbd_assert(obj_request_img_data_test(obj_request));
2080         img_request = obj_request->img_request;
2081
2082         rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
2083         xferred = (unsigned int)obj_request->xferred;
2084         result = obj_request->result;
2085         if (result) {
2086                 struct rbd_device *rbd_dev = img_request->rbd_dev;
2087
2088                 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
2089                         img_request_write_test(img_request) ? "write" : "read",
2090                         obj_request->length, obj_request->img_offset,
2091                         obj_request->offset);
2092                 rbd_warn(rbd_dev, "  result %d xferred %x\n",
2093                         result, xferred);
2094                 if (!img_request->result)
2095                         img_request->result = result;
2096         }
2097
2098         /* Image object requests don't own their page array */
2099
2100         if (obj_request->type == OBJ_REQUEST_PAGES) {
2101                 obj_request->pages = NULL;
2102                 obj_request->page_count = 0;
2103         }
2104
2105         if (img_request_child_test(img_request)) {
2106                 rbd_assert(img_request->obj_request != NULL);
2107                 more = obj_request->which < img_request->obj_request_count - 1;
2108         } else {
2109                 rbd_assert(img_request->rq != NULL);
2110                 more = blk_end_request(img_request->rq, result, xferred);
2111         }
2112
2113         return more;
2114 }
2115
2116 static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
2117 {
2118         struct rbd_img_request *img_request;
2119         u32 which = obj_request->which;
2120         bool more = true;
2121
2122         rbd_assert(obj_request_img_data_test(obj_request));
2123         img_request = obj_request->img_request;
2124
2125         dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
2126         rbd_assert(img_request != NULL);
2127         rbd_assert(img_request->obj_request_count > 0);
2128         rbd_assert(which != BAD_WHICH);
2129         rbd_assert(which < img_request->obj_request_count);
2130         rbd_assert(which >= img_request->next_completion);
2131
2132         spin_lock_irq(&img_request->completion_lock);
2133         if (which != img_request->next_completion)
2134                 goto out;
2135
2136         for_each_obj_request_from(img_request, obj_request) {
2137                 rbd_assert(more);
2138                 rbd_assert(which < img_request->obj_request_count);
2139
2140                 if (!obj_request_done_test(obj_request))
2141                         break;
2142                 more = rbd_img_obj_end_request(obj_request);
2143                 which++;
2144         }
2145
2146         rbd_assert(more ^ (which == img_request->obj_request_count));
2147         img_request->next_completion = which;
2148 out:
2149         spin_unlock_irq(&img_request->completion_lock);
2150
2151         if (!more)
2152                 rbd_img_request_complete(img_request);
2153 }
2154
2155 /*
2156  * Split up an image request into one or more object requests, each
2157  * to a different object.  The "type" parameter indicates whether
2158  * "data_desc" is the pointer to the head of a list of bio
2159  * structures, or the base of a page array.  In either case this
2160  * function assumes data_desc describes memory sufficient to hold
2161  * all data described by the image request.
2162  */
2163 static int rbd_img_request_fill(struct rbd_img_request *img_request,
2164                                         enum obj_request_type type,
2165                                         void *data_desc)
2166 {
2167         struct rbd_device *rbd_dev = img_request->rbd_dev;
2168         struct rbd_obj_request *obj_request = NULL;
2169         struct rbd_obj_request *next_obj_request;
2170         bool write_request = img_request_write_test(img_request);
2171         struct bio *bio_list;
2172         unsigned int bio_offset = 0;
2173         struct page **pages;
2174         u64 img_offset;
2175         u64 resid;
2176         u16 opcode;
2177
2178         dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2179                 (int)type, data_desc);
2180
2181         opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
2182         img_offset = img_request->offset;
2183         resid = img_request->length;
2184         rbd_assert(resid > 0);
2185
2186         if (type == OBJ_REQUEST_BIO) {
2187                 bio_list = data_desc;
2188                 rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
2189         } else {
2190                 rbd_assert(type == OBJ_REQUEST_PAGES);
2191                 pages = data_desc;
2192         }
2193
2194         while (resid) {
2195                 struct ceph_osd_request *osd_req;
2196                 const char *object_name;
2197                 u64 offset;
2198                 u64 length;
2199
2200                 object_name = rbd_segment_name(rbd_dev, img_offset);
2201                 if (!object_name)
2202                         goto out_unwind;
2203                 offset = rbd_segment_offset(rbd_dev, img_offset);
2204                 length = rbd_segment_length(rbd_dev, img_offset, resid);
2205                 obj_request = rbd_obj_request_create(object_name,
2206                                                 offset, length, type);
2207                 /* object request has its own copy of the object name */
2208                 rbd_segment_name_free(object_name);
2209                 if (!obj_request)
2210                         goto out_unwind;
2211
2212                 if (type == OBJ_REQUEST_BIO) {
2213                         unsigned int clone_size;
2214
2215                         rbd_assert(length <= (u64)UINT_MAX);
2216                         clone_size = (unsigned int)length;
2217                         obj_request->bio_list =
2218                                         bio_chain_clone_range(&bio_list,
2219                                                                 &bio_offset,
2220                                                                 clone_size,
2221                                                                 GFP_ATOMIC);
2222                         if (!obj_request->bio_list)
2223                                 goto out_partial;
2224                 } else {
2225                         unsigned int page_count;
2226
2227                         obj_request->pages = pages;
2228                         page_count = (u32)calc_pages_for(offset, length);
2229                         obj_request->page_count = page_count;
2230                         if ((offset + length) & ~PAGE_MASK)
2231                                 page_count--;   /* more on last page */
2232                         pages += page_count;
2233                 }
2234
2235                 osd_req = rbd_osd_req_create(rbd_dev, write_request,
2236                                                 obj_request);
2237                 if (!osd_req)
2238                         goto out_partial;
2239                 obj_request->osd_req = osd_req;
2240                 obj_request->callback = rbd_img_obj_callback;
2241
2242                 osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
2243                                                 0, 0);
2244                 if (type == OBJ_REQUEST_BIO)
2245                         osd_req_op_extent_osd_data_bio(osd_req, 0,
2246                                         obj_request->bio_list, length);
2247                 else
2248                         osd_req_op_extent_osd_data_pages(osd_req, 0,
2249                                         obj_request->pages, length,
2250                                         offset & ~PAGE_MASK, false, false);
2251
2252                 if (write_request)
2253                         rbd_osd_req_format_write(obj_request);
2254                 else
2255                         rbd_osd_req_format_read(obj_request);
2256
2257                 obj_request->img_offset = img_offset;
2258                 rbd_img_obj_request_add(img_request, obj_request);
2259
2260                 img_offset += length;
2261                 resid -= length;
2262         }
2263
2264         return 0;
2265
2266 out_partial:
2267         rbd_obj_request_put(obj_request);
2268 out_unwind:
2269         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2270                 rbd_obj_request_put(obj_request);
2271
2272         return -ENOMEM;
2273 }
2274
2275 static void
2276 rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
2277 {
2278         struct rbd_img_request *img_request;
2279         struct rbd_device *rbd_dev;
2280         struct page **pages;
2281         u32 page_count;
2282
2283         rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2284         rbd_assert(obj_request_img_data_test(obj_request));
2285         img_request = obj_request->img_request;
2286         rbd_assert(img_request);
2287
2288         rbd_dev = img_request->rbd_dev;
2289         rbd_assert(rbd_dev);
2290
2291         pages = obj_request->copyup_pages;
2292         rbd_assert(pages != NULL);
2293         obj_request->copyup_pages = NULL;
2294         page_count = obj_request->copyup_page_count;
2295         rbd_assert(page_count);
2296         obj_request->copyup_page_count = 0;
2297         ceph_release_page_vector(pages, page_count);
2298
2299         /*
2300          * We want the transfer count to reflect the size of the
2301          * original write request.  There is no such thing as a
2302          * successful short write, so if the request was successful
2303          * we can just set it to the originally-requested length.
2304          */
2305         if (!obj_request->result)
2306                 obj_request->xferred = obj_request->length;
2307
2308         /* Finish up with the normal image object callback */
2309
2310         rbd_img_obj_callback(obj_request);
2311 }
2312
2313 static void
2314 rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2315 {
2316         struct rbd_obj_request *orig_request;
2317         struct ceph_osd_request *osd_req;
2318         struct ceph_osd_client *osdc;
2319         struct rbd_device *rbd_dev;
2320         struct page **pages;
2321         u32 page_count;
2322         int img_result;
2323         u64 parent_length;
2324         u64 offset;
2325         u64 length;
2326
2327         rbd_assert(img_request_child_test(img_request));
2328
2329         /* First get what we need from the image request */
2330
2331         pages = img_request->copyup_pages;
2332         rbd_assert(pages != NULL);
2333         img_request->copyup_pages = NULL;
2334         page_count = img_request->copyup_page_count;
2335         rbd_assert(page_count);
2336         img_request->copyup_page_count = 0;
2337
2338         orig_request = img_request->obj_request;
2339         rbd_assert(orig_request != NULL);
2340         rbd_assert(obj_request_type_valid(orig_request->type));
2341         img_result = img_request->result;
2342         parent_length = img_request->length;
2343         rbd_assert(parent_length == img_request->xferred);
2344         rbd_img_request_put(img_request);
2345
2346         rbd_assert(orig_request->img_request);
2347         rbd_dev = orig_request->img_request->rbd_dev;
2348         rbd_assert(rbd_dev);
2349
2350         /*
2351          * If the overlap has become 0 (most likely because the
2352          * image has been flattened) we need to free the pages
2353          * and re-submit the original write request.
2354          */
2355         if (!rbd_dev->parent_overlap) {
2356                 struct ceph_osd_client *osdc;
2357
2358                 ceph_release_page_vector(pages, page_count);
2359                 osdc = &rbd_dev->rbd_client->client->osdc;
2360                 img_result = rbd_obj_request_submit(osdc, orig_request);
2361                 if (!img_result)
2362                         return;
2363         }
2364
2365         if (img_result)
2366                 goto out_err;
2367
2368         /*
2369          * The original osd request is of no use to use any more.
2370          * We need a new one that can hold the two ops in a copyup
2371          * request.  Allocate the new copyup osd request for the
2372          * original request, and release the old one.
2373          */
2374         img_result = -ENOMEM;
2375         osd_req = rbd_osd_req_create_copyup(orig_request);
2376         if (!osd_req)
2377                 goto out_err;
2378         rbd_osd_req_destroy(orig_request->osd_req);
2379         orig_request->osd_req = osd_req;
2380         orig_request->copyup_pages = pages;
2381         orig_request->copyup_page_count = page_count;
2382
2383         /* Initialize the copyup op */
2384
2385         osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2386         osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
2387                                                 false, false);
2388
2389         /* Then the original write request op */
2390
2391         offset = orig_request->offset;
2392         length = orig_request->length;
2393         osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
2394                                         offset, length, 0, 0);
2395         if (orig_request->type == OBJ_REQUEST_BIO)
2396                 osd_req_op_extent_osd_data_bio(osd_req, 1,
2397                                         orig_request->bio_list, length);
2398         else
2399                 osd_req_op_extent_osd_data_pages(osd_req, 1,
2400                                         orig_request->pages, length,
2401                                         offset & ~PAGE_MASK, false, false);
2402
2403         rbd_osd_req_format_write(orig_request);
2404
2405         /* All set, send it off. */
2406
2407         orig_request->callback = rbd_img_obj_copyup_callback;
2408         osdc = &rbd_dev->rbd_client->client->osdc;
2409         img_result = rbd_obj_request_submit(osdc, orig_request);
2410         if (!img_result)
2411                 return;
2412 out_err:
2413         /* Record the error code and complete the request */
2414
2415         orig_request->result = img_result;
2416         orig_request->xferred = 0;
2417         obj_request_done_set(orig_request);
2418         rbd_obj_request_complete(orig_request);
2419 }
2420
2421 /*
2422  * Read from the parent image the range of data that covers the
2423  * entire target of the given object request.  This is used for
2424  * satisfying a layered image write request when the target of an
2425  * object request from the image request does not exist.
2426  *
2427  * A page array big enough to hold the returned data is allocated
2428  * and supplied to rbd_img_request_fill() as the "data descriptor."
2429  * When the read completes, this page array will be transferred to
2430  * the original object request for the copyup operation.
2431  *
2432  * If an error occurs, record it as the result of the original
2433  * object request and mark it done so it gets completed.
2434  */
2435 static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
2436 {
2437         struct rbd_img_request *img_request = NULL;
2438         struct rbd_img_request *parent_request = NULL;
2439         struct rbd_device *rbd_dev;
2440         u64 img_offset;
2441         u64 length;
2442         struct page **pages = NULL;
2443         u32 page_count;
2444         int result;
2445
2446         rbd_assert(obj_request_img_data_test(obj_request));
2447         rbd_assert(obj_request_type_valid(obj_request->type));
2448
2449         img_request = obj_request->img_request;
2450         rbd_assert(img_request != NULL);
2451         rbd_dev = img_request->rbd_dev;
2452         rbd_assert(rbd_dev->parent != NULL);
2453
2454         /*
2455          * Determine the byte range covered by the object in the
2456          * child image to which the original request was to be sent.
2457          */
2458         img_offset = obj_request->img_offset - obj_request->offset;
2459         length = (u64)1 << rbd_dev->header.obj_order;
2460
2461         /*
2462          * There is no defined parent data beyond the parent
2463          * overlap, so limit what we read at that boundary if
2464          * necessary.
2465          */
2466         if (img_offset + length > rbd_dev->parent_overlap) {
2467                 rbd_assert(img_offset < rbd_dev->parent_overlap);
2468                 length = rbd_dev->parent_overlap - img_offset;
2469         }
2470
2471         /*
2472          * Allocate a page array big enough to receive the data read
2473          * from the parent.
2474          */
2475         page_count = (u32)calc_pages_for(0, length);
2476         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2477         if (IS_ERR(pages)) {
2478                 result = PTR_ERR(pages);
2479                 pages = NULL;
2480                 goto out_err;
2481         }
2482
2483         result = -ENOMEM;
2484         parent_request = rbd_parent_request_create(obj_request,
2485                                                 img_offset, length);
2486         if (!parent_request)
2487                 goto out_err;
2488
2489         result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
2490         if (result)
2491                 goto out_err;
2492         parent_request->copyup_pages = pages;
2493         parent_request->copyup_page_count = page_count;
2494
2495         parent_request->callback = rbd_img_obj_parent_read_full_callback;
2496         result = rbd_img_request_submit(parent_request);
2497         if (!result)
2498                 return 0;
2499
2500         parent_request->copyup_pages = NULL;
2501         parent_request->copyup_page_count = 0;
2502         parent_request->obj_request = NULL;
2503         rbd_obj_request_put(obj_request);
2504 out_err:
2505         if (pages)
2506                 ceph_release_page_vector(pages, page_count);
2507         if (parent_request)
2508                 rbd_img_request_put(parent_request);
2509         obj_request->result = result;
2510         obj_request->xferred = 0;
2511         obj_request_done_set(obj_request);
2512
2513         return result;
2514 }
2515
2516 static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2517 {
2518         struct rbd_obj_request *orig_request;
2519         int result;
2520
2521         rbd_assert(!obj_request_img_data_test(obj_request));
2522
2523         /*
2524          * All we need from the object request is the original
2525          * request and the result of the STAT op.  Grab those, then
2526          * we're done with the request.
2527          */
2528         orig_request = obj_request->obj_request;
2529         obj_request->obj_request = NULL;
2530         rbd_assert(orig_request);
2531         rbd_assert(orig_request->img_request);
2532
2533         result = obj_request->result;
2534         obj_request->result = 0;
2535
2536         dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2537                 obj_request, orig_request, result,
2538                 obj_request->xferred, obj_request->length);
2539         rbd_obj_request_put(obj_request);
2540
2541         rbd_assert(orig_request);
2542         rbd_assert(orig_request->img_request);
2543
2544         /*
2545          * Our only purpose here is to determine whether the object
2546          * exists, and we don't want to treat the non-existence as
2547          * an error.  If something else comes back, transfer the
2548          * error to the original request and complete it now.
2549          */
2550         if (!result) {
2551                 obj_request_existence_set(orig_request, true);
2552         } else if (result == -ENOENT) {
2553                 obj_request_existence_set(orig_request, false);
2554         } else if (result) {
2555                 orig_request->result = result;
2556                 goto out;
2557         }
2558
2559         /*
2560          * Resubmit the original request now that we have recorded
2561          * whether the target object exists.
2562          */
2563         orig_request->result = rbd_img_obj_request_submit(orig_request);
2564 out:
2565         if (orig_request->result)
2566                 rbd_obj_request_complete(orig_request);
2567         rbd_obj_request_put(orig_request);
2568 }
2569
2570 static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2571 {
2572         struct rbd_obj_request *stat_request;
2573         struct rbd_device *rbd_dev;
2574         struct ceph_osd_client *osdc;
2575         struct page **pages = NULL;
2576         u32 page_count;
2577         size_t size;
2578         int ret;
2579
2580         /*
2581          * The response data for a STAT call consists of:
2582          *     le64 length;
2583          *     struct {
2584          *         le32 tv_sec;
2585          *         le32 tv_nsec;
2586          *     } mtime;
2587          */
2588         size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2589         page_count = (u32)calc_pages_for(0, size);
2590         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2591         if (IS_ERR(pages))
2592                 return PTR_ERR(pages);
2593
2594         ret = -ENOMEM;
2595         stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2596                                                         OBJ_REQUEST_PAGES);
2597         if (!stat_request)
2598                 goto out;
2599
2600         rbd_obj_request_get(obj_request);
2601         stat_request->obj_request = obj_request;
2602         stat_request->pages = pages;
2603         stat_request->page_count = page_count;
2604
2605         rbd_assert(obj_request->img_request);
2606         rbd_dev = obj_request->img_request->rbd_dev;
2607         stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2608                                                 stat_request);
2609         if (!stat_request->osd_req)
2610                 goto out;
2611         stat_request->callback = rbd_img_obj_exists_callback;
2612
2613         osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2614         osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2615                                         false, false);
2616         rbd_osd_req_format_read(stat_request);
2617
2618         osdc = &rbd_dev->rbd_client->client->osdc;
2619         ret = rbd_obj_request_submit(osdc, stat_request);
2620 out:
2621         if (ret)
2622                 rbd_obj_request_put(obj_request);
2623
2624         return ret;
2625 }
2626
2627 static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2628 {
2629         struct rbd_img_request *img_request;
2630         struct rbd_device *rbd_dev;
2631         bool known;
2632
2633         rbd_assert(obj_request_img_data_test(obj_request));
2634
2635         img_request = obj_request->img_request;
2636         rbd_assert(img_request);
2637         rbd_dev = img_request->rbd_dev;
2638
2639         /*
2640          * Only writes to layered images need special handling.
2641          * Reads and non-layered writes are simple object requests.
2642          * Layered writes that start beyond the end of the overlap
2643          * with the parent have no parent data, so they too are
2644          * simple object requests.  Finally, if the target object is
2645          * known to already exist, its parent data has already been
2646          * copied, so a write to the object can also be handled as a
2647          * simple object request.
2648          */
2649         if (!img_request_write_test(img_request) ||
2650                 !img_request_layered_test(img_request) ||
2651                 rbd_dev->parent_overlap <= obj_request->img_offset ||
2652                 ((known = obj_request_known_test(obj_request)) &&
2653                         obj_request_exists_test(obj_request))) {
2654
2655                 struct rbd_device *rbd_dev;
2656                 struct ceph_osd_client *osdc;
2657
2658                 rbd_dev = obj_request->img_request->rbd_dev;
2659                 osdc = &rbd_dev->rbd_client->client->osdc;
2660
2661                 return rbd_obj_request_submit(osdc, obj_request);
2662         }
2663
2664         /*
2665          * It's a layered write.  The target object might exist but
2666          * we may not know that yet.  If we know it doesn't exist,
2667          * start by reading the data for the full target object from
2668          * the parent so we can use it for a copyup to the target.
2669          */
2670         if (known)
2671                 return rbd_img_obj_parent_read_full(obj_request);
2672
2673         /* We don't know whether the target exists.  Go find out. */
2674
2675         return rbd_img_obj_exists_submit(obj_request);
2676 }
2677
2678 static int rbd_img_request_submit(struct rbd_img_request *img_request)
2679 {
2680         struct rbd_obj_request *obj_request;
2681         struct rbd_obj_request *next_obj_request;
2682
2683         dout("%s: img %p\n", __func__, img_request);
2684         for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2685                 int ret;
2686
2687                 ret = rbd_img_obj_request_submit(obj_request);
2688                 if (ret)
2689                         return ret;
2690         }
2691
2692         return 0;
2693 }
2694
2695 static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
2696 {
2697         struct rbd_obj_request *obj_request;
2698         struct rbd_device *rbd_dev;
2699         u64 obj_end;
2700         u64 img_xferred;
2701         int img_result;
2702
2703         rbd_assert(img_request_child_test(img_request));
2704
2705         /* First get what we need from the image request and release it */
2706
2707         obj_request = img_request->obj_request;
2708         img_xferred = img_request->xferred;
2709         img_result = img_request->result;
2710         rbd_img_request_put(img_request);
2711
2712         /*
2713          * If the overlap has become 0 (most likely because the
2714          * image has been flattened) we need to re-submit the
2715          * original request.
2716          */
2717         rbd_assert(obj_request);
2718         rbd_assert(obj_request->img_request);
2719         rbd_dev = obj_request->img_request->rbd_dev;
2720         if (!rbd_dev->parent_overlap) {
2721                 struct ceph_osd_client *osdc;
2722
2723                 osdc = &rbd_dev->rbd_client->client->osdc;
2724                 img_result = rbd_obj_request_submit(osdc, obj_request);
2725                 if (!img_result)
2726                         return;
2727         }
2728
2729         obj_request->result = img_result;
2730         if (obj_request->result)
2731                 goto out;
2732
2733         /*
2734          * We need to zero anything beyond the parent overlap
2735          * boundary.  Since rbd_img_obj_request_read_callback()
2736          * will zero anything beyond the end of a short read, an
2737          * easy way to do this is to pretend the data from the
2738          * parent came up short--ending at the overlap boundary.
2739          */
2740         rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2741         obj_end = obj_request->img_offset + obj_request->length;
2742         if (obj_end > rbd_dev->parent_overlap) {
2743                 u64 xferred = 0;
2744
2745                 if (obj_request->img_offset < rbd_dev->parent_overlap)
2746                         xferred = rbd_dev->parent_overlap -
2747                                         obj_request->img_offset;
2748
2749                 obj_request->xferred = min(img_xferred, xferred);
2750         } else {
2751                 obj_request->xferred = img_xferred;
2752         }
2753 out:
2754         rbd_img_obj_request_read_callback(obj_request);
2755         rbd_obj_request_complete(obj_request);
2756 }
2757
2758 static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
2759 {
2760         struct rbd_img_request *img_request;
2761         int result;
2762
2763         rbd_assert(obj_request_img_data_test(obj_request));
2764         rbd_assert(obj_request->img_request != NULL);
2765         rbd_assert(obj_request->result == (s32) -ENOENT);
2766         rbd_assert(obj_request_type_valid(obj_request->type));
2767
2768         /* rbd_read_finish(obj_request, obj_request->length); */
2769         img_request = rbd_parent_request_create(obj_request,
2770                                                 obj_request->img_offset,
2771                                                 obj_request->length);
2772         result = -ENOMEM;
2773         if (!img_request)
2774                 goto out_err;
2775
2776         if (obj_request->type == OBJ_REQUEST_BIO)
2777                 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2778                                                 obj_request->bio_list);
2779         else
2780                 result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
2781                                                 obj_request->pages);
2782         if (result)
2783                 goto out_err;
2784
2785         img_request->callback = rbd_img_parent_read_callback;
2786         result = rbd_img_request_submit(img_request);
2787         if (result)
2788                 goto out_err;
2789
2790         return;
2791 out_err:
2792         if (img_request)
2793                 rbd_img_request_put(img_request);
2794         obj_request->result = result;
2795         obj_request->xferred = 0;
2796         obj_request_done_set(obj_request);
2797 }
2798
2799 static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id)
2800 {
2801         struct rbd_obj_request *obj_request;
2802         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2803         int ret;
2804
2805         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2806                                                         OBJ_REQUEST_NODATA);
2807         if (!obj_request)
2808                 return -ENOMEM;
2809
2810         ret = -ENOMEM;
2811         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2812         if (!obj_request->osd_req)
2813                 goto out;
2814         obj_request->callback = rbd_obj_request_put;
2815
2816         osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2817                                         notify_id, 0, 0);
2818         rbd_osd_req_format_read(obj_request);
2819
2820         ret = rbd_obj_request_submit(osdc, obj_request);
2821 out:
2822         if (ret)
2823                 rbd_obj_request_put(obj_request);
2824
2825         return ret;
2826 }
2827
2828 static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2829 {
2830         struct rbd_device *rbd_dev = (struct rbd_device *)data;
2831         int ret;
2832
2833         if (!rbd_dev)
2834                 return;
2835
2836         dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2837                 rbd_dev->header_name, (unsigned long long)notify_id,
2838                 (unsigned int)opcode);
2839         ret = rbd_dev_refresh(rbd_dev);
2840         if (ret)
2841                 rbd_warn(rbd_dev, ": header refresh error (%d)\n", ret);
2842
2843         rbd_obj_notify_ack(rbd_dev, notify_id);
2844 }
2845
2846 /*
2847  * Request sync osd watch/unwatch.  The value of "start" determines
2848  * whether a watch request is being initiated or torn down.
2849  */
2850 static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
2851 {
2852         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2853         struct rbd_obj_request *obj_request;
2854         int ret;
2855
2856         rbd_assert(start ^ !!rbd_dev->watch_event);
2857         rbd_assert(start ^ !!rbd_dev->watch_request);
2858
2859         if (start) {
2860                 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
2861                                                 &rbd_dev->watch_event);
2862                 if (ret < 0)
2863                         return ret;
2864                 rbd_assert(rbd_dev->watch_event != NULL);
2865         }
2866
2867         ret = -ENOMEM;
2868         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2869                                                         OBJ_REQUEST_NODATA);
2870         if (!obj_request)
2871                 goto out_cancel;
2872
2873         obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2874         if (!obj_request->osd_req)
2875                 goto out_cancel;
2876
2877         if (start)
2878                 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
2879         else
2880                 ceph_osdc_unregister_linger_request(osdc,
2881                                         rbd_dev->watch_request->osd_req);
2882
2883         osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
2884                                 rbd_dev->watch_event->cookie, 0, start ? 1 : 0);
2885         rbd_osd_req_format_write(obj_request);
2886
2887         ret = rbd_obj_request_submit(osdc, obj_request);
2888         if (ret)
2889                 goto out_cancel;
2890         ret = rbd_obj_request_wait(obj_request);
2891         if (ret)
2892                 goto out_cancel;
2893         ret = obj_request->result;
2894         if (ret)
2895                 goto out_cancel;
2896
2897         /*
2898          * A watch request is set to linger, so the underlying osd
2899          * request won't go away until we unregister it.  We retain
2900          * a pointer to the object request during that time (in
2901          * rbd_dev->watch_request), so we'll keep a reference to
2902          * it.  We'll drop that reference (below) after we've
2903          * unregistered it.
2904          */
2905         if (start) {
2906                 rbd_dev->watch_request = obj_request;
2907
2908                 return 0;
2909         }
2910
2911         /* We have successfully torn down the watch request */
2912
2913         rbd_obj_request_put(rbd_dev->watch_request);
2914         rbd_dev->watch_request = NULL;
2915 out_cancel:
2916         /* Cancel the event if we're tearing down, or on error */
2917         ceph_osdc_cancel_event(rbd_dev->watch_event);
2918         rbd_dev->watch_event = NULL;
2919         if (obj_request)
2920                 rbd_obj_request_put(obj_request);
2921
2922         return ret;
2923 }
2924
2925 /*
2926  * Synchronous osd object method call.  Returns the number of bytes
2927  * returned in the outbound buffer, or a negative error code.
2928  */
2929 static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
2930                              const char *object_name,
2931                              const char *class_name,
2932                              const char *method_name,
2933                              const void *outbound,
2934                              size_t outbound_size,
2935                              void *inbound,
2936                              size_t inbound_size)
2937 {
2938         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2939         struct rbd_obj_request *obj_request;
2940         struct page **pages;
2941         u32 page_count;
2942         int ret;
2943
2944         /*
2945          * Method calls are ultimately read operations.  The result
2946          * should placed into the inbound buffer provided.  They
2947          * also supply outbound data--parameters for the object
2948          * method.  Currently if this is present it will be a
2949          * snapshot id.
2950          */
2951         page_count = (u32)calc_pages_for(0, inbound_size);
2952         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2953         if (IS_ERR(pages))
2954                 return PTR_ERR(pages);
2955
2956         ret = -ENOMEM;
2957         obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
2958                                                         OBJ_REQUEST_PAGES);
2959         if (!obj_request)
2960                 goto out;
2961
2962         obj_request->pages = pages;
2963         obj_request->page_count = page_count;
2964
2965         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2966         if (!obj_request->osd_req)
2967                 goto out;
2968
2969         osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
2970                                         class_name, method_name);
2971         if (outbound_size) {
2972                 struct ceph_pagelist *pagelist;
2973
2974                 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
2975                 if (!pagelist)
2976                         goto out;
2977
2978                 ceph_pagelist_init(pagelist);
2979                 ceph_pagelist_append(pagelist, outbound, outbound_size);
2980                 osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
2981                                                 pagelist);
2982         }
2983         osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2984                                         obj_request->pages, inbound_size,
2985                                         0, false, false);
2986         rbd_osd_req_format_read(obj_request);
2987
2988         ret = rbd_obj_request_submit(osdc, obj_request);
2989         if (ret)
2990                 goto out;
2991         ret = rbd_obj_request_wait(obj_request);
2992         if (ret)
2993                 goto out;
2994
2995         ret = obj_request->result;
2996         if (ret < 0)
2997                 goto out;
2998
2999         rbd_assert(obj_request->xferred < (u64)INT_MAX);
3000         ret = (int)obj_request->xferred;
3001         ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
3002 out:
3003         if (obj_request)
3004                 rbd_obj_request_put(obj_request);
3005         else
3006                 ceph_release_page_vector(pages, page_count);
3007
3008         return ret;
3009 }
3010
3011 static void rbd_request_fn(struct request_queue *q)
3012                 __releases(q->queue_lock) __acquires(q->queue_lock)
3013 {
3014         struct rbd_device *rbd_dev = q->queuedata;
3015         bool read_only = rbd_dev->mapping.read_only;
3016         struct request *rq;
3017         int result;
3018
3019         while ((rq = blk_fetch_request(q))) {
3020                 bool write_request = rq_data_dir(rq) == WRITE;
3021                 struct rbd_img_request *img_request;
3022                 u64 offset;
3023                 u64 length;
3024
3025                 /* Ignore any non-FS requests that filter through. */
3026
3027                 if (rq->cmd_type != REQ_TYPE_FS) {
3028                         dout("%s: non-fs request type %d\n", __func__,
3029                                 (int) rq->cmd_type);
3030                         __blk_end_request_all(rq, 0);
3031                         continue;
3032                 }
3033
3034                 /* Ignore/skip any zero-length requests */
3035
3036                 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
3037                 length = (u64) blk_rq_bytes(rq);
3038
3039                 if (!length) {
3040                         dout("%s: zero-length request\n", __func__);
3041                         __blk_end_request_all(rq, 0);
3042                         continue;
3043                 }
3044
3045                 spin_unlock_irq(q->queue_lock);
3046
3047                 /* Disallow writes to a read-only device */
3048
3049                 if (write_request) {
3050                         result = -EROFS;
3051                         if (read_only)
3052                                 goto end_request;
3053                         rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
3054                 }
3055
3056                 /*
3057                  * Quit early if the mapped snapshot no longer
3058                  * exists.  It's still possible the snapshot will
3059                  * have disappeared by the time our request arrives
3060                  * at the osd, but there's no sense in sending it if
3061                  * we already know.
3062                  */
3063                 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3064                         dout("request for non-existent snapshot");
3065                         rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3066                         result = -ENXIO;
3067                         goto end_request;
3068                 }
3069
3070                 result = -EINVAL;
3071                 if (offset && length > U64_MAX - offset + 1) {
3072                         rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
3073                                 offset, length);
3074                         goto end_request;       /* Shouldn't happen */
3075                 }
3076
3077                 result = -EIO;
3078                 if (offset + length > rbd_dev->mapping.size) {
3079                         rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
3080                                 offset, length, rbd_dev->mapping.size);
3081                         goto end_request;
3082                 }
3083
3084                 result = -ENOMEM;
3085                 img_request = rbd_img_request_create(rbd_dev, offset, length,
3086                                                         write_request);
3087                 if (!img_request)
3088                         goto end_request;
3089
3090                 img_request->rq = rq;
3091
3092                 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3093                                                 rq->bio);
3094                 if (!result)
3095                         result = rbd_img_request_submit(img_request);
3096                 if (result)
3097                         rbd_img_request_put(img_request);
3098 end_request:
3099                 spin_lock_irq(q->queue_lock);
3100                 if (result < 0) {
3101                         rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
3102                                 write_request ? "write" : "read",
3103                                 length, offset, result);
3104
3105                         __blk_end_request_all(rq, result);
3106                 }
3107         }
3108 }
3109
3110 /*
3111  * a queue callback. Makes sure that we don't create a bio that spans across
3112  * multiple osd objects. One exception would be with a single page bios,
3113  * which we handle later at bio_chain_clone_range()
3114  */
3115 static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
3116                           struct bio_vec *bvec)
3117 {
3118         struct rbd_device *rbd_dev = q->queuedata;
3119         sector_t sector_offset;
3120         sector_t sectors_per_obj;
3121         sector_t obj_sector_offset;
3122         int ret;
3123
3124         /*
3125          * Find how far into its rbd object the partition-relative
3126          * bio start sector is to offset relative to the enclosing
3127          * device.
3128          */
3129         sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
3130         sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
3131         obj_sector_offset = sector_offset & (sectors_per_obj - 1);
3132
3133         /*
3134          * Compute the number of bytes from that offset to the end
3135          * of the object.  Account for what's already used by the bio.
3136          */
3137         ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
3138         if (ret > bmd->bi_size)
3139                 ret -= bmd->bi_size;
3140         else
3141                 ret = 0;
3142
3143         /*
3144          * Don't send back more than was asked for.  And if the bio
3145          * was empty, let the whole thing through because:  "Note
3146          * that a block device *must* allow a single page to be
3147          * added to an empty bio."
3148          */
3149         rbd_assert(bvec->bv_len <= PAGE_SIZE);
3150         if (ret > (int) bvec->bv_len || !bmd->bi_size)
3151                 ret = (int) bvec->bv_len;
3152
3153         return ret;
3154 }
3155
3156 static void rbd_free_disk(struct rbd_device *rbd_dev)
3157 {
3158         struct gendisk *disk = rbd_dev->disk;
3159
3160         if (!disk)
3161                 return;
3162
3163         rbd_dev->disk = NULL;
3164         if (disk->flags & GENHD_FL_UP) {
3165                 del_gendisk(disk);
3166                 if (disk->queue)
3167                         blk_cleanup_queue(disk->queue);
3168         }
3169         put_disk(disk);
3170 }
3171
3172 static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3173                                 const char *object_name,
3174                                 u64 offset, u64 length, void *buf)
3175
3176 {
3177         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3178         struct rbd_obj_request *obj_request;
3179         struct page **pages = NULL;
3180         u32 page_count;
3181         size_t size;
3182         int ret;
3183
3184         page_count = (u32) calc_pages_for(offset, length);
3185         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
3186         if (IS_ERR(pages))
3187                 ret = PTR_ERR(pages);
3188
3189         ret = -ENOMEM;
3190         obj_request = rbd_obj_request_create(object_name, offset, length,
3191                                                         OBJ_REQUEST_PAGES);
3192         if (!obj_request)
3193                 goto out;
3194
3195         obj_request->pages = pages;
3196         obj_request->page_count = page_count;
3197
3198         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
3199         if (!obj_request->osd_req)
3200                 goto out;
3201
3202         osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
3203                                         offset, length, 0, 0);
3204         osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
3205                                         obj_request->pages,
3206                                         obj_request->length,
3207                                         obj_request->offset & ~PAGE_MASK,
3208                                         false, false);
3209         rbd_osd_req_format_read(obj_request);
3210
3211         ret = rbd_obj_request_submit(osdc, obj_request);
3212         if (ret)
3213                 goto out;
3214         ret = rbd_obj_request_wait(obj_request);
3215         if (ret)
3216                 goto out;
3217
3218         ret = obj_request->result;
3219         if (ret < 0)
3220                 goto out;
3221
3222         rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
3223         size = (size_t) obj_request->xferred;
3224         ceph_copy_from_page_vector(pages, buf, 0, size);
3225         rbd_assert(size <= (size_t)INT_MAX);
3226         ret = (int)size;
3227 out:
3228         if (obj_request)
3229                 rbd_obj_request_put(obj_request);
3230         else
3231                 ceph_release_page_vector(pages, page_count);
3232
3233         return ret;
3234 }
3235
3236 /*
3237  * Read the complete header for the given rbd device.  On successful
3238  * return, the rbd_dev->header field will contain up-to-date
3239  * information about the image.
3240  */
3241 static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
3242 {
3243         struct rbd_image_header_ondisk *ondisk = NULL;
3244         u32 snap_count = 0;
3245         u64 names_size = 0;
3246         u32 want_count;
3247         int ret;
3248
3249         /*
3250          * The complete header will include an array of its 64-bit
3251          * snapshot ids, followed by the names of those snapshots as
3252          * a contiguous block of NUL-terminated strings.  Note that
3253          * the number of snapshots could change by the time we read
3254          * it in, in which case we re-read it.
3255          */
3256         do {
3257                 size_t size;
3258
3259                 kfree(ondisk);
3260
3261                 size = sizeof (*ondisk);
3262                 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
3263                 size += names_size;
3264                 ondisk = kmalloc(size, GFP_KERNEL);
3265                 if (!ondisk)
3266                         return -ENOMEM;
3267
3268                 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
3269                                        0, size, ondisk);
3270                 if (ret < 0)
3271                         goto out;
3272                 if ((size_t)ret < size) {
3273                         ret = -ENXIO;
3274                         rbd_warn(rbd_dev, "short header read (want %zd got %d)",
3275                                 size, ret);
3276                         goto out;
3277                 }
3278                 if (!rbd_dev_ondisk_valid(ondisk)) {
3279                         ret = -ENXIO;
3280                         rbd_warn(rbd_dev, "invalid header");
3281                         goto out;
3282                 }
3283
3284                 names_size = le64_to_cpu(ondisk->snap_names_len);
3285                 want_count = snap_count;
3286                 snap_count = le32_to_cpu(ondisk->snap_count);
3287         } while (snap_count != want_count);
3288
3289         ret = rbd_header_from_disk(rbd_dev, ondisk);
3290 out:
3291         kfree(ondisk);
3292
3293         return ret;
3294 }
3295
3296 /*
3297  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
3298  * has disappeared from the (just updated) snapshot context.
3299  */
3300 static void rbd_exists_validate(struct rbd_device *rbd_dev)
3301 {
3302         u64 snap_id;
3303
3304         if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
3305                 return;
3306
3307         snap_id = rbd_dev->spec->snap_id;
3308         if (snap_id == CEPH_NOSNAP)
3309                 return;
3310
3311         if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
3312                 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3313 }
3314
3315 static int rbd_dev_refresh(struct rbd_device *rbd_dev)
3316 {
3317         u64 mapping_size;
3318         int ret;
3319
3320         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3321         mapping_size = rbd_dev->mapping.size;
3322         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3323         if (rbd_dev->image_format == 1)
3324                 ret = rbd_dev_v1_header_info(rbd_dev);
3325         else
3326                 ret = rbd_dev_v2_header_info(rbd_dev);
3327
3328         /* If it's a mapped snapshot, validate its EXISTS flag */
3329
3330         rbd_exists_validate(rbd_dev);
3331         mutex_unlock(&ctl_mutex);
3332         if (mapping_size != rbd_dev->mapping.size) {
3333                 sector_t size;
3334
3335                 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
3336                 dout("setting size to %llu sectors", (unsigned long long)size);
3337                 set_capacity(rbd_dev->disk, size);
3338                 revalidate_disk(rbd_dev->disk);
3339         }
3340
3341         return ret;
3342 }
3343
3344 static int rbd_init_disk(struct rbd_device *rbd_dev)
3345 {
3346         struct gendisk *disk;
3347         struct request_queue *q;
3348         u64 segment_size;
3349
3350         /* create gendisk info */
3351         disk = alloc_disk(RBD_MINORS_PER_MAJOR);
3352         if (!disk)
3353                 return -ENOMEM;
3354
3355         snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3356                  rbd_dev->dev_id);
3357         disk->major = rbd_dev->major;
3358         disk->first_minor = 0;
3359         disk->fops = &rbd_bd_ops;
3360         disk->private_data = rbd_dev;
3361
3362         q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
3363         if (!q)
3364                 goto out_disk;
3365
3366         /* We use the default size, but let's be explicit about it. */
3367         blk_queue_physical_block_size(q, SECTOR_SIZE);
3368
3369         /* set io sizes to object size */
3370         segment_size = rbd_obj_bytes(&rbd_dev->header);
3371         blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3372         blk_queue_max_segment_size(q, segment_size);
3373         blk_queue_io_min(q, segment_size);
3374         blk_queue_io_opt(q, segment_size);
3375
3376         blk_queue_merge_bvec(q, rbd_merge_bvec);
3377         disk->queue = q;
3378
3379         q->queuedata = rbd_dev;
3380
3381         rbd_dev->disk = disk;
3382
3383         return 0;
3384 out_disk:
3385         put_disk(disk);
3386
3387         return -ENOMEM;
3388 }
3389
3390 /*
3391   sysfs
3392 */
3393
3394 static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3395 {
3396         return container_of(dev, struct rbd_device, dev);
3397 }
3398
3399 static ssize_t rbd_size_show(struct device *dev,
3400                              struct device_attribute *attr, char *buf)
3401 {
3402         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3403
3404         return sprintf(buf, "%llu\n",
3405                 (unsigned long long)rbd_dev->mapping.size);
3406 }
3407
3408 /*
3409  * Note this shows the features for whatever's mapped, which is not
3410  * necessarily the base image.
3411  */
3412 static ssize_t rbd_features_show(struct device *dev,
3413                              struct device_attribute *attr, char *buf)
3414 {
3415         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3416
3417         return sprintf(buf, "0x%016llx\n",
3418                         (unsigned long long)rbd_dev->mapping.features);
3419 }
3420
3421 static ssize_t rbd_major_show(struct device *dev,
3422                               struct device_attribute *attr, char *buf)
3423 {
3424         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3425
3426         if (rbd_dev->major)
3427                 return sprintf(buf, "%d\n", rbd_dev->major);
3428
3429         return sprintf(buf, "(none)\n");
3430
3431 }
3432
3433 static ssize_t rbd_client_id_show(struct device *dev,
3434                                   struct device_attribute *attr, char *buf)
3435 {
3436         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3437
3438         return sprintf(buf, "client%lld\n",
3439                         ceph_client_id(rbd_dev->rbd_client->client));
3440 }
3441
3442 static ssize_t rbd_pool_show(struct device *dev,
3443                              struct device_attribute *attr, char *buf)
3444 {
3445         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3446
3447         return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3448 }
3449
3450 static ssize_t rbd_pool_id_show(struct device *dev,
3451                              struct device_attribute *attr, char *buf)
3452 {
3453         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3454
3455         return sprintf(buf, "%llu\n",
3456                         (unsigned long long) rbd_dev->spec->pool_id);
3457 }
3458
3459 static ssize_t rbd_name_show(struct device *dev,
3460                              struct device_attribute *attr, char *buf)
3461 {
3462         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3463
3464         if (rbd_dev->spec->image_name)
3465                 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3466
3467         return sprintf(buf, "(unknown)\n");
3468 }
3469
3470 static ssize_t rbd_image_id_show(struct device *dev,
3471                              struct device_attribute *attr, char *buf)
3472 {
3473         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3474
3475         return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3476 }
3477
3478 /*
3479  * Shows the name of the currently-mapped snapshot (or
3480  * RBD_SNAP_HEAD_NAME for the base image).
3481  */
3482 static ssize_t rbd_snap_show(struct device *dev,
3483                              struct device_attribute *attr,
3484                              char *buf)
3485 {
3486         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3487
3488         return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3489 }
3490
3491 /*
3492  * For an rbd v2 image, shows the pool id, image id, and snapshot id
3493  * for the parent image.  If there is no parent, simply shows
3494  * "(no parent image)".
3495  */
3496 static ssize_t rbd_parent_show(struct device *dev,
3497                              struct device_attribute *attr,
3498                              char *buf)
3499 {
3500         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3501         struct rbd_spec *spec = rbd_dev->parent_spec;
3502         int count;
3503         char *bufp = buf;
3504
3505         if (!spec)
3506                 return sprintf(buf, "(no parent image)\n");
3507
3508         count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
3509                         (unsigned long long) spec->pool_id, spec->pool_name);
3510         if (count < 0)
3511                 return count;
3512         bufp += count;
3513
3514         count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
3515                         spec->image_name ? spec->image_name : "(unknown)");
3516         if (count < 0)
3517                 return count;
3518         bufp += count;
3519
3520         count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
3521                         (unsigned long long) spec->snap_id, spec->snap_name);
3522         if (count < 0)
3523                 return count;
3524         bufp += count;
3525
3526         count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
3527         if (count < 0)
3528                 return count;
3529         bufp += count;
3530
3531         return (ssize_t) (bufp - buf);
3532 }
3533
3534 static ssize_t rbd_image_refresh(struct device *dev,
3535                                  struct device_attribute *attr,
3536                                  const char *buf,
3537                                  size_t size)
3538 {
3539         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3540         int ret;
3541
3542         ret = rbd_dev_refresh(rbd_dev);
3543         if (ret)
3544                 rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret);
3545
3546         return ret < 0 ? ret : size;
3547 }
3548
3549 static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
3550 static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
3551 static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
3552 static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
3553 static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
3554 static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
3555 static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
3556 static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
3557 static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3558 static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
3559 static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
3560
3561 static struct attribute *rbd_attrs[] = {
3562         &dev_attr_size.attr,
3563         &dev_attr_features.attr,
3564         &dev_attr_major.attr,
3565         &dev_attr_client_id.attr,
3566         &dev_attr_pool.attr,
3567         &dev_attr_pool_id.attr,
3568         &dev_attr_name.attr,
3569         &dev_attr_image_id.attr,
3570         &dev_attr_current_snap.attr,
3571         &dev_attr_parent.attr,
3572         &dev_attr_refresh.attr,
3573         NULL
3574 };
3575
3576 static struct attribute_group rbd_attr_group = {
3577         .attrs = rbd_attrs,
3578 };
3579
3580 static const struct attribute_group *rbd_attr_groups[] = {
3581         &rbd_attr_group,
3582         NULL
3583 };
3584
3585 static void rbd_sysfs_dev_release(struct device *dev)
3586 {
3587 }
3588
3589 static struct device_type rbd_device_type = {
3590         .name           = "rbd",
3591         .groups         = rbd_attr_groups,
3592         .release        = rbd_sysfs_dev_release,
3593 };
3594
3595 static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
3596 {
3597         kref_get(&spec->kref);
3598
3599         return spec;
3600 }
3601
3602 static void rbd_spec_free(struct kref *kref);
3603 static void rbd_spec_put(struct rbd_spec *spec)
3604 {
3605         if (spec)
3606                 kref_put(&spec->kref, rbd_spec_free);
3607 }
3608
3609 static struct rbd_spec *rbd_spec_alloc(void)
3610 {
3611         struct rbd_spec *spec;
3612
3613         spec = kzalloc(sizeof (*spec), GFP_KERNEL);
3614         if (!spec)
3615                 return NULL;
3616         kref_init(&spec->kref);
3617
3618         return spec;
3619 }
3620
3621 static void rbd_spec_free(struct kref *kref)
3622 {
3623         struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
3624
3625         kfree(spec->pool_name);
3626         kfree(spec->image_id);
3627         kfree(spec->image_name);
3628         kfree(spec->snap_name);
3629         kfree(spec);
3630 }
3631
3632 static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3633                                 struct rbd_spec *spec)
3634 {
3635         struct rbd_device *rbd_dev;
3636
3637         rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3638         if (!rbd_dev)
3639                 return NULL;
3640
3641         spin_lock_init(&rbd_dev->lock);
3642         rbd_dev->flags = 0;
3643         atomic_set(&rbd_dev->parent_ref, 0);
3644         INIT_LIST_HEAD(&rbd_dev->node);
3645         init_rwsem(&rbd_dev->header_rwsem);
3646
3647         rbd_dev->spec = spec;
3648         rbd_dev->rbd_client = rbdc;
3649
3650         /* Initialize the layout used for all rbd requests */
3651
3652         rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3653         rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
3654         rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3655         rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
3656
3657         return rbd_dev;
3658 }
3659
3660 static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3661 {
3662         rbd_put_client(rbd_dev->rbd_client);
3663         rbd_spec_put(rbd_dev->spec);
3664         kfree(rbd_dev);
3665 }
3666
3667 /*
3668  * Get the size and object order for an image snapshot, or if
3669  * snap_id is CEPH_NOSNAP, gets this information for the base
3670  * image.
3671  */
3672 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
3673                                 u8 *order, u64 *snap_size)
3674 {
3675         __le64 snapid = cpu_to_le64(snap_id);
3676         int ret;
3677         struct {
3678                 u8 order;
3679                 __le64 size;
3680         } __attribute__ ((packed)) size_buf = { 0 };
3681
3682         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3683                                 "rbd", "get_size",
3684                                 &snapid, sizeof (snapid),
3685                                 &size_buf, sizeof (size_buf));
3686         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3687         if (ret < 0)
3688                 return ret;
3689         if (ret < sizeof (size_buf))
3690                 return -ERANGE;
3691
3692         if (order)
3693                 *order = size_buf.order;
3694         *snap_size = le64_to_cpu(size_buf.size);
3695
3696         dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
3697                 (unsigned long long)snap_id, (unsigned int)*order,
3698                 (unsigned long long)*snap_size);
3699
3700         return 0;
3701 }
3702
3703 static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
3704 {
3705         return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
3706                                         &rbd_dev->header.obj_order,
3707                                         &rbd_dev->header.image_size);
3708 }
3709
3710 static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
3711 {
3712         void *reply_buf;
3713         int ret;
3714         void *p;
3715
3716         reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
3717         if (!reply_buf)
3718                 return -ENOMEM;
3719
3720         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3721                                 "rbd", "get_object_prefix", NULL, 0,
3722                                 reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
3723         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3724         if (ret < 0)
3725                 goto out;
3726
3727         p = reply_buf;
3728         rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
3729                                                 p + ret, NULL, GFP_NOIO);
3730         ret = 0;
3731
3732         if (IS_ERR(rbd_dev->header.object_prefix)) {
3733                 ret = PTR_ERR(rbd_dev->header.object_prefix);
3734                 rbd_dev->header.object_prefix = NULL;
3735         } else {
3736                 dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
3737         }
3738 out:
3739         kfree(reply_buf);
3740
3741         return ret;
3742 }
3743
3744 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3745                 u64 *snap_features)
3746 {
3747         __le64 snapid = cpu_to_le64(snap_id);
3748         struct {
3749                 __le64 features;
3750                 __le64 incompat;
3751         } __attribute__ ((packed)) features_buf = { 0 };
3752         u64 incompat;
3753         int ret;
3754
3755         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3756                                 "rbd", "get_features",
3757                                 &snapid, sizeof (snapid),
3758                                 &features_buf, sizeof (features_buf));
3759         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3760         if (ret < 0)
3761                 return ret;
3762         if (ret < sizeof (features_buf))
3763                 return -ERANGE;
3764
3765         incompat = le64_to_cpu(features_buf.incompat);
3766         if (incompat & ~RBD_FEATURES_SUPPORTED)
3767                 return -ENXIO;
3768
3769         *snap_features = le64_to_cpu(features_buf.features);
3770
3771         dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3772                 (unsigned long long)snap_id,
3773                 (unsigned long long)*snap_features,
3774                 (unsigned long long)le64_to_cpu(features_buf.incompat));
3775
3776         return 0;
3777 }
3778
3779 static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3780 {
3781         return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3782                                                 &rbd_dev->header.features);
3783 }
3784
3785 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
3786 {
3787         struct rbd_spec *parent_spec;
3788         size_t size;
3789         void *reply_buf = NULL;
3790         __le64 snapid;
3791         void *p;
3792         void *end;
3793         u64 pool_id;
3794         char *image_id;
3795         u64 overlap;
3796         int ret;
3797
3798         parent_spec = rbd_spec_alloc();
3799         if (!parent_spec)
3800                 return -ENOMEM;
3801
3802         size = sizeof (__le64) +                                /* pool_id */
3803                 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +        /* image_id */
3804                 sizeof (__le64) +                               /* snap_id */
3805                 sizeof (__le64);                                /* overlap */
3806         reply_buf = kmalloc(size, GFP_KERNEL);
3807         if (!reply_buf) {
3808                 ret = -ENOMEM;
3809                 goto out_err;
3810         }
3811
3812         snapid = cpu_to_le64(CEPH_NOSNAP);
3813         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3814                                 "rbd", "get_parent",
3815                                 &snapid, sizeof (snapid),
3816                                 reply_buf, size);
3817         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3818         if (ret < 0)
3819                 goto out_err;
3820
3821         p = reply_buf;
3822         end = reply_buf + ret;
3823         ret = -ERANGE;
3824         ceph_decode_64_safe(&p, end, pool_id, out_err);
3825         if (pool_id == CEPH_NOPOOL) {
3826                 /*
3827                  * Either the parent never existed, or we have
3828                  * record of it but the image got flattened so it no
3829                  * longer has a parent.  When the parent of a
3830                  * layered image disappears we immediately set the
3831                  * overlap to 0.  The effect of this is that all new
3832                  * requests will be treated as if the image had no
3833                  * parent.
3834                  */
3835                 if (rbd_dev->parent_overlap) {
3836                         rbd_dev->parent_overlap = 0;
3837                         smp_mb();
3838                         rbd_dev_parent_put(rbd_dev);
3839                         pr_info("%s: clone image has been flattened\n",
3840                                 rbd_dev->disk->disk_name);
3841                 }
3842
3843                 goto out;       /* No parent?  No problem. */
3844         }
3845
3846         /* The ceph file layout needs to fit pool id in 32 bits */
3847
3848         ret = -EIO;
3849         if (pool_id > (u64)U32_MAX) {
3850                 rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
3851                         (unsigned long long)pool_id, U32_MAX);
3852                 goto out_err;
3853         }
3854         parent_spec->pool_id = pool_id;
3855
3856         image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3857         if (IS_ERR(image_id)) {
3858                 ret = PTR_ERR(image_id);
3859                 goto out_err;
3860         }
3861         parent_spec->image_id = image_id;
3862         ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
3863         ceph_decode_64_safe(&p, end, overlap, out_err);
3864
3865         if (overlap) {
3866                 rbd_spec_put(rbd_dev->parent_spec);
3867                 rbd_dev->parent_spec = parent_spec;
3868                 parent_spec = NULL;     /* rbd_dev now owns this */
3869                 rbd_dev->parent_overlap = overlap;
3870         } else {
3871                 rbd_warn(rbd_dev, "ignoring parent of clone with overlap 0\n");
3872         }
3873 out:
3874         ret = 0;
3875 out_err:
3876         kfree(reply_buf);
3877         rbd_spec_put(parent_spec);
3878
3879         return ret;
3880 }
3881
3882 static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3883 {
3884         struct {
3885                 __le64 stripe_unit;
3886                 __le64 stripe_count;
3887         } __attribute__ ((packed)) striping_info_buf = { 0 };
3888         size_t size = sizeof (striping_info_buf);
3889         void *p;
3890         u64 obj_size;
3891         u64 stripe_unit;
3892         u64 stripe_count;
3893         int ret;
3894
3895         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3896                                 "rbd", "get_stripe_unit_count", NULL, 0,
3897                                 (char *)&striping_info_buf, size);
3898         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3899         if (ret < 0)
3900                 return ret;
3901         if (ret < size)
3902                 return -ERANGE;
3903
3904         /*
3905          * We don't actually support the "fancy striping" feature
3906          * (STRIPINGV2) yet, but if the striping sizes are the
3907          * defaults the behavior is the same as before.  So find
3908          * out, and only fail if the image has non-default values.
3909          */
3910         ret = -EINVAL;
3911         obj_size = (u64)1 << rbd_dev->header.obj_order;
3912         p = &striping_info_buf;
3913         stripe_unit = ceph_decode_64(&p);
3914         if (stripe_unit != obj_size) {
3915                 rbd_warn(rbd_dev, "unsupported stripe unit "
3916                                 "(got %llu want %llu)",
3917                                 stripe_unit, obj_size);
3918                 return -EINVAL;
3919         }
3920         stripe_count = ceph_decode_64(&p);
3921         if (stripe_count != 1) {
3922                 rbd_warn(rbd_dev, "unsupported stripe count "
3923                                 "(got %llu want 1)", stripe_count);
3924                 return -EINVAL;
3925         }
3926         rbd_dev->header.stripe_unit = stripe_unit;
3927         rbd_dev->header.stripe_count = stripe_count;
3928
3929         return 0;
3930 }
3931
3932 static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3933 {
3934         size_t image_id_size;
3935         char *image_id;
3936         void *p;
3937         void *end;
3938         size_t size;
3939         void *reply_buf = NULL;
3940         size_t len = 0;
3941         char *image_name = NULL;
3942         int ret;
3943
3944         rbd_assert(!rbd_dev->spec->image_name);
3945
3946         len = strlen(rbd_dev->spec->image_id);
3947         image_id_size = sizeof (__le32) + len;
3948         image_id = kmalloc(image_id_size, GFP_KERNEL);
3949         if (!image_id)
3950                 return NULL;
3951
3952         p = image_id;
3953         end = image_id + image_id_size;
3954         ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
3955
3956         size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
3957         reply_buf = kmalloc(size, GFP_KERNEL);
3958         if (!reply_buf)
3959                 goto out;
3960
3961         ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
3962                                 "rbd", "dir_get_name",
3963                                 image_id, image_id_size,
3964                                 reply_buf, size);
3965         if (ret < 0)
3966                 goto out;
3967         p = reply_buf;
3968         end = reply_buf + ret;
3969
3970         image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
3971         if (IS_ERR(image_name))
3972                 image_name = NULL;
3973         else
3974                 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
3975 out:
3976         kfree(reply_buf);
3977         kfree(image_id);
3978
3979         return image_name;
3980 }
3981
3982 static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
3983 {
3984         struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3985         const char *snap_name;
3986         u32 which = 0;
3987
3988         /* Skip over names until we find the one we are looking for */
3989
3990         snap_name = rbd_dev->header.snap_names;
3991         while (which < snapc->num_snaps) {
3992                 if (!strcmp(name, snap_name))
3993                         return snapc->snaps[which];
3994                 snap_name += strlen(snap_name) + 1;
3995                 which++;
3996         }
3997         return CEPH_NOSNAP;
3998 }
3999
4000 static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4001 {
4002         struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4003         u32 which;
4004         bool found = false;
4005         u64 snap_id;
4006
4007         for (which = 0; !found && which < snapc->num_snaps; which++) {
4008                 const char *snap_name;
4009
4010                 snap_id = snapc->snaps[which];
4011                 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
4012                 if (IS_ERR(snap_name))
4013                         break;
4014                 found = !strcmp(name, snap_name);
4015                 kfree(snap_name);
4016         }
4017         return found ? snap_id : CEPH_NOSNAP;
4018 }
4019
4020 /*
4021  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
4022  * no snapshot by that name is found, or if an error occurs.
4023  */
4024 static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4025 {
4026         if (rbd_dev->image_format == 1)
4027                 return rbd_v1_snap_id_by_name(rbd_dev, name);
4028
4029         return rbd_v2_snap_id_by_name(rbd_dev, name);
4030 }
4031
4032 /*
4033  * When an rbd image has a parent image, it is identified by the
4034  * pool, image, and snapshot ids (not names).  This function fills
4035  * in the names for those ids.  (It's OK if we can't figure out the
4036  * name for an image id, but the pool and snapshot ids should always
4037  * exist and have names.)  All names in an rbd spec are dynamically
4038  * allocated.
4039  *
4040  * When an image being mapped (not a parent) is probed, we have the
4041  * pool name and pool id, image name and image id, and the snapshot
4042  * name.  The only thing we're missing is the snapshot id.
4043  */
4044 static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
4045 {
4046         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4047         struct rbd_spec *spec = rbd_dev->spec;
4048         const char *pool_name;
4049         const char *image_name;
4050         const char *snap_name;
4051         int ret;
4052
4053         /*
4054          * An image being mapped will have the pool name (etc.), but
4055          * we need to look up the snapshot id.
4056          */
4057         if (spec->pool_name) {
4058                 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
4059                         u64 snap_id;
4060
4061                         snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
4062                         if (snap_id == CEPH_NOSNAP)
4063                                 return -ENOENT;
4064                         spec->snap_id = snap_id;
4065                 } else {
4066                         spec->snap_id = CEPH_NOSNAP;
4067                 }
4068
4069                 return 0;
4070         }
4071
4072         /* Get the pool name; we have to make our own copy of this */
4073
4074         pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
4075         if (!pool_name) {
4076                 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
4077                 return -EIO;
4078         }
4079         pool_name = kstrdup(pool_name, GFP_KERNEL);
4080         if (!pool_name)
4081                 return -ENOMEM;
4082
4083         /* Fetch the image name; tolerate failure here */
4084
4085         image_name = rbd_dev_image_name(rbd_dev);
4086         if (!image_name)
4087                 rbd_warn(rbd_dev, "unable to get image name");
4088
4089         /* Look up the snapshot name, and make a copy */
4090
4091         snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
4092         if (!snap_name) {
4093                 ret = -ENOMEM;
4094                 goto out_err;
4095         }
4096
4097         spec->pool_name = pool_name;
4098         spec->image_name = image_name;
4099         spec->snap_name = snap_name;
4100
4101         return 0;
4102 out_err:
4103         kfree(image_name);
4104         kfree(pool_name);
4105
4106         return ret;
4107 }
4108
4109 static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
4110 {
4111         size_t size;
4112         int ret;
4113         void *reply_buf;
4114         void *p;
4115         void *end;
4116         u64 seq;
4117         u32 snap_count;
4118         struct ceph_snap_context *snapc;
4119         u32 i;
4120
4121         /*
4122          * We'll need room for the seq value (maximum snapshot id),
4123          * snapshot count, and array of that many snapshot ids.
4124          * For now we have a fixed upper limit on the number we're
4125          * prepared to receive.
4126          */
4127         size = sizeof (__le64) + sizeof (__le32) +
4128                         RBD_MAX_SNAP_COUNT * sizeof (__le64);
4129         reply_buf = kzalloc(size, GFP_KERNEL);
4130         if (!reply_buf)
4131                 return -ENOMEM;
4132
4133         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4134                                 "rbd", "get_snapcontext", NULL, 0,
4135                                 reply_buf, size);
4136         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4137         if (ret < 0)
4138                 goto out;
4139
4140         p = reply_buf;
4141         end = reply_buf + ret;
4142         ret = -ERANGE;
4143         ceph_decode_64_safe(&p, end, seq, out);
4144         ceph_decode_32_safe(&p, end, snap_count, out);
4145
4146         /*
4147          * Make sure the reported number of snapshot ids wouldn't go
4148          * beyond the end of our buffer.  But before checking that,
4149          * make sure the computed size of the snapshot context we
4150          * allocate is representable in a size_t.
4151          */
4152         if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
4153                                  / sizeof (u64)) {
4154                 ret = -EINVAL;
4155                 goto out;
4156         }
4157         if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
4158                 goto out;
4159         ret = 0;
4160
4161         snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
4162         if (!snapc) {
4163                 ret = -ENOMEM;
4164                 goto out;
4165         }
4166         snapc->seq = seq;
4167         for (i = 0; i < snap_count; i++)
4168                 snapc->snaps[i] = ceph_decode_64(&p);
4169
4170         ceph_put_snap_context(rbd_dev->header.snapc);
4171         rbd_dev->header.snapc = snapc;
4172
4173         dout("  snap context seq = %llu, snap_count = %u\n",
4174                 (unsigned long long)seq, (unsigned int)snap_count);
4175 out:
4176         kfree(reply_buf);
4177
4178         return ret;
4179 }
4180
4181 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
4182                                         u64 snap_id)
4183 {
4184         size_t size;
4185         void *reply_buf;
4186         __le64 snapid;
4187         int ret;
4188         void *p;
4189         void *end;
4190         char *snap_name;
4191
4192         size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4193         reply_buf = kmalloc(size, GFP_KERNEL);
4194         if (!reply_buf)
4195                 return ERR_PTR(-ENOMEM);
4196
4197         snapid = cpu_to_le64(snap_id);
4198         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4199                                 "rbd", "get_snapshot_name",
4200                                 &snapid, sizeof (snapid),
4201                                 reply_buf, size);
4202         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4203         if (ret < 0) {
4204                 snap_name = ERR_PTR(ret);
4205                 goto out;
4206         }
4207
4208         p = reply_buf;
4209         end = reply_buf + ret;
4210         snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4211         if (IS_ERR(snap_name))
4212                 goto out;
4213
4214         dout("  snap_id 0x%016llx snap_name = %s\n",
4215                 (unsigned long long)snap_id, snap_name);
4216 out:
4217         kfree(reply_buf);
4218
4219         return snap_name;
4220 }
4221
4222 static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
4223 {
4224         bool first_time = rbd_dev->header.object_prefix == NULL;
4225         int ret;
4226
4227         down_write(&rbd_dev->header_rwsem);
4228
4229         if (first_time) {
4230                 ret = rbd_dev_v2_header_onetime(rbd_dev);
4231                 if (ret)
4232                         goto out;
4233         }
4234
4235         /*
4236          * If the image supports layering, get the parent info.  We
4237          * need to probe the first time regardless.  Thereafter we
4238          * only need to if there's a parent, to see if it has
4239          * disappeared due to the mapped image getting flattened.
4240          */
4241         if (rbd_dev->header.features & RBD_FEATURE_LAYERING &&
4242                         (first_time || rbd_dev->parent_spec)) {
4243                 bool warn;
4244
4245                 ret = rbd_dev_v2_parent_info(rbd_dev);
4246                 if (ret)
4247                         goto out;
4248
4249                 /*
4250                  * Print a warning if this is the initial probe and
4251                  * the image has a parent.  Don't print it if the
4252                  * image now being probed is itself a parent.  We
4253                  * can tell at this point because we won't know its
4254                  * pool name yet (just its pool id).
4255                  */
4256                 warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name;
4257                 if (first_time && warn)
4258                         rbd_warn(rbd_dev, "WARNING: kernel layering "
4259                                         "is EXPERIMENTAL!");
4260         }
4261
4262         ret = rbd_dev_v2_image_size(rbd_dev);
4263         if (ret)
4264                 goto out;
4265
4266         if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
4267                 if (rbd_dev->mapping.size != rbd_dev->header.image_size)
4268                         rbd_dev->mapping.size = rbd_dev->header.image_size;
4269
4270         ret = rbd_dev_v2_snap_context(rbd_dev);
4271         dout("rbd_dev_v2_snap_context returned %d\n", ret);
4272 out:
4273         up_write(&rbd_dev->header_rwsem);
4274
4275         return ret;
4276 }
4277
4278 static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4279 {
4280         struct device *dev;
4281         int ret;
4282
4283         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4284
4285         dev = &rbd_dev->dev;
4286         dev->bus = &rbd_bus_type;
4287         dev->type = &rbd_device_type;
4288         dev->parent = &rbd_root_dev;
4289         dev->release = rbd_dev_device_release;
4290         dev_set_name(dev, "%d", rbd_dev->dev_id);
4291         ret = device_register(dev);
4292
4293         mutex_unlock(&ctl_mutex);
4294
4295         return ret;
4296 }
4297
4298 static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4299 {
4300         device_unregister(&rbd_dev->dev);
4301 }
4302
4303 static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
4304
4305 /*
4306  * Get a unique rbd identifier for the given new rbd_dev, and add
4307  * the rbd_dev to the global list.  The minimum rbd id is 1.
4308  */
4309 static void rbd_dev_id_get(struct rbd_device *rbd_dev)
4310 {
4311         rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
4312
4313         spin_lock(&rbd_dev_list_lock);
4314         list_add_tail(&rbd_dev->node, &rbd_dev_list);
4315         spin_unlock(&rbd_dev_list_lock);
4316         dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4317                 (unsigned long long) rbd_dev->dev_id);
4318 }
4319
4320 /*
4321  * Remove an rbd_dev from the global list, and record that its
4322  * identifier is no longer in use.
4323  */
4324 static void rbd_dev_id_put(struct rbd_device *rbd_dev)
4325 {
4326         struct list_head *tmp;
4327         int rbd_id = rbd_dev->dev_id;
4328         int max_id;
4329
4330         rbd_assert(rbd_id > 0);
4331
4332         dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4333                 (unsigned long long) rbd_dev->dev_id);
4334         spin_lock(&rbd_dev_list_lock);
4335         list_del_init(&rbd_dev->node);
4336
4337         /*
4338          * If the id being "put" is not the current maximum, there
4339          * is nothing special we need to do.
4340          */
4341         if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
4342                 spin_unlock(&rbd_dev_list_lock);
4343                 return;
4344         }
4345
4346         /*
4347          * We need to update the current maximum id.  Search the
4348          * list to find out what it is.  We're more likely to find
4349          * the maximum at the end, so search the list backward.
4350          */
4351         max_id = 0;
4352         list_for_each_prev(tmp, &rbd_dev_list) {
4353                 struct rbd_device *rbd_dev;
4354
4355                 rbd_dev = list_entry(tmp, struct rbd_device, node);
4356                 if (rbd_dev->dev_id > max_id)
4357                         max_id = rbd_dev->dev_id;
4358         }
4359         spin_unlock(&rbd_dev_list_lock);
4360
4361         /*
4362          * The max id could have been updated by rbd_dev_id_get(), in
4363          * which case it now accurately reflects the new maximum.
4364          * Be careful not to overwrite the maximum value in that
4365          * case.
4366          */
4367         atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4368         dout("  max dev id has been reset\n");
4369 }
4370
/*
 * Advance *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if
 * there is none).  Returns the length of the token found there,
 * i.e. the number of consecutive non-white space characters; *buf
 * is left pointing at the start of that token.  The string at *buf
 * must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
        /*
         * The characters for which isspace() is nonzero in the "C"
         * and "POSIX" locales.
         */
        static const char whitespace[] = " \f\n\r\t\v";
        const char *start = *buf + strspn(*buf, whitespace);

        *buf = start;                           /* Start of token */

        return strcspn(start, whitespace);      /* Token length */
}
4389
/*
 * Locate the next token in *buf and, provided it fits, copy it into
 * the caller's token buffer.  A copied token is always terminated
 * with '\0'.  The string at *buf must be terminated with '\0' on
 * entry.
 *
 * Returns the length of the token found, not counting the '\0'.
 * The result is 0 when there is no token, and is >= token_size when
 * the token (plus its terminator) does not fit; nothing is copied
 * in that case.
 *
 * On return *buf points just past the token that was found, whether
 * or not it was copied.
 */
static inline size_t copy_token(const char **buf,
                                char *token,
                                size_t token_size)
{
        size_t token_len = next_token(buf);

        if (token_len < token_size) {
                memcpy(token, *buf, token_len);
                token[token_len] = '\0';
        }
        *buf += token_len;

        return token_len;
}
4419
4420 /*
4421  * Finds the next token in *buf, dynamically allocates a buffer big
4422  * enough to hold a copy of it, and copies the token into the new
4423  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4424  * that a duplicate buffer is created even for a zero-length token.
4425  *
4426  * Returns a pointer to the newly-allocated duplicate, or a null
4427  * pointer if memory for the duplicate was not available.  If
4428  * the lenp argument is a non-null pointer, the length of the token
4429  * (not including the '\0') is returned in *lenp.
4430  *
4431  * If successful, the *buf pointer will be updated to point beyond
4432  * the end of the found token.
4433  *
4434  * Note: uses GFP_KERNEL for allocation.
4435  */
4436 static inline char *dup_token(const char **buf, size_t *lenp)
4437 {
4438         char *dup;
4439         size_t len;
4440
4441         len = next_token(buf);
4442         dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4443         if (!dup)
4444                 return NULL;
4445         *(dup + len) = '\0';
4446         *buf += len;
4447
4448         if (lenp)
4449                 *lenp = len;
4450
4451         return dup;
4452 }
4453
4454 /*
4455  * Parse the options provided for an "rbd add" (i.e., rbd image
4456  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4457  * and the data written is passed here via a NUL-terminated buffer.
4458  * Returns 0 if successful or an error code otherwise.
4459  *
4460  * The information extracted from these options is recorded in
4461  * the other parameters which return dynamically-allocated
4462  * structures:
4463  *  ceph_opts
4464  *      The address of a pointer that will refer to a ceph options
4465  *      structure.  Caller must release the returned pointer using
4466  *      ceph_destroy_options() when it is no longer needed.
4467  *  rbd_opts
4468  *      Address of an rbd options pointer.  Fully initialized by
4469  *      this function; caller must release with kfree().
4470  *  spec
4471  *      Address of an rbd image specification pointer.  Fully
4472  *      initialized by this function based on parsed options.
4473  *      Caller must release with rbd_spec_put().
4474  *
4475  * The options passed take this form:
4476  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4477  * where:
4478  *  <mon_addrs>
4479  *      A comma-separated list of one or more monitor addresses.
4480  *      A monitor address is an ip address, optionally followed
4481  *      by a port number (separated by a colon).
4482  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4483  *  <options>
4484  *      A comma-separated list of ceph and/or rbd options.
4485  *  <pool_name>
4486  *      The name of the rados pool containing the rbd image.
4487  *  <image_name>
4488  *      The name of the image in that pool to map.
4489  *  <snap_id>
4490  *      An optional snapshot id.  If provided, the mapping will
4491  *      present data from the image at the time that snapshot was
4492  *      created.  The image head is used if no snapshot id is
4493  *      provided.  Snapshot mappings are always read-only.
4494  */
4495 static int rbd_add_parse_args(const char *buf,
4496                                 struct ceph_options **ceph_opts,
4497                                 struct rbd_options **opts,
4498                                 struct rbd_spec **rbd_spec)
4499 {
4500         size_t len;
4501         char *options;
4502         const char *mon_addrs;
4503         char *snap_name;
4504         size_t mon_addrs_size;
4505         struct rbd_spec *spec = NULL;
4506         struct rbd_options *rbd_opts = NULL;
4507         struct ceph_options *copts;
4508         int ret;
4509
4510         /* The first four tokens are required */
4511
4512         len = next_token(&buf);
4513         if (!len) {
4514                 rbd_warn(NULL, "no monitor address(es) provided");
4515                 return -EINVAL;
4516         }
4517         mon_addrs = buf;
4518         mon_addrs_size = len + 1;
4519         buf += len;
4520
4521         ret = -EINVAL;
4522         options = dup_token(&buf, NULL);
4523         if (!options)
4524                 return -ENOMEM;
4525         if (!*options) {
4526                 rbd_warn(NULL, "no options provided");
4527                 goto out_err;
4528         }
4529
4530         spec = rbd_spec_alloc();
4531         if (!spec)
4532                 goto out_mem;
4533
4534         spec->pool_name = dup_token(&buf, NULL);
4535         if (!spec->pool_name)
4536                 goto out_mem;
4537         if (!*spec->pool_name) {
4538                 rbd_warn(NULL, "no pool name provided");
4539                 goto out_err;
4540         }
4541
4542         spec->image_name = dup_token(&buf, NULL);
4543         if (!spec->image_name)
4544                 goto out_mem;
4545         if (!*spec->image_name) {
4546                 rbd_warn(NULL, "no image name provided");
4547                 goto out_err;
4548         }
4549
4550         /*
4551          * Snapshot name is optional; default is to use "-"
4552          * (indicating the head/no snapshot).
4553          */
4554         len = next_token(&buf);
4555         if (!len) {
4556                 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
4557                 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4558         } else if (len > RBD_MAX_SNAP_NAME_LEN) {
4559                 ret = -ENAMETOOLONG;
4560                 goto out_err;
4561         }
4562         snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4563         if (!snap_name)
4564                 goto out_mem;
4565         *(snap_name + len) = '\0';
4566         spec->snap_name = snap_name;
4567
4568         /* Initialize all rbd options to the defaults */
4569
4570         rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
4571         if (!rbd_opts)
4572                 goto out_mem;
4573
4574         rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4575
4576         copts = ceph_parse_options(options, mon_addrs,
4577                                         mon_addrs + mon_addrs_size - 1,
4578                                         parse_rbd_opts_token, rbd_opts);
4579         if (IS_ERR(copts)) {
4580                 ret = PTR_ERR(copts);
4581                 goto out_err;
4582         }
4583         kfree(options);
4584
4585         *ceph_opts = copts;
4586         *opts = rbd_opts;
4587         *rbd_spec = spec;
4588
4589         return 0;
4590 out_mem:
4591         ret = -ENOMEM;
4592 out_err:
4593         kfree(rbd_opts);
4594         rbd_spec_put(spec);
4595         kfree(options);
4596
4597         return ret;
4598 }
4599
4600 /*
4601  * An rbd format 2 image has a unique identifier, distinct from the
4602  * name given to it by the user.  Internally, that identifier is
4603  * what's used to specify the names of objects related to the image.
4604  *
4605  * A special "rbd id" object is used to map an rbd image name to its
4606  * id.  If that object doesn't exist, then there is no v2 rbd image
4607  * with the supplied name.
4608  *
4609  * This function will record the given rbd_dev's image_id field if
4610  * it can be determined, and in that case will return 0.  If any
4611  * errors occur a negative errno will be returned and the rbd_dev's
4612  * image_id field will be unchanged (and should be NULL).
4613  */
4614 static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4615 {
4616         int ret;
4617         size_t size;
4618         char *object_name;
4619         void *response;
4620         char *image_id;
4621
4622         /*
4623          * When probing a parent image, the image id is already
4624          * known (and the image name likely is not).  There's no
4625          * need to fetch the image id again in this case.  We
4626          * do still need to set the image format though.
4627          */
4628         if (rbd_dev->spec->image_id) {
4629                 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4630
4631                 return 0;
4632         }
4633
4634         /*
4635          * First, see if the format 2 image id file exists, and if
4636          * so, get the image's persistent id from it.
4637          */
4638         size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4639         object_name = kmalloc(size, GFP_NOIO);
4640         if (!object_name)
4641                 return -ENOMEM;
4642         sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4643         dout("rbd id object name is %s\n", object_name);
4644
4645         /* Response will be an encoded string, which includes a length */
4646
4647         size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4648         response = kzalloc(size, GFP_NOIO);
4649         if (!response) {
4650                 ret = -ENOMEM;
4651                 goto out;
4652         }
4653
4654         /* If it doesn't exist we'll assume it's a format 1 image */
4655
4656         ret = rbd_obj_method_sync(rbd_dev, object_name,
4657                                 "rbd", "get_id", NULL, 0,
4658                                 response, RBD_IMAGE_ID_LEN_MAX);
4659         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4660         if (ret == -ENOENT) {
4661                 image_id = kstrdup("", GFP_KERNEL);
4662                 ret = image_id ? 0 : -ENOMEM;
4663                 if (!ret)
4664                         rbd_dev->image_format = 1;
4665         } else if (ret > sizeof (__le32)) {
4666                 void *p = response;
4667
4668                 image_id = ceph_extract_encoded_string(&p, p + ret,
4669                                                 NULL, GFP_NOIO);
4670                 ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
4671                 if (!ret)
4672                         rbd_dev->image_format = 2;
4673         } else {
4674                 ret = -EINVAL;
4675         }
4676
4677         if (!ret) {
4678                 rbd_dev->spec->image_id = image_id;
4679                 dout("image_id is %s\n", image_id);
4680         }
4681 out:
4682         kfree(response);
4683         kfree(object_name);
4684
4685         return ret;
4686 }
4687
4688 /* Undo whatever state changes are made by v1 or v2 image probe */
4689
4690 static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
4691 {
4692         struct rbd_image_header *header;
4693
4694         /* Drop parent reference unless it's already been done (or none) */
4695
4696         if (rbd_dev->parent_overlap)
4697                 rbd_dev_parent_put(rbd_dev);
4698
4699         /* Free dynamic fields from the header, then zero it out */
4700
4701         header = &rbd_dev->header;
4702         ceph_put_snap_context(header->snapc);
4703         kfree(header->snap_sizes);
4704         kfree(header->snap_names);
4705         kfree(header->object_prefix);
4706         memset(header, 0, sizeof (*header));
4707 }
4708
4709 static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
4710 {
4711         int ret;
4712
4713         ret = rbd_dev_v2_object_prefix(rbd_dev);
4714         if (ret)
4715                 goto out_err;
4716
4717         /*
4718          * Get the and check features for the image.  Currently the
4719          * features are assumed to never change.
4720          */
4721         ret = rbd_dev_v2_features(rbd_dev);
4722         if (ret)
4723                 goto out_err;
4724
4725         /* If the image supports fancy striping, get its parameters */
4726
4727         if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4728                 ret = rbd_dev_v2_striping_info(rbd_dev);
4729                 if (ret < 0)
4730                         goto out_err;
4731         }
4732         /* No support for crypto and compression type format 2 images */
4733
4734         return 0;
4735 out_err:
4736         rbd_dev->header.features = 0;
4737         kfree(rbd_dev->header.object_prefix);
4738         rbd_dev->header.object_prefix = NULL;
4739
4740         return ret;
4741 }
4742
4743 static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
4744 {
4745         struct rbd_device *parent = NULL;
4746         struct rbd_spec *parent_spec;
4747         struct rbd_client *rbdc;
4748         int ret;
4749
4750         if (!rbd_dev->parent_spec)
4751                 return 0;
4752         /*
4753          * We need to pass a reference to the client and the parent
4754          * spec when creating the parent rbd_dev.  Images related by
4755          * parent/child relationships always share both.
4756          */
4757         parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4758         rbdc = __rbd_get_client(rbd_dev->rbd_client);
4759
4760         ret = -ENOMEM;
4761         parent = rbd_dev_create(rbdc, parent_spec);
4762         if (!parent)
4763                 goto out_err;
4764
4765         ret = rbd_dev_image_probe(parent, false);
4766         if (ret < 0)
4767                 goto out_err;
4768         rbd_dev->parent = parent;
4769         atomic_set(&rbd_dev->parent_ref, 1);
4770
4771         return 0;
4772 out_err:
4773         if (parent) {
4774                 rbd_dev_unparent(rbd_dev);
4775                 kfree(rbd_dev->header_name);
4776                 rbd_dev_destroy(parent);
4777         } else {
4778                 rbd_put_client(rbdc);
4779                 rbd_spec_put(parent_spec);
4780         }
4781
4782         return ret;
4783 }
4784
4785 static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
4786 {
4787         int ret;
4788
4789         /* generate unique id: find highest unique id, add one */
4790         rbd_dev_id_get(rbd_dev);
4791
4792         /* Fill in the device name, now that we have its id. */
4793         BUILD_BUG_ON(DEV_NAME_LEN
4794                         < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
4795         sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
4796
4797         /* Get our block major device number. */
4798
4799         ret = register_blkdev(0, rbd_dev->name);
4800         if (ret < 0)
4801                 goto err_out_id;
4802         rbd_dev->major = ret;
4803
4804         /* Set up the blkdev mapping. */
4805
4806         ret = rbd_init_disk(rbd_dev);
4807         if (ret)
4808                 goto err_out_blkdev;
4809
4810         ret = rbd_dev_mapping_set(rbd_dev);
4811         if (ret)
4812                 goto err_out_disk;
4813         set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
4814
4815         ret = rbd_bus_add_dev(rbd_dev);
4816         if (ret)
4817                 goto err_out_mapping;
4818
4819         /* Everything's ready.  Announce the disk to the world. */
4820
4821         set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4822         add_disk(rbd_dev->disk);
4823
4824         pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
4825                 (unsigned long long) rbd_dev->mapping.size);
4826
4827         return ret;
4828
4829 err_out_mapping:
4830         rbd_dev_mapping_clear(rbd_dev);
4831 err_out_disk:
4832         rbd_free_disk(rbd_dev);
4833 err_out_blkdev:
4834         unregister_blkdev(rbd_dev->major, rbd_dev->name);
4835 err_out_id:
4836         rbd_dev_id_put(rbd_dev);
4837         rbd_dev_mapping_clear(rbd_dev);
4838
4839         return ret;
4840 }
4841
4842 static int rbd_dev_header_name(struct rbd_device *rbd_dev)
4843 {
4844         struct rbd_spec *spec = rbd_dev->spec;
4845         size_t size;
4846
4847         /* Record the header object name for this rbd image. */
4848
4849         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4850
4851         if (rbd_dev->image_format == 1)
4852                 size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
4853         else
4854                 size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
4855
4856         rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4857         if (!rbd_dev->header_name)
4858                 return -ENOMEM;
4859
4860         if (rbd_dev->image_format == 1)
4861                 sprintf(rbd_dev->header_name, "%s%s",
4862                         spec->image_name, RBD_SUFFIX);
4863         else
4864                 sprintf(rbd_dev->header_name, "%s%s",
4865                         RBD_HEADER_PREFIX, spec->image_id);
4866         return 0;
4867 }
4868
4869 static void rbd_dev_image_release(struct rbd_device *rbd_dev)
4870 {
4871         rbd_dev_unprobe(rbd_dev);
4872         kfree(rbd_dev->header_name);
4873         rbd_dev->header_name = NULL;
4874         rbd_dev->image_format = 0;
4875         kfree(rbd_dev->spec->image_id);
4876         rbd_dev->spec->image_id = NULL;
4877
4878         rbd_dev_destroy(rbd_dev);
4879 }
4880
/*
 * Probe for the existence of the header object for the given rbd
 * device.  If this image is the one being mapped (i.e., not a
 * parent), initiate a watch on its header object before using that
 * object to get detailed information about the rbd image.
 *
 * The probe determines the image id and format, builds the header
 * object name, reads the format 1 or format 2 header, updates the
 * image spec, and finally probes the parent image (if any).
 *
 * Returns 0 on success.  On failure the error of the failing step
 * is returned after undoing the steps that had completed: header
 * state, the watch (only if "mapping" is true), the header name,
 * and the image format and image id.
 */
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
{
        int ret;
        int tmp;

        /*
         * Get the id from the image id object.  If it's not a
         * format 2 image, we'll get ENOENT back, and we'll assume
         * it's a format 1 image.
         */
        ret = rbd_dev_image_id(rbd_dev);
        if (ret)
                return ret;
        rbd_assert(rbd_dev->spec->image_id);
        rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

        ret = rbd_dev_header_name(rbd_dev);
        if (ret)
                goto err_out_format;

        /* Only the image being mapped gets a watch on its header */
        if (mapping) {
                ret = rbd_dev_header_watch_sync(rbd_dev, true);
                if (ret)
                        goto out_header_name;
        }

        if (rbd_dev->image_format == 1)
                ret = rbd_dev_v1_header_info(rbd_dev);
        else
                ret = rbd_dev_v2_header_info(rbd_dev);
        if (ret)
                goto err_out_watch;

        ret = rbd_dev_spec_update(rbd_dev);
        if (ret)
                goto err_out_probe;

        ret = rbd_dev_probe_parent(rbd_dev);
        if (ret)
                goto err_out_probe;

        dout("discovered format %u image, header name is %s\n",
                rbd_dev->image_format, rbd_dev->header_name);

        return 0;
        /* Error unwinding: each label undoes one more earlier step */
err_out_probe:
        rbd_dev_unprobe(rbd_dev);
err_out_watch:
        if (mapping) {
                /* Tear-down failure is only reported; "ret" is kept */
                tmp = rbd_dev_header_watch_sync(rbd_dev, false);
                if (tmp)
                        rbd_warn(rbd_dev, "unable to tear down "
                                        "watch request (%d)\n", tmp);
        }
out_header_name:
        kfree(rbd_dev->header_name);
        rbd_dev->header_name = NULL;
err_out_format:
        rbd_dev->image_format = 0;
        kfree(rbd_dev->spec->image_id);
        rbd_dev->spec->image_id = NULL;

        dout("probe failed, returning %d\n", ret);

        return ret;
}
4953
4954 static ssize_t rbd_add(struct bus_type *bus,
4955                        const char *buf,
4956                        size_t count)
4957 {
4958         struct rbd_device *rbd_dev = NULL;
4959         struct ceph_options *ceph_opts = NULL;
4960         struct rbd_options *rbd_opts = NULL;
4961         struct rbd_spec *spec = NULL;
4962         struct rbd_client *rbdc;
4963         struct ceph_osd_client *osdc;
4964         bool read_only;
4965         int rc = -ENOMEM;
4966
4967         if (!try_module_get(THIS_MODULE))
4968                 return -ENODEV;
4969
4970         /* parse add command */
4971         rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4972         if (rc < 0)
4973                 goto err_out_module;
4974         read_only = rbd_opts->read_only;
4975         kfree(rbd_opts);
4976         rbd_opts = NULL;        /* done with this */
4977
4978         rbdc = rbd_get_client(ceph_opts);
4979         if (IS_ERR(rbdc)) {
4980                 rc = PTR_ERR(rbdc);
4981                 goto err_out_args;
4982         }
4983         ceph_opts = NULL;       /* rbd_dev client now owns this */
4984
4985         /* pick the pool */
4986         osdc = &rbdc->client->osdc;
4987         rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4988         if (rc < 0)
4989                 goto err_out_client;
4990         spec->pool_id = (u64)rc;
4991
4992         /* The ceph file layout needs to fit pool id in 32 bits */
4993
4994         if (spec->pool_id > (u64)U32_MAX) {
4995                 rbd_warn(NULL, "pool id too large (%llu > %u)\n",
4996                                 (unsigned long long)spec->pool_id, U32_MAX);
4997                 rc = -EIO;
4998                 goto err_out_client;
4999         }
5000
5001         rbd_dev = rbd_dev_create(rbdc, spec);
5002         if (!rbd_dev)
5003                 goto err_out_client;
5004         rbdc = NULL;            /* rbd_dev now owns this */
5005         spec = NULL;            /* rbd_dev now owns this */
5006
5007         rc = rbd_dev_image_probe(rbd_dev, true);
5008         if (rc < 0)
5009                 goto err_out_rbd_dev;
5010
5011         /* If we are mapping a snapshot it must be marked read-only */
5012
5013         if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
5014                 read_only = true;
5015         rbd_dev->mapping.read_only = read_only;
5016
5017         rc = rbd_dev_device_setup(rbd_dev);
5018         if (!rc)
5019                 return count;
5020
5021         rbd_dev_image_release(rbd_dev);
5022 err_out_rbd_dev:
5023         rbd_dev_destroy(rbd_dev);
5024 err_out_client:
5025         rbd_put_client(rbdc);
5026 err_out_args:
5027         if (ceph_opts)
5028                 ceph_destroy_options(ceph_opts);
5029         kfree(rbd_opts);
5030         rbd_spec_put(spec);
5031 err_out_module:
5032         module_put(THIS_MODULE);
5033
5034         dout("Error adding device %s\n", buf);
5035
5036         return (ssize_t)rc;
5037 }
5038
5039 static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
5040 {
5041         struct list_head *tmp;
5042         struct rbd_device *rbd_dev;
5043
5044         spin_lock(&rbd_dev_list_lock);
5045         list_for_each(tmp, &rbd_dev_list) {
5046                 rbd_dev = list_entry(tmp, struct rbd_device, node);
5047                 if (rbd_dev->dev_id == dev_id) {
5048                         spin_unlock(&rbd_dev_list_lock);
5049                         return rbd_dev;
5050                 }
5051         }
5052         spin_unlock(&rbd_dev_list_lock);
5053         return NULL;
5054 }
5055
5056 static void rbd_dev_device_release(struct device *dev)
5057 {
5058         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5059
5060         rbd_free_disk(rbd_dev);
5061         clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5062         rbd_dev_mapping_clear(rbd_dev);
5063         unregister_blkdev(rbd_dev->major, rbd_dev->name);
5064         rbd_dev->major = 0;
5065         rbd_dev_id_put(rbd_dev);
5066         rbd_dev_mapping_clear(rbd_dev);
5067 }
5068
5069 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
5070 {
5071         while (rbd_dev->parent) {
5072                 struct rbd_device *first = rbd_dev;
5073                 struct rbd_device *second = first->parent;
5074                 struct rbd_device *third;
5075
5076                 /*
5077                  * Follow to the parent with no grandparent and
5078                  * remove it.
5079                  */
5080                 while (second && (third = second->parent)) {
5081                         first = second;
5082                         second = third;
5083                 }
5084                 rbd_assert(second);
5085                 rbd_dev_image_release(second);
5086                 first->parent = NULL;
5087                 first->parent_overlap = 0;
5088
5089                 rbd_assert(first->parent_spec);
5090                 rbd_spec_put(first->parent_spec);
5091                 first->parent_spec = NULL;
5092         }
5093 }
5094
5095 static ssize_t rbd_remove(struct bus_type *bus,
5096                           const char *buf,
5097                           size_t count)
5098 {
5099         struct rbd_device *rbd_dev = NULL;
5100         int target_id;
5101         unsigned long ul;
5102         int ret;
5103
5104         ret = strict_strtoul(buf, 10, &ul);
5105         if (ret)
5106                 return ret;
5107
5108         /* convert to int; abort if we lost anything in the conversion */
5109         target_id = (int) ul;
5110         if (target_id != ul)
5111                 return -EINVAL;
5112
5113         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
5114
5115         rbd_dev = __rbd_get_dev(target_id);
5116         if (!rbd_dev) {
5117                 ret = -ENOENT;
5118                 goto done;
5119         }
5120
5121         spin_lock_irq(&rbd_dev->lock);
5122         if (rbd_dev->open_count)
5123                 ret = -EBUSY;
5124         else
5125                 set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
5126         spin_unlock_irq(&rbd_dev->lock);
5127         if (ret < 0)
5128                 goto done;
5129         rbd_bus_del_dev(rbd_dev);
5130         ret = rbd_dev_header_watch_sync(rbd_dev, false);
5131         if (ret)
5132                 rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
5133         rbd_dev_image_release(rbd_dev);
5134         module_put(THIS_MODULE);
5135         ret = count;
5136 done:
5137         mutex_unlock(&ctl_mutex);
5138
5139         return ret;
5140 }
5141
5142 /*
5143  * create control files in sysfs
5144  * /sys/bus/rbd/...
5145  */
5146 static int rbd_sysfs_init(void)
5147 {
5148         int ret;
5149
5150         ret = device_register(&rbd_root_dev);
5151         if (ret < 0)
5152                 return ret;
5153
5154         ret = bus_register(&rbd_bus_type);
5155         if (ret < 0)
5156                 device_unregister(&rbd_root_dev);
5157
5158         return ret;
5159 }
5160
5161 static void rbd_sysfs_cleanup(void)
5162 {
5163         bus_unregister(&rbd_bus_type);
5164         device_unregister(&rbd_root_dev);
5165 }
5166
5167 static int rbd_slab_init(void)
5168 {
5169         rbd_assert(!rbd_img_request_cache);
5170         rbd_img_request_cache = kmem_cache_create("rbd_img_request",
5171                                         sizeof (struct rbd_img_request),
5172                                         __alignof__(struct rbd_img_request),
5173                                         0, NULL);
5174         if (!rbd_img_request_cache)
5175                 return -ENOMEM;
5176
5177         rbd_assert(!rbd_obj_request_cache);
5178         rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5179                                         sizeof (struct rbd_obj_request),
5180                                         __alignof__(struct rbd_obj_request),
5181                                         0, NULL);
5182         if (!rbd_obj_request_cache)
5183                 goto out_err;
5184
5185         rbd_assert(!rbd_segment_name_cache);
5186         rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
5187                                         MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL);
5188         if (rbd_segment_name_cache)
5189                 return 0;
5190 out_err:
5191         if (rbd_obj_request_cache) {
5192                 kmem_cache_destroy(rbd_obj_request_cache);
5193                 rbd_obj_request_cache = NULL;
5194         }
5195
5196         kmem_cache_destroy(rbd_img_request_cache);
5197         rbd_img_request_cache = NULL;
5198
5199         return -ENOMEM;
5200 }
5201
5202 static void rbd_slab_exit(void)
5203 {
5204         rbd_assert(rbd_segment_name_cache);
5205         kmem_cache_destroy(rbd_segment_name_cache);
5206         rbd_segment_name_cache = NULL;
5207
5208         rbd_assert(rbd_obj_request_cache);
5209         kmem_cache_destroy(rbd_obj_request_cache);
5210         rbd_obj_request_cache = NULL;
5211
5212         rbd_assert(rbd_img_request_cache);
5213         kmem_cache_destroy(rbd_img_request_cache);
5214         rbd_img_request_cache = NULL;
5215 }
5216
5217 static int __init rbd_init(void)
5218 {
5219         int rc;
5220
5221         if (!libceph_compatible(NULL)) {
5222                 rbd_warn(NULL, "libceph incompatibility (quitting)");
5223
5224                 return -EINVAL;
5225         }
5226         rc = rbd_slab_init();
5227         if (rc)
5228                 return rc;
5229         rc = rbd_sysfs_init();
5230         if (rc)
5231                 rbd_slab_exit();
5232         else
5233                 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
5234
5235         return rc;
5236 }
5237
5238 static void __exit rbd_exit(void)
5239 {
5240         rbd_sysfs_cleanup();
5241         rbd_slab_exit();
5242 }
5243
/* Hook rbd_init()/rbd_exit() up as the module's load and unload handlers */
module_init(rbd_init);
module_exit(rbd_exit);

/* Module metadata */
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");