/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
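
/*
 * A worked example of the interface described in that document (the
 * monitor address, credentials, and image name below are illustrative
 * only): an image is mapped by writing a one-line description of it
 * to the bus "add" attribute, and unmapped by writing its device id
 * to "remove":
 *
 *      # echo "1.2.3.4:6789 name=admin rbd myimage -" > /sys/bus/rbd/add
 *      # echo 0 > /sys/bus/rbd/remove
 *
 * The trailing "-" maps the image head (RBD_SNAP_HEAD_NAME below)
 * rather than a named snapshot.
 */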

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>

#include "rbd_types.h"

#define RBD_DEBUG       /* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define SECTOR_SHIFT    9
#define SECTOR_SIZE     (1ULL << SECTOR_SHIFT)

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR    256             /* max minors per blkdev */

#define RBD_SNAP_DEV_NAME_PREFIX        "snap_"
#define RBD_MAX_SNAP_NAME_LEN   \
                        (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
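
/*
 * Note that sizeof (RBD_SNAP_DEV_NAME_PREFIX) counts the string's
 * trailing NUL, so the "- 1" above leaves room for exactly
 * NAME_MAX - strlen(RBD_SNAP_DEV_NAME_PREFIX) name characters.
 */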

#define RBD_MAX_SNAP_COUNT      510     /* allows max snapc to fit in 4KB */
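
/*
 * A rough check of the arithmetic behind that limit: each snapshot
 * id is a __le64, so 510 of them occupy 4080 bytes, leaving 16 bytes
 * of a 4KB page for the fixed fields of the snapshot context.
 */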

#define RBD_SNAP_HEAD_NAME      "-"

#define BAD_SNAP_INDEX  U32_MAX         /* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX  (PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX    64

#define RBD_OBJ_PREFIX_LEN_MAX  64

/* Feature bits */

#define RBD_FEATURE_LAYERING    (1<<0)
#define RBD_FEATURE_STRIPINGV2  (1<<1)
#define RBD_FEATURES_ALL \
            (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED  (RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used to ensure DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN            32
#define MAX_INT_FORMAT_WIDTH    ((5 * sizeof (int)) / 2 + 1)

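/*
 * Why (5 * sizeof (int)) / 2 + 1 is enough: an N-byte integer needs
 * at most N * 8 * log10(2) (about N * 2.41) decimal digits, which
 * 5/2 = 2.5 digits per byte overestimates; the + 1 covers a sign.
 * For a 4-byte int that gives 11 characters (10 digits plus sign).
 */
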
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
        /* These four fields never change for a given rbd image */
        char *object_prefix;
        u64 features;
        __u8 obj_order;
        __u8 crypt_type;
        __u8 comp_type;

        /* The remaining fields need to be updated occasionally */
        u64 image_size;
        struct ceph_snap_context *snapc;
        char *snap_names;
        u64 *snap_sizes;

        u64 stripe_unit;
        u64 stripe_count;
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the ids in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the ids associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the id tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer refers to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable, so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
        u64             pool_id;
        const char      *pool_name;

        const char      *image_id;
        const char      *image_name;

        u64             snap_id;
        const char      *snap_name;

        struct kref     kref;
};
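
/*
 * For illustration, a fully populated spec for a mapping of snapshot
 * "snap1" of an image "foo" in pool "rbd" might look like this (all
 * of the values here are made up):
 *
 *      pool_id = 2,                    pool_name = "rbd"
 *      image_id = "10074b0dc51d",      image_name = "foo"
 *      snap_id = 4,                    snap_name = "snap1"
 *
 * A mapping of the image head instead uses snap_id == CEPH_NOSNAP
 * and the snap_name "-" (RBD_SNAP_HEAD_NAME).
 */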

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
        struct ceph_client      *client;
        struct kref             kref;
        struct list_head        node;
};

struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define BAD_WHICH       U32_MAX         /* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

enum obj_request_type {
        OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};

enum obj_req_flags {
        OBJ_REQ_DONE,           /* completion flag: not done = 0, done = 1 */
        OBJ_REQ_IMG_DATA,       /* object usage: standalone = 0, image = 1 */
        OBJ_REQ_KNOWN,          /* EXISTS flag valid: no = 0, yes = 1 */
        OBJ_REQ_EXISTS,         /* target exists: no = 0, yes = 1 */
};

struct rbd_obj_request {
        const char              *object_name;
        u64                     offset;         /* object start byte */
        u64                     length;         /* bytes from offset */
        unsigned long           flags;

        /*
         * An object request associated with an image will have its
         * img_data flag set; a standalone object request will not.
         *
         * A standalone object request will have which == BAD_WHICH
         * and a null obj_request pointer.
         *
         * An object request initiated in support of a layered image
         * object (to check for its existence before a write) will
         * have which == BAD_WHICH and a non-null obj_request pointer.
         *
         * Finally, an object request for rbd image data will have
         * which != BAD_WHICH, and will have a non-null img_request
         * pointer.  The value of which will be in the range
         * 0..(img_request->obj_request_count-1).
         */
        union {
                struct rbd_obj_request  *obj_request;   /* STAT op */
                struct {
                        struct rbd_img_request  *img_request;
                        u64                     img_offset;
                        /* links for img_request->obj_requests list */
                        struct list_head        links;
                };
        };
        u32                     which;          /* posn image request list */

        enum obj_request_type   type;
        union {
                struct bio      *bio_list;
                struct {
                        struct page     **pages;
                        u32             page_count;
                };
        };
        struct page             **copyup_pages;

        struct ceph_osd_request *osd_req;

        u64                     xferred;        /* bytes transferred */
        int                     result;

        rbd_obj_callback_t      callback;
        struct completion       completion;

        struct kref             kref;
};

enum img_req_flags {
        IMG_REQ_WRITE,          /* I/O direction: read = 0, write = 1 */
        IMG_REQ_CHILD,          /* initiator: block = 0, child image = 1 */
        IMG_REQ_LAYERED,        /* ENOENT handling: normal = 0, layered = 1 */
};

struct rbd_img_request {
        struct rbd_device       *rbd_dev;
        u64                     offset; /* starting image byte offset */
        u64                     length; /* byte count from offset */
        unsigned long           flags;
        union {
                u64                     snap_id;        /* for reads */
                struct ceph_snap_context *snapc;        /* for writes */
        };
        union {
                struct request          *rq;            /* block request */
                struct rbd_obj_request  *obj_request;   /* obj req initiator */
        };
        struct page             **copyup_pages;
        spinlock_t              completion_lock;/* protects next_completion */
        u32                     next_completion;
        rbd_img_callback_t      callback;
        u64                     xferred;/* aggregate bytes transferred */
        int                     result; /* first nonzero obj_request result */

        u32                     obj_request_count;
        struct list_head        obj_requests;   /* rbd_obj_request structs */

        struct kref             kref;
};

#define for_each_obj_request(ireq, oreq) \
        list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
        list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
        list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
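
/*
 * Typical use of the iterators above, as in the completion path
 * further down, which sums the bytes transferred by each of an
 * image request's object requests:
 *
 *      struct rbd_obj_request *obj_request;
 *      u64 xferred = 0;
 *
 *      for_each_obj_request(img_request, obj_request)
 *              xferred += obj_request->xferred;
 */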

struct rbd_mapping {
        u64                     size;
        u64                     features;
        bool                    read_only;
};

/*
 * a single device
 */
struct rbd_device {
        int                     dev_id;         /* blkdev unique id */

        int                     major;          /* blkdev assigned major */
        struct gendisk          *disk;          /* blkdev's gendisk and rq */

        u32                     image_format;   /* Either 1 or 2 */
        struct rbd_client       *rbd_client;

        char                    name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

        spinlock_t              lock;           /* queue, flags, open_count */

        struct rbd_image_header header;
        unsigned long           flags;          /* possibly lock protected */
        struct rbd_spec         *spec;

        char                    *header_name;

        struct ceph_file_layout layout;

        struct ceph_osd_event   *watch_event;
        struct rbd_obj_request  *watch_request;

        struct rbd_spec         *parent_spec;
        u64                     parent_overlap;
        struct rbd_device       *parent;

        /* protects updating the header */
        struct rw_semaphore     header_rwsem;

        struct rbd_mapping      mapping;

        struct list_head        node;

        /* sysfs related */
        struct device           dev;
        unsigned long           open_count;     /* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 */
enum rbd_dev_flags {
        RBD_DEV_FLAG_EXISTS,    /* mapped snapshot has not been deleted */
        RBD_DEV_FLAG_REMOVING,  /* this mapping is being removed */
};

static DEFINE_MUTEX(ctl_mutex);   /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);              /* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache        *rbd_img_request_cache;
static struct kmem_cache        *rbd_obj_request_cache;
static struct kmem_cache        *rbd_segment_name_cache;

static int rbd_img_request_submit(struct rbd_img_request *img_request);

static void rbd_dev_device_release(struct device *dev);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
                       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
                          size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev);

static struct bus_attribute rbd_bus_attrs[] = {
        __ATTR(add, S_IWUSR, NULL, rbd_add),
        __ATTR(remove, S_IWUSR, NULL, rbd_remove),
        __ATTR_NULL
};

static struct bus_type rbd_bus_type = {
        .name           = "rbd",
        .bus_attrs      = rbd_bus_attrs,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
        .init_name =    "rbd",
        .release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;

        if (!rbd_dev)
                printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
        else if (rbd_dev->disk)
                printk(KERN_WARNING "%s: %s: %pV\n",
                        RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
        else if (rbd_dev->spec && rbd_dev->spec->image_name)
                printk(KERN_WARNING "%s: image %s: %pV\n",
                        RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
        else if (rbd_dev->spec && rbd_dev->spec->image_id)
                printk(KERN_WARNING "%s: id %s: %pV\n",
                        RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
        else    /* punt */
                printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
                        RBD_DRV_NAME, rbd_dev, &vaf);
        va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)                                                \
                if (unlikely(!(expr))) {                                \
                        printk(KERN_ERR "\nAssertion failure in %s() "  \
                                                "at line %d:\n\n"       \
                                        "\trbd_assert(%s);\n\n",        \
                                        __func__, __LINE__, #expr);     \
                        BUG();                                          \
                }
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)      ((void) 0)
#endif /* !RBD_DEBUG */

static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
                                        u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
                                u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
                u64 *snap_features);
static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
        struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
        bool removing = false;

        if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
                return -EROFS;

        spin_lock_irq(&rbd_dev->lock);
        if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
                removing = true;
        else
                rbd_dev->open_count++;
        spin_unlock_irq(&rbd_dev->lock);
        if (removing)
                return -ENOENT;

        mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
        (void) get_device(&rbd_dev->dev);
        set_device_ro(bdev, rbd_dev->mapping.read_only);
        mutex_unlock(&ctl_mutex);

        return 0;
}

static int rbd_release(struct gendisk *disk, fmode_t mode)
{
        struct rbd_device *rbd_dev = disk->private_data;
        unsigned long open_count_before;

        spin_lock_irq(&rbd_dev->lock);
        open_count_before = rbd_dev->open_count--;
        spin_unlock_irq(&rbd_dev->lock);
        rbd_assert(open_count_before > 0);

        mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
        put_device(&rbd_dev->dev);
        mutex_unlock(&ctl_mutex);

        return 0;
}

static const struct block_device_operations rbd_bd_ops = {
        .owner                  = THIS_MODULE,
        .open                   = rbd_open,
        .release                = rbd_release,
};

/*
 * Initialize an rbd client instance.
 * We own *ceph_opts.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
        struct rbd_client *rbdc;
        int ret = -ENOMEM;

        dout("%s:\n", __func__);
        rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
        if (!rbdc)
                goto out_opt;

        kref_init(&rbdc->kref);
        INIT_LIST_HEAD(&rbdc->node);

        mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

        rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
        if (IS_ERR(rbdc->client)) {
                ret = PTR_ERR(rbdc->client);    /* propagate the real error */
                goto out_mutex;
        }
        ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

        ret = ceph_open_session(rbdc->client);
        if (ret < 0)
                goto out_err;

        spin_lock(&rbd_client_list_lock);
        list_add_tail(&rbdc->node, &rbd_client_list);
        spin_unlock(&rbd_client_list_lock);

        mutex_unlock(&ctl_mutex);
        dout("%s: rbdc %p\n", __func__, rbdc);

        return rbdc;

out_err:
        ceph_destroy_client(rbdc->client);
out_mutex:
        mutex_unlock(&ctl_mutex);
        kfree(rbdc);
out_opt:
        if (ceph_opts)
                ceph_destroy_options(ceph_opts);
        dout("%s: error %d\n", __func__, ret);

        return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
        kref_get(&rbdc->kref);

        return rbdc;
}

/*
 * Find a ceph client with a specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
        struct rbd_client *client_node;
        bool found = false;

        if (ceph_opts->flags & CEPH_OPT_NOSHARE)
                return NULL;

        spin_lock(&rbd_client_list_lock);
        list_for_each_entry(client_node, &rbd_client_list, node) {
                if (!ceph_compare_options(ceph_opts, client_node->client)) {
                        __rbd_get_client(client_node);

                        found = true;
                        break;
                }
        }
        spin_unlock(&rbd_client_list_lock);

        return found ? client_node : NULL;
}

/*
 * mount options
 */
enum {
        Opt_last_int,
        /* int args above */
        Opt_last_string,
        /* string args above */
        Opt_read_only,
        Opt_read_write,
        /* Boolean args above */
        Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
        /* int args above */
        /* string args above */
        {Opt_read_only, "read_only"},
        {Opt_read_only, "ro"},          /* Alternate spelling */
        {Opt_read_write, "read_write"},
        {Opt_read_write, "rw"},         /* Alternate spelling */
        /* Boolean args above */
        {-1, NULL}
};

struct rbd_options {
        bool    read_only;
};

#define RBD_READ_ONLY_DEFAULT   false

static int parse_rbd_opts_token(char *c, void *private)
{
        struct rbd_options *rbd_opts = private;
        substring_t argstr[MAX_OPT_ARGS];
        int token, intval, ret;

        token = match_token(c, rbd_opts_tokens, argstr);
        if (token < 0)
                return -EINVAL;

        if (token < Opt_last_int) {
                ret = match_int(&argstr[0], &intval);
                if (ret < 0) {
                        pr_err("bad mount option arg (not int) "
                               "at '%s'\n", c);
                        return ret;
                }
                dout("got int token %d val %d\n", token, intval);
        } else if (token > Opt_last_int && token < Opt_last_string) {
                dout("got string token %d val %s\n", token,
                     argstr[0].from);
        } else if (token > Opt_last_string && token < Opt_last_bool) {
                dout("got Boolean token %d\n", token);
        } else {
                dout("got token %d\n", token);
        }

        switch (token) {
        case Opt_read_only:
                rbd_opts->read_only = true;
                break;
        case Opt_read_write:
                rbd_opts->read_only = false;
                break;
        default:
                rbd_assert(false);
                break;
        }
        return 0;
}
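
/*
 * So, for example, parse_rbd_opts_token("ro", &opts) sets
 * opts.read_only to true and parse_rbd_opts_token("rw", &opts)
 * clears it, while any unrecognized token matches the {-1, NULL}
 * catch-all entry above and makes the parse fail with -EINVAL.
 */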

/*
 * Get a ceph client with a specific addr and configuration; if one
 * does not exist, create it.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
        struct rbd_client *rbdc;

        rbdc = rbd_client_find(ceph_opts);
        if (rbdc)       /* using an existing client */
                ceph_destroy_options(ceph_opts);
        else
                rbdc = rbd_client_create(ceph_opts);

        return rbdc;
}

/*
 * Destroy a ceph client.  Takes rbd_client_list_lock itself to
 * unlink the client, so the caller must not already hold that lock.
 */
static void rbd_client_release(struct kref *kref)
{
        struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

        dout("%s: rbdc %p\n", __func__, rbdc);
        spin_lock(&rbd_client_list_lock);
        list_del(&rbdc->node);
        spin_unlock(&rbd_client_list_lock);

        ceph_destroy_client(rbdc->client);
        kfree(rbdc);
}

/*
 * Drop a reference to a ceph client node.  If it's not referenced
 * anymore, release it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
        if (rbdc)
                kref_put(&rbdc->kref, rbd_client_release);
}

static bool rbd_image_format_valid(u32 image_format)
{
        return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
        size_t size;
        u32 snap_count;

        /* The header has to start with the magic rbd header text */
        if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
                return false;

        /* The bio layer requires at least sector-sized I/O */

        if (ondisk->options.order < SECTOR_SHIFT)
                return false;

        /* If we use u64 in a few spots we may be able to loosen this */

        if (ondisk->options.order > 8 * sizeof (int) - 1)
                return false;

        /*
         * The size of a snapshot header has to fit in a size_t, and
         * that limits the number of snapshots.
         */
        snap_count = le32_to_cpu(ondisk->snap_count);
        size = SIZE_MAX - sizeof (struct ceph_snap_context);
        if (snap_count > size / sizeof (__le64))
                return false;

        /*
         * Not only that, but the size of the entire snapshot
         * header must also be representable in a size_t.
         */
        size -= snap_count * sizeof (__le64);
        if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
                return false;

        return true;
}

/*
 * Create a new header structure, translating the header format from
 * the on-disk header.
 */
static int rbd_header_from_disk(struct rbd_image_header *header,
                                 struct rbd_image_header_ondisk *ondisk)
{
        u32 snap_count;
        size_t len;
        size_t size;
        u32 i;

        memset(header, 0, sizeof (*header));

        snap_count = le32_to_cpu(ondisk->snap_count);

        len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
        header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
        if (!header->object_prefix)
                return -ENOMEM;
        memcpy(header->object_prefix, ondisk->object_prefix, len);
        header->object_prefix[len] = '\0';

        if (snap_count) {
                u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

                /* Save a copy of the snapshot names */

                if (snap_names_len > (u64) SIZE_MAX) {
                        /* Don't leak the prefix allocated above */
                        kfree(header->object_prefix);
                        header->object_prefix = NULL;

                        return -EIO;
                }
                header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
                if (!header->snap_names)
                        goto out_err;
                /*
                 * Note that rbd_dev_v1_header_read() guarantees the
                 * ondisk buffer we're working with has snap_names_len
                 * bytes beyond the end of the snapshot id array, so
                 * this memcpy() is safe.
                 */
                memcpy(header->snap_names, &ondisk->snaps[snap_count],
                        snap_names_len);

                /* Record each snapshot's size */

                size = snap_count * sizeof (*header->snap_sizes);
                header->snap_sizes = kmalloc(size, GFP_KERNEL);
                if (!header->snap_sizes)
                        goto out_err;
                for (i = 0; i < snap_count; i++)
                        header->snap_sizes[i] =
                                le64_to_cpu(ondisk->snaps[i].image_size);
        } else {
                header->snap_names = NULL;
                header->snap_sizes = NULL;
        }

        header->features = 0;   /* No features support in v1 images */
        header->obj_order = ondisk->options.order;
        header->crypt_type = ondisk->options.crypt_type;
        header->comp_type = ondisk->options.comp_type;

        /* Allocate and fill in the snapshot context */

        header->image_size = le64_to_cpu(ondisk->image_size);

        header->snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
        if (!header->snapc)
                goto out_err;
        header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
        for (i = 0; i < snap_count; i++)
                header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id);

        return 0;

out_err:
        kfree(header->snap_sizes);
        header->snap_sizes = NULL;
        kfree(header->snap_names);
        header->snap_names = NULL;
        kfree(header->object_prefix);
        header->object_prefix = NULL;

        return -ENOMEM;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
        const char *snap_name;

        rbd_assert(which < rbd_dev->header.snapc->num_snaps);

        /* Skip over names until we find the one we are looking for */

        snap_name = rbd_dev->header.snap_names;
        while (which--)
                snap_name += strlen(snap_name) + 1;

        return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
        u64 snap_id1 = *(u64 *)s1;
        u64 snap_id2 = *(u64 *)s2;

        if (snap_id1 < snap_id2)
                return 1;
        return snap_id1 == snap_id2 ? 0 : -1;
}
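
/*
 * For instance, given snaps[] = { 40, 30, 10 } (descending, as the
 * osd maintains it), this comparator lets bsearch() treat the array
 * as sorted and locate id 30 at index 1.
 */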

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
        struct ceph_snap_context *snapc = rbd_dev->header.snapc;
        u64 *found;

        found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
                                sizeof (snap_id), snapid_compare_reverse);

        return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}

static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
                                        u64 snap_id)
{
        u32 which;

        which = rbd_dev_snap_index(rbd_dev, snap_id);
        if (which == BAD_SNAP_INDEX)
                return NULL;

        return _rbd_dev_v1_snap_name(rbd_dev, which);
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
        if (snap_id == CEPH_NOSNAP)
                return RBD_SNAP_HEAD_NAME;

        rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
        if (rbd_dev->image_format == 1)
                return rbd_dev_v1_snap_name(rbd_dev, snap_id);

        return rbd_dev_v2_snap_name(rbd_dev, snap_id);
}

static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
                                u64 *snap_size)
{
        rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
        if (snap_id == CEPH_NOSNAP) {
                *snap_size = rbd_dev->header.image_size;
        } else if (rbd_dev->image_format == 1) {
                u32 which;

                which = rbd_dev_snap_index(rbd_dev, snap_id);
                if (which == BAD_SNAP_INDEX)
                        return -ENOENT;

                *snap_size = rbd_dev->header.snap_sizes[which];
        } else {
                u64 size = 0;
                int ret;

                ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
                if (ret)
                        return ret;

                *snap_size = size;
        }
        return 0;
}

static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
                        u64 *snap_features)
{
        rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
        if (snap_id == CEPH_NOSNAP) {
                *snap_features = rbd_dev->header.features;
        } else if (rbd_dev->image_format == 1) {
                *snap_features = 0;     /* No features for format 1 */
        } else {
                u64 features = 0;
                int ret;

                ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
                if (ret)
                        return ret;

                *snap_features = features;
        }
        return 0;
}

static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
        const char *snap_name = rbd_dev->spec->snap_name;
        u64 snap_id;
        u64 size = 0;
        u64 features = 0;
        int ret;

        if (strcmp(snap_name, RBD_SNAP_HEAD_NAME)) {
                snap_id = rbd_snap_id_by_name(rbd_dev, snap_name);
                if (snap_id == CEPH_NOSNAP)
                        return -ENOENT;
        } else {
                snap_id = CEPH_NOSNAP;
        }

        ret = rbd_snap_size(rbd_dev, snap_id, &size);
        if (ret)
                return ret;
        ret = rbd_snap_features(rbd_dev, snap_id, &features);
        if (ret)
                return ret;

        rbd_dev->mapping.size = size;
        rbd_dev->mapping.features = features;

        /* If we are mapping a snapshot it must be marked read-only */

        if (snap_id != CEPH_NOSNAP)
                rbd_dev->mapping.read_only = true;

        return 0;
}

static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
        rbd_dev->mapping.size = 0;
        rbd_dev->mapping.features = 0;
        rbd_dev->mapping.read_only = true;
}

static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev)
{
        rbd_dev->mapping.size = 0;
        rbd_dev->mapping.features = 0;
        rbd_dev->mapping.read_only = true;
}

static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
        char *name;
        u64 segment;
        int ret;

        name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
        if (!name)
                return NULL;
        segment = offset >> rbd_dev->header.obj_order;
        ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
                        rbd_dev->header.object_prefix, segment);
        if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
                pr_err("error formatting segment name for #%llu (%d)\n",
                        segment, ret);
                /* The name came from a kmem_cache, not kmalloc() */
                kmem_cache_free(rbd_segment_name_cache, name);
                name = NULL;
        }

        return name;
}

static void rbd_segment_name_free(const char *name)
{
        /* The explicit cast here is needed to drop the const qualifier */

        kmem_cache_free(rbd_segment_name_cache, (void *)name);
}

static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
{
        u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

        return offset & (segment_size - 1);
}

static u64 rbd_segment_length(struct rbd_device *rbd_dev,
                                u64 offset, u64 length)
{
        u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

        offset &= segment_size - 1;

        rbd_assert(length <= U64_MAX - offset);
        if (offset + length > segment_size)
                length = segment_size - offset;

        return length;
}
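
/*
 * A worked example: with obj_order 22 (4 MiB segments), a 3 MiB
 * request starting at image offset 5 MiB begins 1 MiB into its
 * segment (5 MiB & (4 MiB - 1)) and fits, so all 3 MiB are returned;
 * a 3.5 MiB request at the same offset would be truncated to the
 * 3 MiB remaining in the segment.
 */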

/*
 * returns the size of an object in the image
 */
static u64 rbd_obj_bytes(struct rbd_image_header *header)
{
        return 1 << header->obj_order;
}

/*
 * bio helpers
 */

static void bio_chain_put(struct bio *chain)
{
        struct bio *tmp;

        while (chain) {
                tmp = chain;
                chain = chain->bi_next;
                bio_put(tmp);
        }
}

/*
 * zeros a bio chain, starting at a specific offset
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
        struct bio_vec *bv;
        unsigned long flags;
        void *buf;
        int i;
        int pos = 0;

        while (chain) {
                bio_for_each_segment(bv, chain, i) {
                        if (pos + bv->bv_len > start_ofs) {
                                int remainder = max(start_ofs - pos, 0);
                                buf = bvec_kmap_irq(bv, &flags);
                                memset(buf + remainder, 0,
                                       bv->bv_len - remainder);
                                bvec_kunmap_irq(buf, &flags);
                        }
                        pos += bv->bv_len;
                }

                chain = chain->bi_next;
        }
}

/*
 * Similar to zero_bio_chain(), zeros data defined by a page array,
 * starting at the given byte offset from the start of the array and
 * continuing up to the given end offset.  The pages array is
 * assumed to be big enough to hold all bytes up to the end.
 */
static void zero_pages(struct page **pages, u64 offset, u64 end)
{
        struct page **page = &pages[offset >> PAGE_SHIFT];

        rbd_assert(end > offset);
        rbd_assert(end - offset <= (u64)SIZE_MAX);
        while (offset < end) {
                size_t page_offset;
                size_t length;
                unsigned long flags;
                void *kaddr;

                page_offset = (size_t)(offset & ~PAGE_MASK);
                length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
                local_irq_save(flags);
                kaddr = kmap_atomic(*page);
                memset(kaddr + page_offset, 0, length);
                kunmap_atomic(kaddr);
                local_irq_restore(flags);

                offset += length;
                page++;
        }
}

/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
                                        unsigned int offset,
                                        unsigned int len,
                                        gfp_t gfpmask)
{
        struct bio_vec *bv;
        unsigned int resid;
        unsigned short idx;
        unsigned int voff;
        unsigned short end_idx;
        unsigned short vcnt;
        struct bio *bio;

        /* Handle the easy case for the caller */

        if (!offset && len == bio_src->bi_size)
                return bio_clone(bio_src, gfpmask);

        if (WARN_ON_ONCE(!len))
                return NULL;
        if (WARN_ON_ONCE(len > bio_src->bi_size))
                return NULL;
        if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
                return NULL;

        /* Find first affected segment... */

        resid = offset;
        __bio_for_each_segment(bv, bio_src, idx, 0) {
                if (resid < bv->bv_len)
                        break;
                resid -= bv->bv_len;
        }
        voff = resid;

        /* ...and the last affected segment */

        resid += len;
        __bio_for_each_segment(bv, bio_src, end_idx, idx) {
                if (resid <= bv->bv_len)
                        break;
                resid -= bv->bv_len;
        }
        vcnt = end_idx - idx + 1;

        /* Build the clone */

        bio = bio_alloc(gfpmask, (unsigned int) vcnt);
        if (!bio)
                return NULL;    /* ENOMEM */

        bio->bi_bdev = bio_src->bi_bdev;
        bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
        bio->bi_rw = bio_src->bi_rw;
        bio->bi_flags |= 1 << BIO_CLONED;

        /*
         * Copy over our part of the bio_vec, then update the first
         * and last (or only) entries.
         */
        memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
                        vcnt * sizeof (struct bio_vec));
        bio->bi_io_vec[0].bv_offset += voff;
        if (vcnt > 1) {
                bio->bi_io_vec[0].bv_len -= voff;
                bio->bi_io_vec[vcnt - 1].bv_len = resid;
        } else {
                bio->bi_io_vec[0].bv_len = len;
        }

        bio->bi_vcnt = vcnt;
        bio->bi_size = len;
        bio->bi_idx = 0;

        return bio;
}

/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains the first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
                                        unsigned int *offset,
                                        unsigned int len,
                                        gfp_t gfpmask)
{
        struct bio *bi = *bio_src;
        unsigned int off = *offset;
        struct bio *chain = NULL;
        struct bio **end;

        /* Build up a chain of clone bios up to the limit */

        if (!bi || off >= bi->bi_size || !len)
                return NULL;            /* Nothing to clone */

        end = &chain;
        while (len) {
                unsigned int bi_size;
                struct bio *bio;

                if (!bi) {
                        rbd_warn(NULL, "bio_chain exhausted with %u left", len);
                        goto out_err;   /* EINVAL; ran out of bio's */
                }
                bi_size = min_t(unsigned int, bi->bi_size - off, len);
                bio = bio_clone_range(bi, off, bi_size, gfpmask);
                if (!bio)
                        goto out_err;   /* ENOMEM */

                *end = bio;
                end = &bio->bi_next;

                off += bi_size;
                if (off == bi->bi_size) {
                        bi = bi->bi_next;
                        off = 0;
                }
                len -= bi_size;
        }
        *bio_src = bi;
        *offset = off;

        return chain;
out_err:
        bio_chain_put(chain);

        return NULL;
}

/*
 * The default/initial value for all object request flags is 0.  For
 * each flag, once its value is set to 1 it is never reset to 0
 * again.
 */
static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
{
        if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
                struct rbd_device *rbd_dev;

                rbd_dev = obj_request->img_request->rbd_dev;
                rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
                        obj_request);
        }
}

static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
{
        smp_mb();
        return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
}

static void obj_request_done_set(struct rbd_obj_request *obj_request)
{
        if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
                struct rbd_device *rbd_dev = NULL;

                if (obj_request_img_data_test(obj_request))
                        rbd_dev = obj_request->img_request->rbd_dev;
                rbd_warn(rbd_dev, "obj_request %p already marked done\n",
                        obj_request);
        }
}

static bool obj_request_done_test(struct rbd_obj_request *obj_request)
{
        smp_mb();
        return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
}

/*
 * This sets the KNOWN flag after (possibly) setting the EXISTS
 * flag.  The latter is set based on the "exists" value provided.
 *
 * Note that for our purposes once an object exists it never goes
 * away again.  It's possible that the responses from two existence
 * checks are separated by the creation of the target object, and
 * the first ("doesn't exist") response arrives *after* the second
 * ("does exist").  In that case we ignore the second one.
 */
static void obj_request_existence_set(struct rbd_obj_request *obj_request,
                                bool exists)
{
        if (exists)
                set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
        set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
        smp_mb();
}

static bool obj_request_known_test(struct rbd_obj_request *obj_request)
{
        smp_mb();
        return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
}

static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
{
        smp_mb();
        return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
}
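
/*
 * The smp_mb() calls in the helpers above appear intended to pair
 * with each other: a setter publishes its set_bit() results before
 * anything that follows, and a tester orders its test_bit() after
 * whatever came before, so a caller that sees KNOWN set can also
 * rely on seeing the EXISTS value stored before it.
 */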

static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
        dout("%s: obj %p (was %d)\n", __func__, obj_request,
                atomic_read(&obj_request->kref.refcount));
        kref_get(&obj_request->kref);
}

static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
        rbd_assert(obj_request != NULL);
        dout("%s: obj %p (was %d)\n", __func__, obj_request,
                atomic_read(&obj_request->kref.refcount));
        kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static void rbd_img_request_get(struct rbd_img_request *img_request)
{
        dout("%s: img %p (was %d)\n", __func__, img_request,
                atomic_read(&img_request->kref.refcount));
        kref_get(&img_request->kref);
}

static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
        rbd_assert(img_request != NULL);
        dout("%s: img %p (was %d)\n", __func__, img_request,
                atomic_read(&img_request->kref.refcount));
        kref_put(&img_request->kref, rbd_img_request_destroy);
}

static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
                                        struct rbd_obj_request *obj_request)
{
        rbd_assert(obj_request->img_request == NULL);

        /* Image request now owns object's original reference */
        obj_request->img_request = img_request;
        obj_request->which = img_request->obj_request_count;
        rbd_assert(!obj_request_img_data_test(obj_request));
        obj_request_img_data_set(obj_request);
        rbd_assert(obj_request->which != BAD_WHICH);
        img_request->obj_request_count++;
        list_add_tail(&obj_request->links, &img_request->obj_requests);
        dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
                obj_request->which);
}

static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
                                        struct rbd_obj_request *obj_request)
{
        rbd_assert(obj_request->which != BAD_WHICH);

        dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
                obj_request->which);
        list_del(&obj_request->links);
        rbd_assert(img_request->obj_request_count > 0);
        img_request->obj_request_count--;
        rbd_assert(obj_request->which == img_request->obj_request_count);
        obj_request->which = BAD_WHICH;
        rbd_assert(obj_request_img_data_test(obj_request));
        rbd_assert(obj_request->img_request == img_request);
        obj_request->img_request = NULL;
        obj_request->callback = NULL;
        rbd_obj_request_put(obj_request);
}

static bool obj_request_type_valid(enum obj_request_type type)
{
        switch (type) {
        case OBJ_REQUEST_NODATA:
        case OBJ_REQUEST_BIO:
        case OBJ_REQUEST_PAGES:
                return true;
        default:
                return false;
        }
}

static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
                                struct rbd_obj_request *obj_request)
{
        dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);

        return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}

static void rbd_img_request_complete(struct rbd_img_request *img_request)
{
        dout("%s: img %p\n", __func__, img_request);

        /*
         * If no error occurred, compute the aggregate transfer
         * count for the image request.  We could instead use
         * atomic64_cmpxchg() to update it as each object request
         * completes; it's not clear offhand which way is better.
         */
        if (!img_request->result) {
                struct rbd_obj_request *obj_request;
                u64 xferred = 0;

                for_each_obj_request(img_request, obj_request)
                        xferred += obj_request->xferred;
                img_request->xferred = xferred;
        }

        if (img_request->callback)
                img_request->callback(img_request);
        else
                rbd_img_request_put(img_request);
}

/* Caller is responsible for rbd_obj_request_destroy(obj_request) */

static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
        dout("%s: obj %p\n", __func__, obj_request);

        return wait_for_completion_interruptible(&obj_request->completion);
}

/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never changes thereafter.
 */
static void img_request_write_set(struct rbd_img_request *img_request)
{
        set_bit(IMG_REQ_WRITE, &img_request->flags);
        smp_mb();
}

static bool img_request_write_test(struct rbd_img_request *img_request)
{
        smp_mb();
        return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
}

static void img_request_child_set(struct rbd_img_request *img_request)
{
        set_bit(IMG_REQ_CHILD, &img_request->flags);
        smp_mb();
}

static bool img_request_child_test(struct rbd_img_request *img_request)
{
        smp_mb();
        return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
}

static void img_request_layered_set(struct rbd_img_request *img_request)
{
        set_bit(IMG_REQ_LAYERED, &img_request->flags);
        smp_mb();
}

static bool img_request_layered_test(struct rbd_img_request *img_request)
{
        smp_mb();
        return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}

static void
rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
{
        u64 xferred = obj_request->xferred;
        u64 length = obj_request->length;

        dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
                obj_request, obj_request->img_request, obj_request->result,
                xferred, length);
        /*
         * ENOENT means a hole in the image.  We zero-fill the
         * entire length of the request.  A short read also implies
         * zero-fill to the end of the request.  Either way we
         * update the xferred count to indicate the whole request
         * was satisfied.
         */
        rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
        if (obj_request->result == -ENOENT) {
                if (obj_request->type == OBJ_REQUEST_BIO)
                        zero_bio_chain(obj_request->bio_list, 0);
                else
                        zero_pages(obj_request->pages, 0, length);
                obj_request->result = 0;
                obj_request->xferred = length;
        } else if (xferred < length && !obj_request->result) {
                if (obj_request->type == OBJ_REQUEST_BIO)
                        zero_bio_chain(obj_request->bio_list, xferred);
                else
                        zero_pages(obj_request->pages, xferred, length);
                obj_request->xferred = length;
        }
        obj_request_done_set(obj_request);
}

static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
{
        dout("%s: obj %p cb %p\n", __func__, obj_request,
                obj_request->callback);
        if (obj_request->callback)
                obj_request->callback(obj_request);
        else
                complete_all(&obj_request->completion);
}

static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
        dout("%s: obj %p\n", __func__, obj_request);
        obj_request_done_set(obj_request);
}

static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
{
        struct rbd_img_request *img_request = NULL;
        struct rbd_device *rbd_dev = NULL;
        bool layered = false;

        if (obj_request_img_data_test(obj_request)) {
                img_request = obj_request->img_request;
                layered = img_request && img_request_layered_test(img_request);
                rbd_dev = img_request->rbd_dev;
        }

        dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
                obj_request, img_request, obj_request->result,
                obj_request->xferred, obj_request->length);
        if (layered && obj_request->result == -ENOENT &&
                        obj_request->img_offset < rbd_dev->parent_overlap)
                rbd_img_parent_read(obj_request);
        else if (img_request)
                rbd_img_obj_request_read_callback(obj_request);
        else
                obj_request_done_set(obj_request);
}

static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
{
        dout("%s: obj %p result %d %llu\n", __func__, obj_request,
                obj_request->result, obj_request->length);
        /*
         * There is no such thing as a successful short write.  Set
         * it to our originally-requested length.
         */
        obj_request->xferred = obj_request->length;
        obj_request_done_set(obj_request);
}

/*
 * For a simple stat call there's nothing to do.  We'll do more if
 * this is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
        dout("%s: obj %p\n", __func__, obj_request);
        obj_request_done_set(obj_request);
}

static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
                                struct ceph_msg *msg)
{
        struct rbd_obj_request *obj_request = osd_req->r_priv;
        u16 opcode;

        dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
        rbd_assert(osd_req == obj_request->osd_req);
        if (obj_request_img_data_test(obj_request)) {
                rbd_assert(obj_request->img_request);
                rbd_assert(obj_request->which != BAD_WHICH);
        } else {
                rbd_assert(obj_request->which == BAD_WHICH);
        }

        if (osd_req->r_result < 0)
                obj_request->result = osd_req->r_result;

        BUG_ON(osd_req->r_num_ops > 2);

        /*
         * We support a 64-bit length, but ultimately it has to be
         * passed to blk_end_request(), which takes an unsigned int.
         */
        obj_request->xferred = osd_req->r_reply_op_len[0];
        rbd_assert(obj_request->xferred < (u64)UINT_MAX);
        opcode = osd_req->r_ops[0].op;
        switch (opcode) {
        case CEPH_OSD_OP_READ:
                rbd_osd_read_callback(obj_request);
                break;
        case CEPH_OSD_OP_WRITE:
                rbd_osd_write_callback(obj_request);
                break;
        case CEPH_OSD_OP_STAT:
                rbd_osd_stat_callback(obj_request);
                break;
        case CEPH_OSD_OP_CALL:
        case CEPH_OSD_OP_NOTIFY_ACK:
        case CEPH_OSD_OP_WATCH:
                rbd_osd_trivial_callback(obj_request);
                break;
        default:
                rbd_warn(NULL, "%s: unsupported op %hu\n",
                        obj_request->object_name, (unsigned short) opcode);
                break;
        }

        if (obj_request_done_test(obj_request))
                rbd_obj_request_complete(obj_request);
}

static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
{
        struct rbd_img_request *img_request = obj_request->img_request;
        struct ceph_osd_request *osd_req = obj_request->osd_req;
1647         u64 snap_id;
1648
1649         rbd_assert(osd_req != NULL);
1650
1651         snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
1652         ceph_osdc_build_request(osd_req, obj_request->offset,
1653                         NULL, snap_id, NULL);
1654 }
1655
1656 static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1657 {
1658         struct rbd_img_request *img_request = obj_request->img_request;
1659         struct ceph_osd_request *osd_req = obj_request->osd_req;
1660         struct ceph_snap_context *snapc;
1661         struct timespec mtime = CURRENT_TIME;
1662
1663         rbd_assert(osd_req != NULL);
1664
1665         snapc = img_request ? img_request->snapc : NULL;
1666         ceph_osdc_build_request(osd_req, obj_request->offset,
1667                         snapc, CEPH_NOSNAP, &mtime);
1668 }
1669
1670 static struct ceph_osd_request *rbd_osd_req_create(
1671                                         struct rbd_device *rbd_dev,
1672                                         bool write_request,
1673                                         struct rbd_obj_request *obj_request)
1674 {
1675         struct ceph_snap_context *snapc = NULL;
1676         struct ceph_osd_client *osdc;
1677         struct ceph_osd_request *osd_req;
1678
1679         if (obj_request_img_data_test(obj_request)) {
1680                 struct rbd_img_request *img_request = obj_request->img_request;
1681
1682                 rbd_assert(write_request ==
1683                                 img_request_write_test(img_request));
1684                 if (write_request)
1685                         snapc = img_request->snapc;
1686         }
1687
1688         /* Allocate and initialize the request, for the single op */
1689
1690         osdc = &rbd_dev->rbd_client->client->osdc;
1691         osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1692         if (!osd_req)
1693                 return NULL;    /* ENOMEM */
1694
1695         if (write_request)
1696                 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1697         else
1698                 osd_req->r_flags = CEPH_OSD_FLAG_READ;
1699
1700         osd_req->r_callback = rbd_osd_req_callback;
1701         osd_req->r_priv = obj_request;
1702
1703         osd_req->r_oid_len = strlen(obj_request->object_name);
1704         rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1705         memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1706
1707         osd_req->r_file_layout = rbd_dev->layout;       /* struct */
1708
1709         return osd_req;
1710 }
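
/*
 * The caller initializes the single op on the request returned above,
 * attaches its data, and calls rbd_osd_req_format_read() or
 * rbd_osd_req_format_write() before submitting; rbd_img_request_fill()
 * below shows the typical sequence.
 */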
1711
1712 /*
1713  * Create a copyup osd request based on the information in the
1714  * object request supplied.  A copyup request has two osd ops,
1715  * a copyup method call, and a "normal" write request.
1716  */
1717 static struct ceph_osd_request *
1718 rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1719 {
1720         struct rbd_img_request *img_request;
1721         struct ceph_snap_context *snapc;
1722         struct rbd_device *rbd_dev;
1723         struct ceph_osd_client *osdc;
1724         struct ceph_osd_request *osd_req;
1725
1726         rbd_assert(obj_request_img_data_test(obj_request));
1727         img_request = obj_request->img_request;
1728         rbd_assert(img_request);
1729         rbd_assert(img_request_write_test(img_request));
1730
1731         /* Allocate and initialize the request, for the two ops */
1732
1733         snapc = img_request->snapc;
1734         rbd_dev = img_request->rbd_dev;
1735         osdc = &rbd_dev->rbd_client->client->osdc;
1736         osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
1737         if (!osd_req)
1738                 return NULL;    /* ENOMEM */
1739
1740         osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1741         osd_req->r_callback = rbd_osd_req_callback;
1742         osd_req->r_priv = obj_request;
1743
1744         osd_req->r_oid_len = strlen(obj_request->object_name);
1745         rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1746         memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1747
1748         osd_req->r_file_layout = rbd_dev->layout;       /* struct */
1749
1750         return osd_req;
1751 }
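
/*
 * Op order matters in the request returned above: the caller sets up
 * op 0 as the "copyup" class method call and op 1 as the write; see
 * rbd_img_obj_parent_read_full_callback() below.
 */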
1752
1753
1754 static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1755 {
1756         ceph_osdc_put_request(osd_req);
1757 }
1758
1759 /* object_name is assumed to be a non-null pointer and NUL-terminated */
1760
1761 static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1762                                                 u64 offset, u64 length,
1763                                                 enum obj_request_type type)
1764 {
1765         struct rbd_obj_request *obj_request;
1766         size_t size;
1767         char *name;
1768
1769         rbd_assert(obj_request_type_valid(type));
1770
1771         size = strlen(object_name) + 1;
1772         name = kmalloc(size, GFP_KERNEL);
1773         if (!name)
1774                 return NULL;
1775
1776         obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
1777         if (!obj_request) {
1778                 kfree(name);
1779                 return NULL;
1780         }
1781
1782         obj_request->object_name = memcpy(name, object_name, size);
1783         obj_request->offset = offset;
1784         obj_request->length = length;
1785         obj_request->flags = 0;
1786         obj_request->which = BAD_WHICH;
1787         obj_request->type = type;
1788         INIT_LIST_HEAD(&obj_request->links);
1789         init_completion(&obj_request->completion);
1790         kref_init(&obj_request->kref);
1791
1792         dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1793                 offset, length, (int)type, obj_request);
1794
1795         return obj_request;
1796 }
1797
1798 static void rbd_obj_request_destroy(struct kref *kref)
1799 {
1800         struct rbd_obj_request *obj_request;
1801
1802         obj_request = container_of(kref, struct rbd_obj_request, kref);
1803
1804         dout("%s: obj %p\n", __func__, obj_request);
1805
1806         rbd_assert(obj_request->img_request == NULL);
1807         rbd_assert(obj_request->which == BAD_WHICH);
1808
1809         if (obj_request->osd_req)
1810                 rbd_osd_req_destroy(obj_request->osd_req);
1811
1812         rbd_assert(obj_request_type_valid(obj_request->type));
1813         switch (obj_request->type) {
1814         case OBJ_REQUEST_NODATA:
1815                 break;          /* Nothing to do */
1816         case OBJ_REQUEST_BIO:
1817                 if (obj_request->bio_list)
1818                         bio_chain_put(obj_request->bio_list);
1819                 break;
1820         case OBJ_REQUEST_PAGES:
1821                 if (obj_request->pages)
1822                         ceph_release_page_vector(obj_request->pages,
1823                                                 obj_request->page_count);
1824                 break;
1825         }
1826
1827         kfree(obj_request->object_name);
1828         obj_request->object_name = NULL;
1829         kmem_cache_free(rbd_obj_request_cache, obj_request);
1830 }
1831
1832 /*
1833  * Caller is responsible for filling in the list of object requests
1834  * that comprises the image request, and the Linux request pointer
1835  * (if there is one).
1836  */
1837 static struct rbd_img_request *rbd_img_request_create(
1838                                         struct rbd_device *rbd_dev,
1839                                         u64 offset, u64 length,
1840                                         bool write_request,
1841                                         bool child_request)
1842 {
1843         struct rbd_img_request *img_request;
1844
1845         img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
1846         if (!img_request)
1847                 return NULL;
1848
1849         if (write_request) {
1850                 down_read(&rbd_dev->header_rwsem);
1851                 ceph_get_snap_context(rbd_dev->header.snapc);
1852                 up_read(&rbd_dev->header_rwsem);
1853         }
1854
1855         img_request->rq = NULL;
1856         img_request->rbd_dev = rbd_dev;
1857         img_request->offset = offset;
1858         img_request->length = length;
1859         img_request->flags = 0;
1860         if (write_request) {
1861                 img_request_write_set(img_request);
1862                 img_request->snapc = rbd_dev->header.snapc;
1863         } else {
1864                 img_request->snap_id = rbd_dev->spec->snap_id;
1865         }
1866         if (child_request)
1867                 img_request_child_set(img_request);
1868         if (rbd_dev->parent_spec)
1869                 img_request_layered_set(img_request);
1870         spin_lock_init(&img_request->completion_lock);
1871         img_request->next_completion = 0;
1872         img_request->callback = NULL;
1873         img_request->result = 0;
1874         img_request->obj_request_count = 0;
1875         INIT_LIST_HEAD(&img_request->obj_requests);
1876         kref_init(&img_request->kref);
1877
1878         rbd_img_request_get(img_request);       /* Avoid a warning */
1879         rbd_img_request_put(img_request);       /* TEMPORARY */
1880
1881         dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
1882                 write_request ? "write" : "read", offset, length,
1883                 img_request);
1884
1885         return img_request;
1886 }
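
/*
 * Typical use, as in rbd_request_fn() below: create the image request,
 * populate it with object requests via rbd_img_request_fill(), submit
 * with rbd_img_request_submit(), and drop the reference with
 * rbd_img_request_put() if anything fails.
 */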
1887
1888 static void rbd_img_request_destroy(struct kref *kref)
1889 {
1890         struct rbd_img_request *img_request;
1891         struct rbd_obj_request *obj_request;
1892         struct rbd_obj_request *next_obj_request;
1893
1894         img_request = container_of(kref, struct rbd_img_request, kref);
1895
1896         dout("%s: img %p\n", __func__, img_request);
1897
1898         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1899                 rbd_img_obj_request_del(img_request, obj_request);
1900         rbd_assert(img_request->obj_request_count == 0);
1901
1902         if (img_request_write_test(img_request))
1903                 ceph_put_snap_context(img_request->snapc);
1904
1905         if (img_request_child_test(img_request))
1906                 rbd_obj_request_put(img_request->obj_request);
1907
1908         kmem_cache_free(rbd_img_request_cache, img_request);
1909 }
1910
1911 static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
1912 {
1913         struct rbd_img_request *img_request;
1914         unsigned int xferred;
1915         int result;
1916         bool more;
1917
1918         rbd_assert(obj_request_img_data_test(obj_request));
1919         img_request = obj_request->img_request;
1920
1921         rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
1922         xferred = (unsigned int)obj_request->xferred;
1923         result = obj_request->result;
1924         if (result) {
1925                 struct rbd_device *rbd_dev = img_request->rbd_dev;
1926
1927                 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
1928                         img_request_write_test(img_request) ? "write" : "read",
1929                         obj_request->length, obj_request->img_offset,
1930                         obj_request->offset);
1931                 rbd_warn(rbd_dev, "  result %d xferred %x\n",
1932                         result, xferred);
1933                 if (!img_request->result)
1934                         img_request->result = result;
1935         }
1936
1937         /* Image object requests don't own their page array */
1938
1939         if (obj_request->type == OBJ_REQUEST_PAGES) {
1940                 obj_request->pages = NULL;
1941                 obj_request->page_count = 0;
1942         }
1943
1944         if (img_request_child_test(img_request)) {
1945                 rbd_assert(img_request->obj_request != NULL);
1946                 more = obj_request->which < img_request->obj_request_count - 1;
1947         } else {
1948                 rbd_assert(img_request->rq != NULL);
1949                 more = blk_end_request(img_request->rq, result, xferred);
1950         }
1951
1952         return more;
1953 }
1954
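/*
 * Per-object completion callback for image requests.  Object requests
 * may finish in any order, but completion is delivered in object
 * order: only the request whose "which" matches next_completion makes
 * progress here, and it also retires any already-done successors.
 */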
1955 static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1956 {
1957         struct rbd_img_request *img_request;
1958         u32 which = obj_request->which;
1959         bool more = true;
1960
1961         rbd_assert(obj_request_img_data_test(obj_request));
1962         img_request = obj_request->img_request;
1963
1964         dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1965         rbd_assert(img_request != NULL);
1966         rbd_assert(img_request->obj_request_count > 0);
1967         rbd_assert(which != BAD_WHICH);
1968         rbd_assert(which < img_request->obj_request_count);
1969         rbd_assert(which >= img_request->next_completion);
1970
1971         spin_lock_irq(&img_request->completion_lock);
1972         if (which != img_request->next_completion)
1973                 goto out;
1974
1975         for_each_obj_request_from(img_request, obj_request) {
1976                 rbd_assert(more);
1977                 rbd_assert(which < img_request->obj_request_count);
1978
1979                 if (!obj_request_done_test(obj_request))
1980                         break;
1981                 more = rbd_img_obj_end_request(obj_request);
1982                 which++;
1983         }
1984
1985         rbd_assert(more ^ (which == img_request->obj_request_count));
1986         img_request->next_completion = which;
1987 out:
1988         spin_unlock_irq(&img_request->completion_lock);
1989
1990         if (!more)
1991                 rbd_img_request_complete(img_request);
1992 }
1993
1994 /*
1995  * Split up an image request into one or more object requests, each
1996  * to a different object.  The "type" parameter indicates whether
1997  * "data_desc" is the pointer to the head of a list of bio
1998  * structures, or the base of a page array.  In either case this
1999  * function assumes data_desc describes memory sufficient to hold
2000  * all data described by the image request.
2001  */
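/*
 * A worked example, assuming the default 4 MiB (order 22) objects: a
 * 6 MiB read at image offset 3 MiB becomes three object requests --
 * 1 MiB at offset 3 MiB in the first object, all 4 MiB of the second,
 * and 1 MiB at offset 0 in the third.
 */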
2002 static int rbd_img_request_fill(struct rbd_img_request *img_request,
2003                                         enum obj_request_type type,
2004                                         void *data_desc)
2005 {
2006         struct rbd_device *rbd_dev = img_request->rbd_dev;
2007         struct rbd_obj_request *obj_request = NULL;
2008         struct rbd_obj_request *next_obj_request;
2009         bool write_request = img_request_write_test(img_request);
2010         struct bio *bio_list;
2011         unsigned int bio_offset = 0;
2012         struct page **pages;
2013         u64 img_offset;
2014         u64 resid;
2015         u16 opcode;
2016
2017         dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2018                 (int)type, data_desc);
2019
2020         opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
2021         img_offset = img_request->offset;
2022         resid = img_request->length;
2023         rbd_assert(resid > 0);
2024
2025         if (type == OBJ_REQUEST_BIO) {
2026                 bio_list = data_desc;
2027                 rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
2028         } else {
2029                 rbd_assert(type == OBJ_REQUEST_PAGES);
2030                 pages = data_desc;
2031         }
2032
2033         while (resid) {
2034                 struct ceph_osd_request *osd_req;
2035                 const char *object_name;
2036                 u64 offset;
2037                 u64 length;
2038
2039                 object_name = rbd_segment_name(rbd_dev, img_offset);
2040                 if (!object_name)
2041                         goto out_unwind;
2042                 offset = rbd_segment_offset(rbd_dev, img_offset);
2043                 length = rbd_segment_length(rbd_dev, img_offset, resid);
2044                 obj_request = rbd_obj_request_create(object_name,
2045                                                 offset, length, type);
2046                 /* object request has its own copy of the object name */
2047                 rbd_segment_name_free(object_name);
2048                 if (!obj_request)
2049                         goto out_unwind;
2050
2051                 if (type == OBJ_REQUEST_BIO) {
2052                         unsigned int clone_size;
2053
2054                         rbd_assert(length <= (u64)UINT_MAX);
2055                         clone_size = (unsigned int)length;
2056                         obj_request->bio_list =
2057                                         bio_chain_clone_range(&bio_list,
2058                                                                 &bio_offset,
2059                                                                 clone_size,
2060                                                                 GFP_ATOMIC);
2061                         if (!obj_request->bio_list)
2062                                 goto out_partial;
2063                 } else {
2064                         unsigned int page_count;
2065
2066                         obj_request->pages = pages;
2067                         page_count = (u32)calc_pages_for(offset, length);
2068                         obj_request->page_count = page_count;
2069                         if ((offset + length) & ~PAGE_MASK)
2070                                 page_count--;   /* more on last page */
2071                         pages += page_count;
2072                 }
2073
2074                 osd_req = rbd_osd_req_create(rbd_dev, write_request,
2075                                                 obj_request);
2076                 if (!osd_req)
2077                         goto out_partial;
2078                 obj_request->osd_req = osd_req;
2079                 obj_request->callback = rbd_img_obj_callback;
2080
2081                 osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
2082                                                 0, 0);
2083                 if (type == OBJ_REQUEST_BIO)
2084                         osd_req_op_extent_osd_data_bio(osd_req, 0,
2085                                         obj_request->bio_list, length);
2086                 else
2087                         osd_req_op_extent_osd_data_pages(osd_req, 0,
2088                                         obj_request->pages, length,
2089                                         offset & ~PAGE_MASK, false, false);
2090
2091                 if (write_request)
2092                         rbd_osd_req_format_write(obj_request);
2093                 else
2094                         rbd_osd_req_format_read(obj_request);
2095
2096                 obj_request->img_offset = img_offset;
2097                 rbd_img_obj_request_add(img_request, obj_request);
2098
2099                 img_offset += length;
2100                 resid -= length;
2101         }
2102
2103         return 0;
2104
2105 out_partial:
2106         rbd_obj_request_put(obj_request);
2107 out_unwind:
2108         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2109                 rbd_obj_request_put(obj_request);
2110
2111         return -ENOMEM;
2112 }
2113
2114 static void
2115 rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
2116 {
2117         struct rbd_img_request *img_request;
2118         struct rbd_device *rbd_dev;
2119         u64 length;
2120         u32 page_count;
2121
2122         rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2123         rbd_assert(obj_request_img_data_test(obj_request));
2124         img_request = obj_request->img_request;
2125         rbd_assert(img_request);
2126
2127         rbd_dev = img_request->rbd_dev;
2128         rbd_assert(rbd_dev);
2129         length = (u64)1 << rbd_dev->header.obj_order;
2130         page_count = (u32)calc_pages_for(0, length);
2131
2132         rbd_assert(obj_request->copyup_pages);
2133         ceph_release_page_vector(obj_request->copyup_pages, page_count);
2134         obj_request->copyup_pages = NULL;
2135
2136         /*
2137          * We want the transfer count to reflect the size of the
2138          * original write request.  There is no such thing as a
2139          * successful short write, so if the request was successful
2140          * we can just set it to the originally-requested length.
2141          */
2142         if (!obj_request->result)
2143                 obj_request->xferred = obj_request->length;
2144
2145         /* Finish up with the normal image object callback */
2146
2147         rbd_img_obj_callback(obj_request);
2148 }
2149
2150 static void
2151 rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2152 {
2153         struct rbd_obj_request *orig_request;
2154         struct ceph_osd_request *osd_req;
2155         struct ceph_osd_client *osdc;
2156         struct rbd_device *rbd_dev;
2157         struct page **pages;
2158         int result;
2159         u64 obj_size;
2160         u64 xferred;
2161
2162         rbd_assert(img_request_child_test(img_request));
2163
2164         /* First get what we need from the image request */
2165
2166         pages = img_request->copyup_pages;
2167         rbd_assert(pages != NULL);
2168         img_request->copyup_pages = NULL;
2169
2170         orig_request = img_request->obj_request;
2171         rbd_assert(orig_request != NULL);
2172         rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
2173         result = img_request->result;
2174         obj_size = img_request->length;
2175         xferred = img_request->xferred;
2176
2177         rbd_dev = img_request->rbd_dev;
2178         rbd_assert(rbd_dev);
2179         rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);
2180
2181         rbd_img_request_put(img_request);
2182
2183         if (result)
2184                 goto out_err;
2185
2186         /* Allocate the new copyup osd request for the original request */
2187
2188         result = -ENOMEM;
2189         rbd_assert(!orig_request->osd_req);
2190         osd_req = rbd_osd_req_create_copyup(orig_request);
2191         if (!osd_req)
2192                 goto out_err;
2193         orig_request->osd_req = osd_req;
2194         orig_request->copyup_pages = pages;
2195
2196         /* Initialize the copyup op */
2197
2198         osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2199         osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
2200                                                 false, false);
2201
2202         /* Then the original write request op */
2203
2204         osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
2205                                         orig_request->offset,
2206                                         orig_request->length, 0, 0);
2207         osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
2208                                         orig_request->length);
2209
2210         rbd_osd_req_format_write(orig_request);
2211
2212         /* All set, send it off. */
2213
2214         orig_request->callback = rbd_img_obj_copyup_callback;
2215         osdc = &rbd_dev->rbd_client->client->osdc;
2216         result = rbd_obj_request_submit(osdc, orig_request);
2217         if (!result)
2218                 return;
2219 out_err:
2220         /* Record the error code and complete the request */
2221
2222         orig_request->result = result;
2223         orig_request->xferred = 0;
2224         obj_request_done_set(orig_request);
2225         rbd_obj_request_complete(orig_request);
2226 }
2227
2228 /*
2229  * Read from the parent image the range of data that covers the
2230  * entire target of the given object request.  This is used for
2231  * satisfying a layered image write request when the target of an
2232  * object request from the image request does not exist.
2233  *
2234  * A page array big enough to hold the returned data is allocated
2235  * and supplied to rbd_img_request_fill() as the "data descriptor."
2236  * When the read completes, this page array will be transferred to
2237  * the original object request for the copyup operation.
2238  *
2239  * If an error occurs, record it as the result of the original
2240  * object request and mark it done so it gets completed.
2241  */
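/*
 * In brief: drop the original single-op osd request, read the full
 * object extent (clipped to the parent overlap) from the parent
 * image, and let rbd_img_obj_parent_read_full_callback() above turn
 * the result into the two-op copyup + write request.
 */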
2242 static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
2243 {
2244         struct rbd_img_request *img_request = NULL;
2245         struct rbd_img_request *parent_request = NULL;
2246         struct rbd_device *rbd_dev;
2247         u64 img_offset;
2248         u64 length;
2249         struct page **pages = NULL;
2250         u32 page_count;
2251         int result;
2252
2253         rbd_assert(obj_request_img_data_test(obj_request));
2254         rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2255
2256         img_request = obj_request->img_request;
2257         rbd_assert(img_request != NULL);
2258         rbd_dev = img_request->rbd_dev;
2259         rbd_assert(rbd_dev->parent != NULL);
2260
2261         /*
2262          * First things first.  The original osd request is of no
2263          * use to us anymore; we'll need a new one that can hold
2264          * the two ops in a copyup request.  We'll get that later,
2265          * but for now we can release the old one.
2266          */
2267         rbd_osd_req_destroy(obj_request->osd_req);
2268         obj_request->osd_req = NULL;
2269
2270         /*
2271          * Determine the byte range covered by the object in the
2272          * child image to which the original request was to be sent.
2273          */
2274         img_offset = obj_request->img_offset - obj_request->offset;
2275         length = (u64)1 << rbd_dev->header.obj_order;
2276
2277         /*
2278          * There is no defined parent data beyond the parent
2279          * overlap, so limit what we read at that boundary if
2280          * necessary.
2281          */
2282         if (img_offset + length > rbd_dev->parent_overlap) {
2283                 rbd_assert(img_offset < rbd_dev->parent_overlap);
2284                 length = rbd_dev->parent_overlap - img_offset;
2285         }
2286
2287         /*
2288          * Allocate a page array big enough to receive the data read
2289          * from the parent.
2290          */
2291         page_count = (u32)calc_pages_for(0, length);
2292         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2293         if (IS_ERR(pages)) {
2294                 result = PTR_ERR(pages);
2295                 pages = NULL;
2296                 goto out_err;
2297         }
2298
2299         result = -ENOMEM;
2300         parent_request = rbd_img_request_create(rbd_dev->parent,
2301                                                 img_offset, length,
2302                                                 false, true);
2303         if (!parent_request)
2304                 goto out_err;
2305         rbd_obj_request_get(obj_request);
2306         parent_request->obj_request = obj_request;
2307
2308         result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
2309         if (result)
2310                 goto out_err;
2311         parent_request->copyup_pages = pages;
2312
2313         parent_request->callback = rbd_img_obj_parent_read_full_callback;
2314         result = rbd_img_request_submit(parent_request);
2315         if (!result)
2316                 return 0;
2317
2318         parent_request->copyup_pages = NULL;
2319         parent_request->obj_request = NULL;
2320         rbd_obj_request_put(obj_request);
2321 out_err:
2322         if (pages)
2323                 ceph_release_page_vector(pages, page_count);
2324         if (parent_request)
2325                 rbd_img_request_put(parent_request);
2326         obj_request->result = result;
2327         obj_request->xferred = 0;
2328         obj_request_done_set(obj_request);
2329
2330         return result;
2331 }
2332
2333 static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2334 {
2335         struct rbd_obj_request *orig_request;
2336         int result;
2337
2338         rbd_assert(!obj_request_img_data_test(obj_request));
2339
2340         /*
2341          * All we need from the object request is the original
2342          * request and the result of the STAT op.  Grab those, then
2343          * we're done with the request.
2344          */
2345         orig_request = obj_request->obj_request;
2346         obj_request->obj_request = NULL;
2347         rbd_assert(orig_request);
2348         rbd_assert(orig_request->img_request);
2349
2350         result = obj_request->result;
2351         obj_request->result = 0;
2352
2353         dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2354                 obj_request, orig_request, result,
2355                 obj_request->xferred, obj_request->length);
2356         rbd_obj_request_put(obj_request);
2357
2361         /*
2362          * Our only purpose here is to determine whether the object
2363          * exists, and we don't want to treat the non-existence as
2364          * an error.  If something else comes back, transfer the
2365          * error to the original request and complete it now.
2366          */
2367         if (!result) {
2368                 obj_request_existence_set(orig_request, true);
2369         } else if (result == -ENOENT) {
2370                 obj_request_existence_set(orig_request, false);
2371         } else {
2372                 orig_request->result = result;
2373                 goto out;
2374         }
2375
2376         /*
2377          * Resubmit the original request now that we have recorded
2378          * whether the target object exists.
2379          */
2380         orig_request->result = rbd_img_obj_request_submit(orig_request);
2381 out:
2382         if (orig_request->result)
2383                 rbd_obj_request_complete(orig_request);
2384         rbd_obj_request_put(orig_request);
2385 }
2386
2387 static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2388 {
2389         struct rbd_obj_request *stat_request;
2390         struct rbd_device *rbd_dev;
2391         struct ceph_osd_client *osdc;
2392         struct page **pages = NULL;
2393         u32 page_count;
2394         size_t size;
2395         int ret;
2396
2397         /*
2398          * The response data for a STAT call consists of:
2399          *     le64 length;
2400          *     struct {
2401          *         le32 tv_sec;
2402          *         le32 tv_nsec;
2403          *     } mtime;
2404          */
2405         size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2406         page_count = (u32)calc_pages_for(0, size);
2407         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2408         if (IS_ERR(pages))
2409                 return PTR_ERR(pages);
2410
2411         ret = -ENOMEM;
2412         stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2413                                                         OBJ_REQUEST_PAGES);
2414         if (!stat_request)
2415                 goto out;
2416
2417         rbd_obj_request_get(obj_request);
2418         stat_request->obj_request = obj_request;
2419         stat_request->pages = pages;
2420         stat_request->page_count = page_count;
2421
2422         rbd_assert(obj_request->img_request);
2423         rbd_dev = obj_request->img_request->rbd_dev;
2424         stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2425                                                 stat_request);
2426         if (!stat_request->osd_req)
2427                 goto out;
2428         stat_request->callback = rbd_img_obj_exists_callback;
2429
2430         osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2431         osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2432                                         false, false);
2433         rbd_osd_req_format_read(stat_request);
2434
2435         osdc = &rbd_dev->rbd_client->client->osdc;
2436         ret = rbd_obj_request_submit(osdc, stat_request);
2437 out:
2438         if (ret)
2439                 rbd_obj_request_put(obj_request);
2440
2441         return ret;
2442 }
2443
2444 static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2445 {
2446         struct rbd_img_request *img_request;
2447         struct rbd_device *rbd_dev;
2448         bool known;
2449
2450         rbd_assert(obj_request_img_data_test(obj_request));
2451
2452         img_request = obj_request->img_request;
2453         rbd_assert(img_request);
2454         rbd_dev = img_request->rbd_dev;
2455
2456         /*
2457          * Only writes to layered images need special handling.
2458          * Reads and non-layered writes are simple object requests.
2459          * Layered writes that start beyond the end of the overlap
2460          * with the parent have no parent data, so they too are
2461          * simple object requests.  Finally, if the target object is
2462          * known to already exist, its parent data has already been
2463          * copied, so a write to the object can also be handled as a
2464          * simple object request.
2465          */
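        /*
         * In table form (note that "known" below is assigned only when
         * the earlier alternatives are all false, which is exactly when
         * it is consulted):
         *
         *      object request                          handling
         *      ------------------------------------    --------------------
         *      read, or non-layered write              submit directly
         *      layered write beyond the overlap        submit directly
         *      layered write, target known to exist    submit directly
         *      layered write, target known missing     parent read + copyup
         *      layered write, existence unknown        STAT, then resubmit
         */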
2466         if (!img_request_write_test(img_request) ||
2467                 !img_request_layered_test(img_request) ||
2468                 rbd_dev->parent_overlap <= obj_request->img_offset ||
2469                 ((known = obj_request_known_test(obj_request)) &&
2470                         obj_request_exists_test(obj_request))) {
2471
2472                 struct rbd_device *rbd_dev;
2473                 struct ceph_osd_client *osdc;
2474
2475                 rbd_dev = obj_request->img_request->rbd_dev;
2476                 osdc = &rbd_dev->rbd_client->client->osdc;
2477
2478                 return rbd_obj_request_submit(osdc, obj_request);
2479         }
2480
2481         /*
2482          * It's a layered write.  The target object might exist but
2483          * we may not know that yet.  If we know it doesn't exist,
2484          * start by reading the data for the full target object from
2485          * the parent so we can use it for a copyup to the target.
2486          */
2487         if (known)
2488                 return rbd_img_obj_parent_read_full(obj_request);
2489
2490         /* We don't know whether the target exists.  Go find out. */
2491
2492         return rbd_img_obj_exists_submit(obj_request);
2493 }
2494
2495 static int rbd_img_request_submit(struct rbd_img_request *img_request)
2496 {
2497         struct rbd_obj_request *obj_request;
2498         struct rbd_obj_request *next_obj_request;
2499
2500         dout("%s: img %p\n", __func__, img_request);
2501         for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2502                 int ret;
2503
2504                 ret = rbd_img_obj_request_submit(obj_request);
2505                 if (ret)
2506                         return ret;
2507         }
2508
2509         return 0;
2510 }
2511
2512 static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
2513 {
2514         struct rbd_obj_request *obj_request;
2515         struct rbd_device *rbd_dev;
2516         u64 obj_end;
2517
2518         rbd_assert(img_request_child_test(img_request));
2519
2520         obj_request = img_request->obj_request;
2521         rbd_assert(obj_request);
2522         rbd_assert(obj_request->img_request);
2523
2524         obj_request->result = img_request->result;
2525         if (obj_request->result)
2526                 goto out;
2527
2528         /*
2529          * We need to zero anything beyond the parent overlap
2530          * boundary.  Since rbd_img_obj_request_read_callback()
2531          * will zero anything beyond the end of a short read, an
2532          * easy way to do this is to pretend the data from the
2533          * parent came up short--ending at the overlap boundary.
2534          */
2535         rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2536         obj_end = obj_request->img_offset + obj_request->length;
2537         rbd_dev = obj_request->img_request->rbd_dev;
2538         if (obj_end > rbd_dev->parent_overlap) {
2539                 u64 xferred = 0;
2540
2541                 if (obj_request->img_offset < rbd_dev->parent_overlap)
2542                         xferred = rbd_dev->parent_overlap -
2543                                         obj_request->img_offset;
2544
2545                 obj_request->xferred = min(img_request->xferred, xferred);
2546         } else {
2547                 obj_request->xferred = img_request->xferred;
2548         }
2549 out:
2550         rbd_img_request_put(img_request);
2551         rbd_img_obj_request_read_callback(obj_request);
2552         rbd_obj_request_complete(obj_request);
2553 }
2554
2555 static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
2556 {
2557         struct rbd_device *rbd_dev;
2558         struct rbd_img_request *img_request;
2559         int result;
2560
2561         rbd_assert(obj_request_img_data_test(obj_request));
2562         rbd_assert(obj_request->img_request != NULL);
2563         rbd_assert(obj_request->result == (s32) -ENOENT);
2564         rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2565
2566         rbd_dev = obj_request->img_request->rbd_dev;
2567         rbd_assert(rbd_dev->parent != NULL);
2569         img_request = rbd_img_request_create(rbd_dev->parent,
2570                                                 obj_request->img_offset,
2571                                                 obj_request->length,
2572                                                 false, true);
2573         result = -ENOMEM;
2574         if (!img_request)
2575                 goto out_err;
2576
2577         rbd_obj_request_get(obj_request);
2578         img_request->obj_request = obj_request;
2579
2580         result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2581                                         obj_request->bio_list);
2582         if (result)
2583                 goto out_err;
2584
2585         img_request->callback = rbd_img_parent_read_callback;
2586         result = rbd_img_request_submit(img_request);
2587         if (result)
2588                 goto out_err;
2589
2590         return;
2591 out_err:
2592         if (img_request)
2593                 rbd_img_request_put(img_request);
2594         obj_request->result = result;
2595         obj_request->xferred = 0;
2596         obj_request_done_set(obj_request);
2597 }
2598
2599 static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id)
2600 {
2601         struct rbd_obj_request *obj_request;
2602         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2603         int ret;
2604
2605         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2606                                                         OBJ_REQUEST_NODATA);
2607         if (!obj_request)
2608                 return -ENOMEM;
2609
2610         ret = -ENOMEM;
2611         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2612         if (!obj_request->osd_req)
2613                 goto out;
2614         obj_request->callback = rbd_obj_request_put;
2615
2616         osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2617                                         notify_id, 0, 0);
2618         rbd_osd_req_format_read(obj_request);
2619
2620         ret = rbd_obj_request_submit(osdc, obj_request);
2621 out:
2622         if (ret)
2623                 rbd_obj_request_put(obj_request);
2624
2625         return ret;
2626 }
2627
2628 static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2629 {
2630         struct rbd_device *rbd_dev = (struct rbd_device *)data;
2631         int ret;
2632
2633         if (!rbd_dev)
2634                 return;
2635
2636         dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2637                 rbd_dev->header_name, (unsigned long long)notify_id,
2638                 (unsigned int)opcode);
2639         ret = rbd_dev_refresh(rbd_dev);
2640         if (ret)
2641                 rbd_warn(rbd_dev, ": header refresh error (%d)\n", ret);
2642
2643         rbd_obj_notify_ack(rbd_dev, notify_id);
2644 }
2645
2646 /*
2647  * Request sync osd watch/unwatch.  The value of "start" determines
2648  * whether a watch request is being initiated or torn down.
2649  */
2650 static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
2651 {
2652         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2653         struct rbd_obj_request *obj_request;
2654         int ret;
2655
2656         rbd_assert(start ^ !!rbd_dev->watch_event);
2657         rbd_assert(start ^ !!rbd_dev->watch_request);
2658
2659         if (start) {
2660                 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
2661                                                 &rbd_dev->watch_event);
2662                 if (ret < 0)
2663                         return ret;
2664                 rbd_assert(rbd_dev->watch_event != NULL);
2665         }
2666
2667         ret = -ENOMEM;
2668         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2669                                                         OBJ_REQUEST_NODATA);
2670         if (!obj_request)
2671                 goto out_cancel;
2672
2673         obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2674         if (!obj_request->osd_req)
2675                 goto out_cancel;
2676
2677         if (start)
2678                 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
2679         else
2680                 ceph_osdc_unregister_linger_request(osdc,
2681                                         rbd_dev->watch_request->osd_req);
2682
2683         osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
2684                                 rbd_dev->watch_event->cookie, 0, start);
2685         rbd_osd_req_format_write(obj_request);
2686
2687         ret = rbd_obj_request_submit(osdc, obj_request);
2688         if (ret)
2689                 goto out_cancel;
2690         ret = rbd_obj_request_wait(obj_request);
2691         if (ret)
2692                 goto out_cancel;
2693         ret = obj_request->result;
2694         if (ret)
2695                 goto out_cancel;
2696
2697         /*
2698          * A watch request is set to linger, so the underlying osd
2699          * request won't go away until we unregister it.  We retain
2700          * a pointer to the object request during that time (in
2701          * rbd_dev->watch_request), so we'll keep a reference to
2702          * it.  We'll drop that reference (below) after we've
2703          * unregistered it.
2704          */
2705         if (start) {
2706                 rbd_dev->watch_request = obj_request;
2707
2708                 return 0;
2709         }
2710
2711         /* We have successfully torn down the watch request */
2712
2713         rbd_obj_request_put(rbd_dev->watch_request);
2714         rbd_dev->watch_request = NULL;
2715 out_cancel:
2716         /* Cancel the event if we're tearing down, or on error */
2717         ceph_osdc_cancel_event(rbd_dev->watch_event);
2718         rbd_dev->watch_event = NULL;
2719         if (obj_request)
2720                 rbd_obj_request_put(obj_request);
2721
2722         return ret;
2723 }
2724
2725 /*
2726  * Synchronous osd object method call.  Returns the number of bytes
2727  * returned in the outbound buffer, or a negative error code.
2728  * returned in the inbound buffer, or a negative error code.
2729 static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
2730                              const char *object_name,
2731                              const char *class_name,
2732                              const char *method_name,
2733                              const void *outbound,
2734                              size_t outbound_size,
2735                              void *inbound,
2736                              size_t inbound_size)
2737 {
2738         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2739         struct rbd_obj_request *obj_request;
2740         struct page **pages;
2741         u32 page_count;
2742         int ret;
2743
2744         /*
2745          * Method calls are ultimately read operations.  The result
2746          * should be placed into the inbound buffer provided.  They
2747          * also supply outbound data--parameters for the object
2748          * method.  Currently if this is present it will be a
2749          * snapshot id.
2750          */
2751         page_count = (u32)calc_pages_for(0, inbound_size);
2752         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2753         if (IS_ERR(pages))
2754                 return PTR_ERR(pages);
2755
2756         ret = -ENOMEM;
2757         obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
2758                                                         OBJ_REQUEST_PAGES);
2759         if (!obj_request)
2760                 goto out;
2761
2762         obj_request->pages = pages;
2763         obj_request->page_count = page_count;
2764
2765         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2766         if (!obj_request->osd_req)
2767                 goto out;
2768
2769         osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
2770                                         class_name, method_name);
2771         if (outbound_size) {
2772                 struct ceph_pagelist *pagelist;
2773
2774                 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
2775                 if (!pagelist)
2776                         goto out;
2777
2778                 ceph_pagelist_init(pagelist);
2779                 ceph_pagelist_append(pagelist, outbound, outbound_size);
2780                 osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
2781                                                 pagelist);
2782         }
2783         osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2784                                         obj_request->pages, inbound_size,
2785                                         0, false, false);
2786         rbd_osd_req_format_read(obj_request);
2787
2788         ret = rbd_obj_request_submit(osdc, obj_request);
2789         if (ret)
2790                 goto out;
2791         ret = rbd_obj_request_wait(obj_request);
2792         if (ret)
2793                 goto out;
2794
2795         ret = obj_request->result;
2796         if (ret < 0)
2797                 goto out;
2798
2799         rbd_assert(obj_request->xferred < (u64)INT_MAX);
2800         ret = (int)obj_request->xferred;
2801         ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
2802 out:
2803         if (obj_request)
2804                 rbd_obj_request_put(obj_request);
2805         else
2806                 ceph_release_page_vector(pages, page_count);
2807
2808         return ret;
2809 }
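
/*
 * A sketch of typical use (the argument values are illustrative):
 * reading a format 2 image's object prefix might look like
 *
 *      ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
 *                      "rbd", "get_object_prefix",
 *                      NULL, 0, reply_buf, size);
 *
 * where a nonnegative return is the number of bytes placed in
 * reply_buf.
 */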
2810
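/*
 * The request_fn for the rbd device's block queue.  Entered with the
 * queue lock held; the lock is dropped while each image request is
 * built and submitted, then retaken before ending a request or
 * fetching the next, per the __releases/__acquires annotations.
 */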
2811 static void rbd_request_fn(struct request_queue *q)
2812                 __releases(q->queue_lock) __acquires(q->queue_lock)
2813 {
2814         struct rbd_device *rbd_dev = q->queuedata;
2815         bool read_only = rbd_dev->mapping.read_only;
2816         struct request *rq;
2817         int result;
2818
2819         while ((rq = blk_fetch_request(q))) {
2820                 bool write_request = rq_data_dir(rq) == WRITE;
2821                 struct rbd_img_request *img_request;
2822                 u64 offset;
2823                 u64 length;
2824
2825                 /* Ignore any non-FS requests that filter through. */
2826
2827                 if (rq->cmd_type != REQ_TYPE_FS) {
2828                         dout("%s: non-fs request type %d\n", __func__,
2829                                 (int) rq->cmd_type);
2830                         __blk_end_request_all(rq, 0);
2831                         continue;
2832                 }
2833
2834                 /* Ignore/skip any zero-length requests */
2835
2836                 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
2837                 length = (u64) blk_rq_bytes(rq);
2838
2839                 if (!length) {
2840                         dout("%s: zero-length request\n", __func__);
2841                         __blk_end_request_all(rq, 0);
2842                         continue;
2843                 }
2844
2845                 spin_unlock_irq(q->queue_lock);
2846
2847                 /* Disallow writes to a read-only device */
2848
2849                 if (write_request) {
2850                         result = -EROFS;
2851                         if (read_only)
2852                                 goto end_request;
2853                         rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2854                 }
2855
2856                 /*
2857                  * Quit early if the mapped snapshot no longer
2858                  * exists.  It's still possible the snapshot will
2859                  * have disappeared by the time our request arrives
2860                  * at the osd, but there's no sense in sending it if
2861                  * we already know.
2862                  */
2863                 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
2864                         dout("request for non-existent snapshot\n");
2865                         rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2866                         result = -ENXIO;
2867                         goto end_request;
2868                 }
2869
2870                 result = -EINVAL;
2871                 if (offset && length > U64_MAX - offset + 1) {
2872                         rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
2873                                 offset, length);
2874                         goto end_request;       /* Shouldn't happen */
2875                 }
2876
2877                 result = -EIO;
2878                 if (offset + length > rbd_dev->mapping.size) {
2879                         rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
2880                                 offset, length, rbd_dev->mapping.size);
2881                         goto end_request;
2882                 }
2883
2884                 result = -ENOMEM;
2885                 img_request = rbd_img_request_create(rbd_dev, offset, length,
2886                                                         write_request, false);
2887                 if (!img_request)
2888                         goto end_request;
2889
2890                 img_request->rq = rq;
2891
2892                 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2893                                                 rq->bio);
2894                 if (!result)
2895                         result = rbd_img_request_submit(img_request);
2896                 if (result)
2897                         rbd_img_request_put(img_request);
2898 end_request:
2899                 spin_lock_irq(q->queue_lock);
2900                 if (result < 0) {
2901                         rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
2902                                 write_request ? "write" : "read",
2903                                 length, offset, result);
2904
2905                         __blk_end_request_all(rq, result);
2906                 }
2907         }
2908 }
2909
2910 /*
2911  * A queue callback.  Makes sure that we don't create a bio that spans
2912  * multiple osd objects.  One exception is single-page bios, which we
2913  * handle later in bio_chain_clone_range().
2914  */
2915 static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2916                           struct bio_vec *bvec)
2917 {
2918         struct rbd_device *rbd_dev = q->queuedata;
2919         sector_t sector_offset;
2920         sector_t sectors_per_obj;
2921         sector_t obj_sector_offset;
2922         int ret;
2923
2924         /*
2925          * Find how far into its rbd object the bio's start sector
2926          * falls.  The bio sector is partition-relative, so offset
2927          * it by the partition start to make it device-relative.
2928          */
2929         sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2930         sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2931         obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2932
2933         /*
2934          * Compute the number of bytes from that offset to the end
2935          * of the object.  Account for what's already used by the bio.
2936          */
2937         ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2938         if (ret > bmd->bi_size)
2939                 ret -= bmd->bi_size;
2940         else
2941                 ret = 0;
2942
2943         /*
2944          * Don't send back more than was asked for.  And if the bio
2945          * was empty, let the whole thing through because:  "Note
2946          * that a block device *must* allow a single page to be
2947          * added to an empty bio."
2948          */
2949         rbd_assert(bvec->bv_len <= PAGE_SIZE);
2950         if (ret > (int) bvec->bv_len || !bmd->bi_size)
2951                 ret = (int) bvec->bv_len;
2952
2953         return ret;
2954 }
2955
2956 static void rbd_free_disk(struct rbd_device *rbd_dev)
2957 {
2958         struct gendisk *disk = rbd_dev->disk;
2959
2960         if (!disk)
2961                 return;
2962
2963         rbd_dev->disk = NULL;
2964         if (disk->flags & GENHD_FL_UP) {
2965                 del_gendisk(disk);
2966                 if (disk->queue)
2967                         blk_cleanup_queue(disk->queue);
2968         }
2969         put_disk(disk);
2970 }
2971
2972 static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2973                                 const char *object_name,
2974                                 u64 offset, u64 length, void *buf)
2975
2976 {
2977         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2978         struct rbd_obj_request *obj_request;
2979         struct page **pages = NULL;
2980         u32 page_count;
2981         size_t size;
2982         int ret;
2983
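        /*
         * calc_pages_for() rounds the byte range out to whole pages;
         * e.g. an offset of 512 with length 8192 touches three 4 KiB
         * pages.
         */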
2984         page_count = (u32) calc_pages_for(offset, length);
2985         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2986         if (IS_ERR(pages))
2987                 return PTR_ERR(pages);
2988
2989         ret = -ENOMEM;
2990         obj_request = rbd_obj_request_create(object_name, offset, length,
2991                                                         OBJ_REQUEST_PAGES);
2992         if (!obj_request)
2993                 goto out;
2994
2995         obj_request->pages = pages;
2996         obj_request->page_count = page_count;
2997
2998         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2999         if (!obj_request->osd_req)
3000                 goto out;
3001
3002         osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
3003                                         offset, length, 0, 0);
3004         osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
3005                                         obj_request->pages,
3006                                         obj_request->length,
3007                                         obj_request->offset & ~PAGE_MASK,
3008                                         false, false);
3009         rbd_osd_req_format_read(obj_request);
3010
3011         ret = rbd_obj_request_submit(osdc, obj_request);
3012         if (ret)
3013                 goto out;
3014         ret = rbd_obj_request_wait(obj_request);
3015         if (ret)
3016                 goto out;
3017
3018         ret = obj_request->result;
3019         if (ret < 0)
3020                 goto out;
3021
3022         rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
3023         size = (size_t) obj_request->xferred;
3024         ceph_copy_from_page_vector(pages, buf, 0, size);
3025         rbd_assert(size <= (size_t)INT_MAX);
3026         ret = (int)size;
3027 out:
3028         if (obj_request)
3029                 rbd_obj_request_put(obj_request);
3030         else
3031                 ceph_release_page_vector(pages, page_count);
3032
3033         return ret;
3034 }
3035
3036 /*
3037  * Read the complete header for the given rbd device.
3038  *
3039  * Returns a pointer to a dynamically-allocated buffer containing
3040  * the complete and validated header.  The returned header includes
3041  * the snapshot ids and names current at the time it was read; the
3042  * caller is responsible for freeing it with kfree().
3043  *
3044  * Returns a pointer-coded errno if a failure occurs.
3045  */
3046 static struct rbd_image_header_ondisk *
3047 rbd_dev_v1_header_read(struct rbd_device *rbd_dev)
3048 {
3049         struct rbd_image_header_ondisk *ondisk = NULL;
3050         u32 snap_count = 0;
3051         u64 names_size = 0;
3052         u32 want_count;
3053         int ret;
3054
3055         /*
3056          * The complete header will include an array of its 64-bit
3057          * snapshot ids, followed by the names of those snapshots as
3058          * a contiguous block of NUL-terminated strings.  Note that
3059          * the number of snapshots could change by the time we read
3060          * it in, in which case we re-read it.
3061          */
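        /*
         * The first pass therefore reads just sizeof (*ondisk) (both
         * counts start out zero); the counts it returns size the next
         * pass, and we loop until two consecutive reads agree on the
         * snapshot count.
         */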
3062         do {
3063                 size_t size;
3064
3065                 kfree(ondisk);
3066
3067                 size = sizeof (*ondisk);
3068                 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
3069                 size += names_size;
3070                 ondisk = kmalloc(size, GFP_KERNEL);
3071                 if (!ondisk)
3072                         return ERR_PTR(-ENOMEM);
3073
3074                 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
3075                                        0, size, ondisk);
3076                 if (ret < 0)
3077                         goto out_err;
3078                 if ((size_t)ret < size) {
3079                         rbd_warn(rbd_dev, "short header read (want %zd got %d)",
3080                                 size, ret);
3081                         ret = -ENXIO;
3082                         goto out_err;
3083                 }
3084                 if (!rbd_dev_ondisk_valid(ondisk)) {
3085                         ret = -ENXIO;
3086                         rbd_warn(rbd_dev, "invalid header");
3087                         goto out_err;
3088                 }
3089
3090                 names_size = le64_to_cpu(ondisk->snap_names_len);
3091                 want_count = snap_count;
3092                 snap_count = le32_to_cpu(ondisk->snap_count);
3093         } while (snap_count != want_count);
3094
3095         return ondisk;
3096
3097 out_err:
3098         kfree(ondisk);
3099
3100         return ERR_PTR(ret);
3101 }
3102
3103 /*
3104  * Read the complete on-disk header and convert it to the in-memory form.
3105  */
3106 static int rbd_read_header(struct rbd_device *rbd_dev,
3107                            struct rbd_image_header *header)
3108 {
3109         struct rbd_image_header_ondisk *ondisk;
3110         int ret;
3111
3112         ondisk = rbd_dev_v1_header_read(rbd_dev);
3113         if (IS_ERR(ondisk))
3114                 return PTR_ERR(ondisk);
3115         ret = rbd_header_from_disk(header, ondisk);
3116         kfree(ondisk);
3117
3118         return ret;
3119 }
3120
3121 /*
3122  * Re-read the on-disk header and update the in-memory image metadata.
3123  */
3124 static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev)
3125 {
3126         int ret;
3127         struct rbd_image_header h;
3128
3129         ret = rbd_read_header(rbd_dev, &h);
3130         if (ret < 0)
3131                 return ret;
3132
3133         down_write(&rbd_dev->header_rwsem);
3134
3135         /* Update image size, and check for resize of mapped image */
3136         rbd_dev->header.image_size = h.image_size;
3137         if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
3138                 if (rbd_dev->mapping.size != rbd_dev->header.image_size)
3139                         rbd_dev->mapping.size = rbd_dev->header.image_size;
3140
3141         /* rbd_dev->header.object_prefix shouldn't change */
3142         kfree(rbd_dev->header.snap_sizes);
3143         kfree(rbd_dev->header.snap_names);
3144         /* osd requests may still refer to snapc */
3145         ceph_put_snap_context(rbd_dev->header.snapc);
3146
3148         rbd_dev->header.snapc = h.snapc;
3149         rbd_dev->header.snap_names = h.snap_names;
3150         rbd_dev->header.snap_sizes = h.snap_sizes;
3151         /* Free the extra copy of the object prefix */
3152         if (strcmp(rbd_dev->header.object_prefix, h.object_prefix))
3153                 rbd_warn(rbd_dev, "object prefix changed (ignoring)");
3154         kfree(h.object_prefix);
3155
3156         up_write(&rbd_dev->header_rwsem);
3157
3158         return ret;
3159 }
3160
3161 /*
3162  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
3163  * has disappeared from the (just updated) snapshot context.
3164  */
3165 static void rbd_exists_validate(struct rbd_device *rbd_dev)
3166 {
3167         u64 snap_id;
3168
3169         if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
3170                 return;
3171
3172         snap_id = rbd_dev->spec->snap_id;
3173         if (snap_id == CEPH_NOSNAP)
3174                 return;
3175
3176         if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
3177                 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3178 }
3179
3180 static int rbd_dev_refresh(struct rbd_device *rbd_dev)
3181 {
3182         u64 mapping_size;
3183         int ret;
3184
3185         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3186         mapping_size = rbd_dev->mapping.size;
3187         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3188         if (rbd_dev->image_format == 1)
3189                 ret = rbd_dev_v1_refresh(rbd_dev);
3190         else
3191                 ret = rbd_dev_v2_refresh(rbd_dev);
3192
3193         /* If it's a mapped snapshot, validate its EXISTS flag */
3194
3195         rbd_exists_validate(rbd_dev);
3196         mutex_unlock(&ctl_mutex);
3197         if (mapping_size != rbd_dev->mapping.size) {
3198                 sector_t size;
3199
3200                 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
3201                 dout("setting size to %llu sectors", (unsigned long long)size);
3202                 set_capacity(rbd_dev->disk, size);
3203                 revalidate_disk(rbd_dev->disk);
3204         }
3205
3206         return ret;
3207 }
3208
3209 static int rbd_init_disk(struct rbd_device *rbd_dev)
3210 {
3211         struct gendisk *disk;
3212         struct request_queue *q;
3213         u64 segment_size;
3214
3215         /* create gendisk info */
3216         disk = alloc_disk(RBD_MINORS_PER_MAJOR);
3217         if (!disk)
3218                 return -ENOMEM;
3219
3220         snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3221                  rbd_dev->dev_id);
3222         disk->major = rbd_dev->major;
3223         disk->first_minor = 0;
3224         disk->fops = &rbd_bd_ops;
3225         disk->private_data = rbd_dev;
3226
3227         q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
3228         if (!q)
3229                 goto out_disk;
3230
3231         /* We use the default size, but let's be explicit about it. */
3232         blk_queue_physical_block_size(q, SECTOR_SIZE);
3233
3234         /* set io sizes to object size */
3235         segment_size = rbd_obj_bytes(&rbd_dev->header);
3236         blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3237         blk_queue_max_segment_size(q, segment_size);
3238         blk_queue_io_min(q, segment_size);
3239         blk_queue_io_opt(q, segment_size);
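        /*
         * With the usual rbd default object order of 22, segment_size
         * is 4 MiB, so requests are capped at 8192 sectors and the
         * I/O hints line up with object boundaries.
         */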
3240
3241         blk_queue_merge_bvec(q, rbd_merge_bvec);
3242         disk->queue = q;
3243
3244         q->queuedata = rbd_dev;
3245
3246         rbd_dev->disk = disk;
3247
3248         return 0;
3249 out_disk:
3250         put_disk(disk);
3251
3252         return -ENOMEM;
3253 }
3254
3255 /*
3256  * sysfs
3257  */
3258
3259 static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3260 {
3261         return container_of(dev, struct rbd_device, dev);
3262 }
3263
3264 static ssize_t rbd_size_show(struct device *dev,
3265                              struct device_attribute *attr, char *buf)
3266 {
3267         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3268
3269         return sprintf(buf, "%llu\n",
3270                 (unsigned long long)rbd_dev->mapping.size);
3271 }
3272
3273 /*
3274  * Note this shows the features for whatever's mapped, which is not
3275  * necessarily the base image.
3276  */
3277 static ssize_t rbd_features_show(struct device *dev,
3278                              struct device_attribute *attr, char *buf)
3279 {
3280         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3281
3282         return sprintf(buf, "0x%016llx\n",
3283                         (unsigned long long)rbd_dev->mapping.features);
3284 }
3285
3286 static ssize_t rbd_major_show(struct device *dev,
3287                               struct device_attribute *attr, char *buf)
3288 {
3289         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3290
3291         if (rbd_dev->major)
3292                 return sprintf(buf, "%d\n", rbd_dev->major);
3293
3294         return sprintf(buf, "(none)\n");
3296 }
3297
3298 static ssize_t rbd_client_id_show(struct device *dev,
3299                                   struct device_attribute *attr, char *buf)
3300 {
3301         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3302
3303         return sprintf(buf, "client%lld\n",
3304                         ceph_client_id(rbd_dev->rbd_client->client));
3305 }
3306
3307 static ssize_t rbd_pool_show(struct device *dev,
3308                              struct device_attribute *attr, char *buf)
3309 {
3310         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3311
3312         return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3313 }
3314
3315 static ssize_t rbd_pool_id_show(struct device *dev,
3316                              struct device_attribute *attr, char *buf)
3317 {
3318         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3319
3320         return sprintf(buf, "%llu\n",
3321                         (unsigned long long) rbd_dev->spec->pool_id);
3322 }
3323
3324 static ssize_t rbd_name_show(struct device *dev,
3325                              struct device_attribute *attr, char *buf)
3326 {
3327         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3328
3329         if (rbd_dev->spec->image_name)
3330                 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3331
3332         return sprintf(buf, "(unknown)\n");
3333 }
3334
3335 static ssize_t rbd_image_id_show(struct device *dev,
3336                              struct device_attribute *attr, char *buf)
3337 {
3338         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3339
3340         return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3341 }
3342
3343 /*
3344  * Shows the name of the currently-mapped snapshot (or
3345  * RBD_SNAP_HEAD_NAME for the base image).
3346  */
3347 static ssize_t rbd_snap_show(struct device *dev,
3348                              struct device_attribute *attr,
3349                              char *buf)
3350 {
3351         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3352
3353         return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3354 }
3355
3356 /*
3357  * For an rbd v2 image, shows the id and name of the pool, image,
3358  * and snapshot of the parent image, plus the parent overlap.  If
3359  * there is no parent, simply shows "(no parent image)".
3360  */
3361 static ssize_t rbd_parent_show(struct device *dev,
3362                              struct device_attribute *attr,
3363                              char *buf)
3364 {
3365         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3366         struct rbd_spec *spec = rbd_dev->parent_spec;
3367         int count;
3368         char *bufp = buf;
3369
3370         if (!spec)
3371                 return sprintf(buf, "(no parent image)\n");
3372
3373         count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
3374                         (unsigned long long) spec->pool_id, spec->pool_name);
3375         if (count < 0)
3376                 return count;
3377         bufp += count;
3378
3379         count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
3380                         spec->image_name ? spec->image_name : "(unknown)");
3381         if (count < 0)
3382                 return count;
3383         bufp += count;
3384
3385         count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
3386                         (unsigned long long) spec->snap_id, spec->snap_name);
3387         if (count < 0)
3388                 return count;
3389         bufp += count;
3390
3391         count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
3392         if (count < 0)
3393                 return count;
3394         bufp += count;
3395
3396         return (ssize_t) (bufp - buf);
3397 }
3398
3399 static ssize_t rbd_image_refresh(struct device *dev,
3400                                  struct device_attribute *attr,
3401                                  const char *buf,
3402                                  size_t size)
3403 {
3404         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3405         int ret;
3406
3407         ret = rbd_dev_refresh(rbd_dev);
3408         if (ret)
3409                 rbd_warn(rbd_dev, "manual header refresh error (%d)\n", ret);
3410
3411         return ret < 0 ? ret : size;
3412 }
3413
3414 static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
3415 static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
3416 static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
3417 static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
3418 static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
3419 static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
3420 static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
3421 static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
3422 static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3423 static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
3424 static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
3425
3426 static struct attribute *rbd_attrs[] = {
3427         &dev_attr_size.attr,
3428         &dev_attr_features.attr,
3429         &dev_attr_major.attr,
3430         &dev_attr_client_id.attr,
3431         &dev_attr_pool.attr,
3432         &dev_attr_pool_id.attr,
3433         &dev_attr_name.attr,
3434         &dev_attr_image_id.attr,
3435         &dev_attr_current_snap.attr,
3436         &dev_attr_parent.attr,
3437         &dev_attr_refresh.attr,
3438         NULL
3439 };
3440
3441 static struct attribute_group rbd_attr_group = {
3442         .attrs = rbd_attrs,
3443 };
3444
3445 static const struct attribute_group *rbd_attr_groups[] = {
3446         &rbd_attr_group,
3447         NULL
3448 };
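
/*
 * These attributes appear under the rbd bus in sysfs, e.g. for a
 * hypothetical device with id 0 mapped from a pool named "rbd":
 *
 *   $ cat /sys/bus/rbd/devices/0/pool
 *   rbd
 */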
3449
3450 static void rbd_sysfs_dev_release(struct device *dev)
3451 {
3452 }
3453
3454 static struct device_type rbd_device_type = {
3455         .name           = "rbd",
3456         .groups         = rbd_attr_groups,
3457         .release        = rbd_sysfs_dev_release,
3458 };
3459
3460 static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
3461 {
3462         kref_get(&spec->kref);
3463
3464         return spec;
3465 }
3466
3467 static void rbd_spec_free(struct kref *kref);
3468 static void rbd_spec_put(struct rbd_spec *spec)
3469 {
3470         if (spec)
3471                 kref_put(&spec->kref, rbd_spec_free);
3472 }
3473
3474 static struct rbd_spec *rbd_spec_alloc(void)
3475 {
3476         struct rbd_spec *spec;
3477
3478         spec = kzalloc(sizeof (*spec), GFP_KERNEL);
3479         if (!spec)
3480                 return NULL;
3481         kref_init(&spec->kref);
3482
3483         return spec;
3484 }
3485
3486 static void rbd_spec_free(struct kref *kref)
3487 {
3488         struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
3489
3490         kfree(spec->pool_name);
3491         kfree(spec->image_id);
3492         kfree(spec->image_name);
3493         kfree(spec->snap_name);
3494         kfree(spec);
3495 }
3496
3497 static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3498                                 struct rbd_spec *spec)
3499 {
3500         struct rbd_device *rbd_dev;
3501
3502         rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3503         if (!rbd_dev)
3504                 return NULL;
3505
3506         spin_lock_init(&rbd_dev->lock);
3507         rbd_dev->flags = 0;
3508         INIT_LIST_HEAD(&rbd_dev->node);
3509         init_rwsem(&rbd_dev->header_rwsem);
3510
3511         rbd_dev->spec = spec;
3512         rbd_dev->rbd_client = rbdc;
3513
3514         /* Initialize the layout used for all rbd requests */
3515
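        /*
         * A stripe unit equal to the object size with a stripe count
         * of 1 describes trivial striping: each stripe is a whole
         * object.
         */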
3516         rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3517         rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
3518         rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3519         rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
3520
3521         return rbd_dev;
3522 }
3523
3524 static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3525 {
3526         rbd_put_client(rbd_dev->rbd_client);
3527         rbd_spec_put(rbd_dev->spec);
3528         kfree(rbd_dev);
3529 }
3530
3531 /*
3532  * Get the size and object order for an image snapshot, or if
3533  * snap_id is CEPH_NOSNAP, gets this information for the base
3534  * image.
3535  */
3536 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
3537                                 u8 *order, u64 *snap_size)
3538 {
3539         __le64 snapid = cpu_to_le64(snap_id);
3540         int ret;
3541         struct {
3542                 u8 order;
3543                 __le64 size;
3544         } __attribute__ ((packed)) size_buf = { 0 };
3545
3546         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3547                                 "rbd", "get_size",
3548                                 &snapid, sizeof (snapid),
3549                                 &size_buf, sizeof (size_buf));
3550         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3551         if (ret < 0)
3552                 return ret;
3553         if (ret < sizeof (size_buf))
3554                 return -ERANGE;
3555
3556         if (order)
3557                 *order = size_buf.order;
3558         *snap_size = le64_to_cpu(size_buf.size);
3559
3560         dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
3561                 (unsigned long long)snap_id, (unsigned int)*order,
3562                 (unsigned long long)*snap_size);
3563
3564         return 0;
3565 }
3566
3567 static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
3568 {
3569         return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
3570                                         &rbd_dev->header.obj_order,
3571                                         &rbd_dev->header.image_size);
3572 }
3573
3574 static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
3575 {
3576         void *reply_buf;
3577         int ret;
3578         void *p;
3579
3580         reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
3581         if (!reply_buf)
3582                 return -ENOMEM;
3583
3584         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3585                                 "rbd", "get_object_prefix", NULL, 0,
3586                                 reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
3587         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3588         if (ret < 0)
3589                 goto out;
3590
3591         p = reply_buf;
3592         rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
3593                                                 p + ret, NULL, GFP_NOIO);
3594         ret = 0;
3595
3596         if (IS_ERR(rbd_dev->header.object_prefix)) {
3597                 ret = PTR_ERR(rbd_dev->header.object_prefix);
3598                 rbd_dev->header.object_prefix = NULL;
3599         } else {
3600                 dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
3601         }
3602 out:
3603         kfree(reply_buf);
3604
3605         return ret;
3606 }
3607
3608 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3609                 u64 *snap_features)
3610 {
3611         __le64 snapid = cpu_to_le64(snap_id);
3612         struct {
3613                 __le64 features;
3614                 __le64 incompat;
3615         } __attribute__ ((packed)) features_buf = { 0 };
3616         u64 incompat;
3617         int ret;
3618
3619         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3620                                 "rbd", "get_features",
3621                                 &snapid, sizeof (snapid),
3622                                 &features_buf, sizeof (features_buf));
3623         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3624         if (ret < 0)
3625                 return ret;
3626         if (ret < sizeof (features_buf))
3627                 return -ERANGE;
3628
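        /*
         * "incompat" names the features a client must understand in
         * order to use the image at all; refuse the mapping if it
         * includes anything this driver doesn't support.
         */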
3629         incompat = le64_to_cpu(features_buf.incompat);
3630         if (incompat & ~RBD_FEATURES_SUPPORTED)
3631                 return -ENXIO;
3632
3633         *snap_features = le64_to_cpu(features_buf.features);
3634
3635         dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3636                 (unsigned long long)snap_id,
3637                 (unsigned long long)*snap_features,
3638                 (unsigned long long)le64_to_cpu(features_buf.incompat));
3639
3640         return 0;
3641 }
3642
3643 static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3644 {
3645         return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3646                                                 &rbd_dev->header.features);
3647 }
3648
3649 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
3650 {
3651         struct rbd_spec *parent_spec;
3652         size_t size;
3653         void *reply_buf = NULL;
3654         __le64 snapid;
3655         void *p;
3656         void *end;
3657         char *image_id;
3658         u64 overlap;
3659         int ret;
3660
3661         parent_spec = rbd_spec_alloc();
3662         if (!parent_spec)
3663                 return -ENOMEM;
3664
3665         size = sizeof (__le64) +                                /* pool_id */
3666                 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +        /* image_id */
3667                 sizeof (__le64) +                               /* snap_id */
3668                 sizeof (__le64);                                /* overlap */
3669         reply_buf = kmalloc(size, GFP_KERNEL);
3670         if (!reply_buf) {
3671                 ret = -ENOMEM;
3672                 goto out_err;
3673         }
3674
3675         snapid = cpu_to_le64(CEPH_NOSNAP);
3676         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3677                                 "rbd", "get_parent",
3678                                 &snapid, sizeof (snapid),
3679                                 reply_buf, size);
3680         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3681         if (ret < 0)
3682                 goto out_err;
3683
3684         p = reply_buf;
3685         end = reply_buf + ret;
3686         ret = -ERANGE;
3687         ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
3688         if (parent_spec->pool_id == CEPH_NOPOOL)
3689                 goto out;       /* No parent?  No problem. */
3690
3691         /* The ceph file layout needs to fit pool id in 32 bits */
3692
3693         ret = -EIO;
3694         if (parent_spec->pool_id > (u64)U32_MAX) {
3695                 rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
3696                         (unsigned long long)parent_spec->pool_id, U32_MAX);
3697                 goto out_err;
3698         }
3699
3700         image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3701         if (IS_ERR(image_id)) {
3702                 ret = PTR_ERR(image_id);
3703                 goto out_err;
3704         }
3705         parent_spec->image_id = image_id;
3706         ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
3707         ceph_decode_64_safe(&p, end, overlap, out_err);
3708
3709         rbd_dev->parent_overlap = overlap;
3710         rbd_dev->parent_spec = parent_spec;
3711         parent_spec = NULL;     /* rbd_dev now owns this */
3712 out:
3713         ret = 0;
3714 out_err:
3715         kfree(reply_buf);
3716         rbd_spec_put(parent_spec);
3717
3718         return ret;
3719 }
3720
3721 static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3722 {
3723         struct {
3724                 __le64 stripe_unit;
3725                 __le64 stripe_count;
3726         } __attribute__ ((packed)) striping_info_buf = { 0 };
3727         size_t size = sizeof (striping_info_buf);
3728         void *p;
3729         u64 obj_size;
3730         u64 stripe_unit;
3731         u64 stripe_count;
3732         int ret;
3733
3734         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3735                                 "rbd", "get_stripe_unit_count", NULL, 0,
3736                                 (char *)&striping_info_buf, size);
3737         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3738         if (ret < 0)
3739                 return ret;
3740         if (ret < size)
3741                 return -ERANGE;
3742
3743         /*
3744          * We don't actually support the "fancy striping" feature
3745          * (STRIPINGV2) yet, but if the striping sizes are the
3746          * defaults the behavior is the same as before.  So find
3747          * out, and only fail if the image has non-default values.
3748          */
3749         ret = -EINVAL;
3750         obj_size = (u64)1 << rbd_dev->header.obj_order;
3751         p = &striping_info_buf;
3752         stripe_unit = ceph_decode_64(&p);
3753         if (stripe_unit != obj_size) {
3754                 rbd_warn(rbd_dev, "unsupported stripe unit "
3755                                 "(got %llu want %llu)",
3756                                 stripe_unit, obj_size);
3757                 return -EINVAL;
3758         }
3759         stripe_count = ceph_decode_64(&p);
3760         if (stripe_count != 1) {
3761                 rbd_warn(rbd_dev, "unsupported stripe count "
3762                                 "(got %llu want 1)", stripe_count);
3763                 return -EINVAL;
3764         }
3765         rbd_dev->header.stripe_unit = stripe_unit;
3766         rbd_dev->header.stripe_count = stripe_count;
3767
3768         return 0;
3769 }
3770
3771 static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3772 {
3773         size_t image_id_size;
3774         char *image_id;
3775         void *p;
3776         void *end;
3777         size_t size;
3778         void *reply_buf = NULL;
3779         size_t len = 0;
3780         char *image_name = NULL;
3781         int ret;
3782
3783         rbd_assert(!rbd_dev->spec->image_name);
3784
3785         len = strlen(rbd_dev->spec->image_id);
3786         image_id_size = sizeof (__le32) + len;
3787         image_id = kmalloc(image_id_size, GFP_KERNEL);
3788         if (!image_id)
3789                 return NULL;
3790
3791         p = image_id;
3792         end = image_id + image_id_size;
3793         ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
3794
3795         size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
3796         reply_buf = kmalloc(size, GFP_KERNEL);
3797         if (!reply_buf)
3798                 goto out;
3799
3800         ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
3801                                 "rbd", "dir_get_name",
3802                                 image_id, image_id_size,
3803                                 reply_buf, size);
3804         if (ret < 0)
3805                 goto out;
3806         p = reply_buf;
3807         end = reply_buf + ret;
3808
3809         image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
3810         if (IS_ERR(image_name))
3811                 image_name = NULL;
3812         else
3813                 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
3814 out:
3815         kfree(reply_buf);
3816         kfree(image_id);
3817
3818         return image_name;
3819 }
3820
3821 static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
3822 {
3823         struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3824         const char *snap_name;
3825         u32 which = 0;
3826
3827         /* Skip over names until we find the one we are looking for */
3828
3829         snap_name = rbd_dev->header.snap_names;
3830         while (which < snapc->num_snaps) {
3831                 if (!strcmp(name, snap_name))
3832                         return snapc->snaps[which];
3833                 snap_name += strlen(snap_name) + 1;
3834                 which++;
3835         }
3836         return CEPH_NOSNAP;
3837 }
3838
3839 static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
3840 {
3841         struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3842         u32 which;
3843         bool found = false;
3844         u64 snap_id;
3845
3846         for (which = 0; !found && which < snapc->num_snaps; which++) {
3847                 const char *snap_name;
3848
3849                 snap_id = snapc->snaps[which];
3850                 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
3851                 if (IS_ERR(snap_name))
3852                         break;
3853                 found = !strcmp(name, snap_name);
3854                 kfree(snap_name);
3855         }
3856         return found ? snap_id : CEPH_NOSNAP;
3857 }
3858
3859 /*
3860  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
3861  * no snapshot by that name is found, or if an error occurs.
3862  */
3863 static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
3864 {
3865         if (rbd_dev->image_format == 1)
3866                 return rbd_v1_snap_id_by_name(rbd_dev, name);
3867
3868         return rbd_v2_snap_id_by_name(rbd_dev, name);
3869 }
3870
3871 /*
3872  * When an rbd image has a parent image, it is identified by the
3873  * pool, image, and snapshot ids (not names).  This function fills
3874  * in the names for those ids.  (It's OK if we can't figure out the
3875  * name for an image id, but the pool and snapshot ids should always
3876  * exist and have names.)  All names in an rbd spec are dynamically
3877  * allocated.
3878  *
3879  * When an image being mapped (not a parent) is probed, we have the
3880  * pool name and pool id, image name and image id, and the snapshot
3881  * name.  The only thing we're missing is the snapshot id.
3882  */
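/*
 * For example (hypothetical names), an image mapped as pool "rbd",
 * image "foo", snapshot "-" needs only its snap_id filled in
 * (CEPH_NOSNAP for the head), while a parent arrives with only ids,
 * so its pool, image, and snapshot names are all looked up here.
 */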
3883 static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
3884 {
3885         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3886         struct rbd_spec *spec = rbd_dev->spec;
3887         const char *pool_name;
3888         const char *image_name;
3889         const char *snap_name;
3890         int ret;
3891
3892         /*
3893          * An image being mapped will have the pool name (etc.), but
3894          * we need to look up the snapshot id.
3895          */
3896         if (spec->pool_name) {
3897                 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
3898                         u64 snap_id;
3899
3900                         snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
3901                         if (snap_id == CEPH_NOSNAP)
3902                                 return -ENOENT;
3903                         spec->snap_id = snap_id;
3904                 } else {
3905                         spec->snap_id = CEPH_NOSNAP;
3906                 }
3907
3908                 return 0;
3909         }
3910
3911         /* Get the pool name; we have to make our own copy of this */
3912
3913         pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
3914         if (!pool_name) {
3915                 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
3916                 return -EIO;
3917         }
3918         pool_name = kstrdup(pool_name, GFP_KERNEL);
3919         if (!pool_name)
3920                 return -ENOMEM;
3921
3922         /* Fetch the image name; tolerate failure here */
3923
3924         image_name = rbd_dev_image_name(rbd_dev);
3925         if (!image_name)
3926                 rbd_warn(rbd_dev, "unable to get image name");
3927
3928         /* Look up the snapshot name, and make a copy */
3929
3930         snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
3931         if (!snap_name) {
3932                 ret = -ENOMEM;
3933                 goto out_err;
3934         }
3935
3936         spec->pool_name = pool_name;
3937         spec->image_name = image_name;
3938         spec->snap_name = snap_name;
3939
3940         return 0;
3941 out_err:
3942         kfree(image_name);
3943         kfree(pool_name);
3944
3945         return ret;
3946 }
3947
3948 static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
3949 {
3950         size_t size;
3951         int ret;
3952         void *reply_buf;
3953         void *p;
3954         void *end;
3955         u64 seq;
3956         u32 snap_count;
3957         struct ceph_snap_context *snapc;
3958         u32 i;
3959
3960         /*
3961          * We'll need room for the seq value (maximum snapshot id),
3962          * snapshot count, and array of that many snapshot ids.
3963          * For now we have a fixed upper limit on the number we're
3964          * prepared to receive.
3965          */
3966         size = sizeof (__le64) + sizeof (__le32) +
3967                         RBD_MAX_SNAP_COUNT * sizeof (__le64);
3968         reply_buf = kzalloc(size, GFP_KERNEL);
3969         if (!reply_buf)
3970                 return -ENOMEM;
3971
3972         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3973                                 "rbd", "get_snapcontext", NULL, 0,
3974                                 reply_buf, size);
3975         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3976         if (ret < 0)
3977                 goto out;
3978
3979         p = reply_buf;
3980         end = reply_buf + ret;
3981         ret = -ERANGE;
3982         ceph_decode_64_safe(&p, end, seq, out);
3983         ceph_decode_32_safe(&p, end, snap_count, out);
3984
3985         /*
3986          * Make sure the reported number of snapshot ids wouldn't go
3987          * beyond the end of our buffer.  But before checking that,
3988          * make sure the computed size of the snapshot context we
3989          * allocate is representable in a size_t.
3990          */
3991         if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3992                                  / sizeof (u64)) {
3993                 ret = -EINVAL;
3994                 goto out;
3995         }
3996         if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3997                 goto out;
3998         ret = 0;
3999
4000         snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
4001         if (!snapc) {
4002                 ret = -ENOMEM;
4003                 goto out;
4004         }
4005         snapc->seq = seq;
4006         for (i = 0; i < snap_count; i++)
4007                 snapc->snaps[i] = ceph_decode_64(&p);
4008
4009         ceph_put_snap_context(rbd_dev->header.snapc);
4010         rbd_dev->header.snapc = snapc;
4011
4012         dout("  snap context seq = %llu, snap_count = %u\n",
4013                 (unsigned long long)seq, (unsigned int)snap_count);
4014 out:
4015         kfree(reply_buf);
4016
4017         return ret;
4018 }
4019
4020 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
4021                                         u64 snap_id)
4022 {
4023         size_t size;
4024         void *reply_buf;
4025         __le64 snapid;
4026         int ret;
4027         void *p;
4028         void *end;
4029         char *snap_name;
4030
4031         size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4032         reply_buf = kmalloc(size, GFP_KERNEL);
4033         if (!reply_buf)
4034                 return ERR_PTR(-ENOMEM);
4035
4036         snapid = cpu_to_le64(snap_id);
4037         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4038                                 "rbd", "get_snapshot_name",
4039                                 &snapid, sizeof (snapid),
4040                                 reply_buf, size);
4041         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4042         if (ret < 0) {
4043                 snap_name = ERR_PTR(ret);
4044                 goto out;
4045         }
4046
4047         p = reply_buf;
4048         end = reply_buf + ret;
4049         snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4050         if (IS_ERR(snap_name))
4051                 goto out;
4052
4053         dout("  snap_id 0x%016llx snap_name = %s\n",
4054                 (unsigned long long)snap_id, snap_name);
4055 out:
4056         kfree(reply_buf);
4057
4058         return snap_name;
4059 }
4060
4061 static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev)
4062 {
4063         int ret;
4064
4065         down_write(&rbd_dev->header_rwsem);
4066
4067         ret = rbd_dev_v2_image_size(rbd_dev);
4068         if (ret)
4069                 goto out;
4070         if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
4071                 if (rbd_dev->mapping.size != rbd_dev->header.image_size)
4072                         rbd_dev->mapping.size = rbd_dev->header.image_size;
4073
4074         ret = rbd_dev_v2_snap_context(rbd_dev);
4075         dout("rbd_dev_v2_snap_context returned %d\n", ret);
4078 out:
4079         up_write(&rbd_dev->header_rwsem);
4080
4081         return ret;
4082 }
4083
4084 static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4085 {
4086         struct device *dev;
4087         int ret;
4088
4089         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4090
4091         dev = &rbd_dev->dev;
4092         dev->bus = &rbd_bus_type;
4093         dev->type = &rbd_device_type;
4094         dev->parent = &rbd_root_dev;
4095         dev->release = rbd_dev_device_release;
4096         dev_set_name(dev, "%d", rbd_dev->dev_id);
4097         ret = device_register(dev);
4098
4099         mutex_unlock(&ctl_mutex);
4100
4101         return ret;
4102 }
4103
4104 static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4105 {
4106         device_unregister(&rbd_dev->dev);
4107 }
4108
4109 static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
4110
4111 /*
4112  * Get a unique rbd identifier for the given new rbd_dev, and add
4113  * the rbd_dev to the global list.  The minimum rbd id is 1.
4114  */
4115 static void rbd_dev_id_get(struct rbd_device *rbd_dev)
4116 {
4117         rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
4118
4119         spin_lock(&rbd_dev_list_lock);
4120         list_add_tail(&rbd_dev->node, &rbd_dev_list);
4121         spin_unlock(&rbd_dev_list_lock);
4122         dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4123                 (unsigned long long) rbd_dev->dev_id);
4124 }
4125
4126 /*
4127  * Remove an rbd_dev from the global list, and record that its
4128  * identifier is no longer in use.
4129  */
4130 static void rbd_dev_id_put(struct rbd_device *rbd_dev)
4131 {
4132         struct list_head *tmp;
4133         int rbd_id = rbd_dev->dev_id;
4134         int max_id;
4135
4136         rbd_assert(rbd_id > 0);
4137
4138         dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4139                 (unsigned long long) rbd_dev->dev_id);
4140         spin_lock(&rbd_dev_list_lock);
4141         list_del_init(&rbd_dev->node);
4142
4143         /*
4144          * If the id being "put" is not the current maximum, there
4145          * is nothing special we need to do.
4146          */
4147         if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
4148                 spin_unlock(&rbd_dev_list_lock);
4149                 return;
4150         }
4151
4152         /*
4153          * We need to update the current maximum id.  Search the
4154          * list to find out what it is.  We're more likely to find
4155          * the maximum at the end, so search the list backward.
4156          */
4157         max_id = 0;
4158         list_for_each_prev(tmp, &rbd_dev_list) {
4159                 struct rbd_device *rbd_dev;
4160
4161                 rbd_dev = list_entry(tmp, struct rbd_device, node);
4162                 if (rbd_dev->dev_id > max_id)
4163                         max_id = rbd_dev->dev_id;
4164         }
4165         spin_unlock(&rbd_dev_list_lock);
4166
4167         /*
4168          * The max id could have been updated by rbd_dev_id_get(), in
4169          * which case it now accurately reflects the new maximum.
4170          * Be careful not to overwrite the maximum value in that
4171          * case.
4172          */
4173         atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4174         dout("  max dev id has been reset\n");
4175 }
4176
4177 /*
4178  * Skips over white space at *buf, and updates *buf to point to the
4179  * first found non-space character (if any). Returns the length of
4180  * the token (string of non-white space characters) found.  Note
4181  * that *buf must be terminated with '\0'.
4182  */
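/*
 * For example, with *buf pointing at "  rbd foo", next_token()
 * advances *buf to "rbd foo" and returns 3.
 */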
4183 static inline size_t next_token(const char **buf)
4184 {
4185         /*
4186          * These are the characters that produce nonzero for
4187          * isspace() in the "C" and "POSIX" locales.
4188          */
4189         const char *spaces = " \f\n\r\t\v";
4190
4191         *buf += strspn(*buf, spaces);   /* Find start of token */
4192
4193         return strcspn(*buf, spaces);   /* Return token length */
4194 }
4195
4196 /*
4197  * Finds the next token in *buf, and if the provided token buffer is
4198  * big enough, copies the found token into it.  The result, if
4199  * copied, is guaranteed to be terminated with '\0'.  Note that *buf
4200  * must be terminated with '\0' on entry.
4201  *
4202  * Returns the length of the token found (not including the '\0').
4203  * Return value will be 0 if no token is found, and it will be >=
4204  * token_size if the token would not fit.
4205  *
4206  * The *buf pointer will be updated to point beyond the end of the
4207  * found token.  Note that this occurs even if the token buffer is
4208  * too small to hold it.
4209  */
4210 static inline size_t copy_token(const char **buf,
4211                                 char *token,
4212                                 size_t token_size)
4213 {
4214         size_t len;
4215
4216         len = next_token(buf);
4217         if (len < token_size) {
4218                 memcpy(token, *buf, len);
4219                 *(token + len) = '\0';
4220         }
4221         *buf += len;
4222
4223         return len;
4224 }
4225
4226 /*
4227  * Finds the next token in *buf, dynamically allocates a buffer big
4228  * enough to hold a copy of it, and copies the token into the new
4229  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4230  * that a duplicate buffer is created even for a zero-length token.
4231  *
4232  * Returns a pointer to the newly-allocated duplicate, or a null
4233  * pointer if memory for the duplicate was not available.  If
4234  * the lenp argument is a non-null pointer, the length of the token
4235  * (not including the '\0') is returned in *lenp.
4236  *
4237  * If successful, the *buf pointer will be updated to point beyond
4238  * the end of the found token.
4239  *
4240  * Note: uses GFP_KERNEL for allocation.
4241  */
4242 static inline char *dup_token(const char **buf, size_t *lenp)
4243 {
4244         char *dup;
4245         size_t len;
4246
4247         len = next_token(buf);
4248         dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4249         if (!dup)
4250                 return NULL;
4251         *(dup + len) = '\0';
4252         *buf += len;
4253
4254         if (lenp)
4255                 *lenp = len;
4256
4257         return dup;
4258 }
4259
4260 /*
4261  * Parse the options provided for an "rbd add" (i.e., rbd image
4262  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4263  * and the data written is passed here via a NUL-terminated buffer.
4264  * Returns 0 if successful or an error code otherwise.
4265  *
4266  * The information extracted from these options is recorded in
4267  * the other parameters which return dynamically-allocated
4268  * structures:
4269  *  ceph_opts
4270  *      The address of a pointer that will refer to a ceph options
4271  *      structure.  Caller must release the returned pointer using
4272  *      ceph_destroy_options() when it is no longer needed.
4273  *  rbd_opts
4274  *      Address of an rbd options pointer.  Fully initialized by
4275  *      this function; caller must release with kfree().
4276  *  spec
4277  *      Address of an rbd image specification pointer.  Fully
4278  *      initialized by this function based on parsed options.
4279  *      Caller must release with rbd_spec_put().
4280  *
4281  * The options passed take this form:
4282  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4283  * where:
4284  *  <mon_addrs>
4285  *      A comma-separated list of one or more monitor addresses.
4286  *      A monitor address is an ip address, optionally followed
4287  *      by a port number (separated by a colon).
4288  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4289  *  <options>
4290  *      A comma-separated list of ceph and/or rbd options.
4291  *  <pool_name>
4292  *      The name of the rados pool containing the rbd image.
4293  *  <image_name>
4294  *      The name of the image in that pool to map.
4295  *  <snap_id>
4296  *      An optional snapshot id.  If provided, the mapping will
4297  *      present data from the image at the time that snapshot was
4298  *      created.  The image head is used if no snapshot id is
4299  *      provided.  Snapshot mappings are always read-only.
4300  */
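/*
 * For example (all values hypothetical), writing
 *
 *   1.2.3.4:6789 name=admin rbd foo
 *
 * to /sys/bus/rbd/add maps the head of image "foo" in pool "rbd",
 * contacting the monitor at 1.2.3.4:6789.
 */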
4301 static int rbd_add_parse_args(const char *buf,
4302                                 struct ceph_options **ceph_opts,
4303                                 struct rbd_options **opts,
4304                                 struct rbd_spec **rbd_spec)
4305 {
4306         size_t len;
4307         char *options;
4308         const char *mon_addrs;
4309         char *snap_name;
4310         size_t mon_addrs_size;
4311         struct rbd_spec *spec = NULL;
4312         struct rbd_options *rbd_opts = NULL;
4313         struct ceph_options *copts;
4314         int ret;
4315
4316         /* The first four tokens are required */
4317
4318         len = next_token(&buf);
4319         if (!len) {
4320                 rbd_warn(NULL, "no monitor address(es) provided");
4321                 return -EINVAL;
4322         }
4323         mon_addrs = buf;
4324         mon_addrs_size = len + 1;
4325         buf += len;
4326
4327         ret = -EINVAL;
4328         options = dup_token(&buf, NULL);
4329         if (!options)
4330                 return -ENOMEM;
4331         if (!*options) {
4332                 rbd_warn(NULL, "no options provided");
4333                 goto out_err;
4334         }
4335
4336         spec = rbd_spec_alloc();
4337         if (!spec)
4338                 goto out_mem;
4339
4340         spec->pool_name = dup_token(&buf, NULL);
4341         if (!spec->pool_name)
4342                 goto out_mem;
4343         if (!*spec->pool_name) {
4344                 rbd_warn(NULL, "no pool name provided");
4345                 goto out_err;
4346         }
4347
4348         spec->image_name = dup_token(&buf, NULL);
4349         if (!spec->image_name)
4350                 goto out_mem;
4351         if (!*spec->image_name) {
4352                 rbd_warn(NULL, "no image name provided");
4353                 goto out_err;
4354         }
4355
4356         /*
4357          * Snapshot name is optional; default is to use "-"
4358          * (indicating the head/no snapshot).
4359          */
4360         len = next_token(&buf);
4361         if (!len) {
4362                 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
4363                 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4364         } else if (len > RBD_MAX_SNAP_NAME_LEN) {
4365                 ret = -ENAMETOOLONG;
4366                 goto out_err;
4367         }
4368         snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4369         if (!snap_name)
4370                 goto out_mem;
4371         *(snap_name + len) = '\0';
4372         spec->snap_name = snap_name;
4373
4374         /* Initialize all rbd options to the defaults */
4375
4376         rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
4377         if (!rbd_opts)
4378                 goto out_mem;
4379
4380         rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4381
4382         copts = ceph_parse_options(options, mon_addrs,
4383                                         mon_addrs + mon_addrs_size - 1,
4384                                         parse_rbd_opts_token, rbd_opts);
4385         if (IS_ERR(copts)) {
4386                 ret = PTR_ERR(copts);
4387                 goto out_err;
4388         }
4389         kfree(options);
4390
4391         *ceph_opts = copts;
4392         *opts = rbd_opts;
4393         *rbd_spec = spec;
4394
4395         return 0;
4396 out_mem:
4397         ret = -ENOMEM;
4398 out_err:
4399         kfree(rbd_opts);
4400         rbd_spec_put(spec);
4401         kfree(options);
4402
4403         return ret;
4404 }
4405
4406 /*
4407  * An rbd format 2 image has a unique identifier, distinct from the
4408  * name given to it by the user.  Internally, that identifier is
4409  * what's used to specify the names of objects related to the image.
4410  *
4411  * A special "rbd id" object is used to map an rbd image name to its
4412  * id.  If that object doesn't exist, then there is no v2 rbd image
4413  * with the supplied name.
4414  *
4415  * This function will record the given rbd_dev's image_id field if
4416  * it can be determined, and in that case will return 0.  If any
4417  * errors occur a negative errno will be returned and the rbd_dev's
4418  * image_id field will be unchanged (and should be NULL).
4419  */
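/*
 * For example (hypothetical image name), probing an image named "foo"
 * reads the object "rbd_id.foo" (RBD_ID_PREFIX followed by the image
 * name, built below); its "get_id" class method returns the id string
 * used to name every other object belonging to the image.
 */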
4420 static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4421 {
4422         int ret;
4423         size_t size;
4424         char *object_name;
4425         void *response;
4426         char *image_id;
4427
4428         /*
4429          * When probing a parent image, the image id is already
4430          * known (and the image name likely is not).  There's no
4431          * need to fetch the image id again in this case.  We
4432          * do still need to set the image format though.
4433          */
4434         if (rbd_dev->spec->image_id) {
4435                 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4436
4437                 return 0;
4438         }
4439
4440         /*
4441          * First, see if the format 2 image id object exists, and if
4442          * so, get the image's persistent id from it.
4443          */
4444         size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4445         object_name = kmalloc(size, GFP_NOIO);
4446         if (!object_name)
4447                 return -ENOMEM;
4448         sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4449         dout("rbd id object name is %s\n", object_name);
4450
4451         /* Response will be an encoded string, which includes a length */
4452
4453         size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4454         response = kzalloc(size, GFP_NOIO);
4455         if (!response) {
4456                 ret = -ENOMEM;
4457                 goto out;
4458         }
4459
4460         /* If it doesn't exist we'll assume it's a format 1 image */
4461
4462         ret = rbd_obj_method_sync(rbd_dev, object_name,
4463                                 "rbd", "get_id", NULL, 0,
4464                                 response, RBD_IMAGE_ID_LEN_MAX);
4465         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4466         if (ret == -ENOENT) {
4467                 image_id = kstrdup("", GFP_KERNEL);
4468                 ret = image_id ? 0 : -ENOMEM;
4469                 if (!ret)
4470                         rbd_dev->image_format = 1;
4471         } else if (ret > (int) sizeof (__le32)) {
4472                 void *p = response;
4473
4474                 image_id = ceph_extract_encoded_string(&p, p + ret,
4475                                                 NULL, GFP_NOIO);
4476                 ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
4477                 if (!ret)
4478                         rbd_dev->image_format = 2;
4479         } else if (ret >= 0) {
4480                 ret = -EINVAL;
4481         }
4482
4483         if (!ret) {
4484                 rbd_dev->spec->image_id = image_id;
4485                 dout("image_id is %s\n", image_id);
4486         }
4487 out:
4488         kfree(response);
4489         kfree(object_name);
4490
4491         return ret;
4492 }
4493
/* Undo whatever state changes were made by a v1 or v2 image probe */
4495
4496 static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
4497 {
4498         struct rbd_image_header *header;
4499
4500         rbd_dev_remove_parent(rbd_dev);
4501         rbd_spec_put(rbd_dev->parent_spec);
4502         rbd_dev->parent_spec = NULL;
4503         rbd_dev->parent_overlap = 0;
4504
4505         /* Free dynamic fields from the header, then zero it out */
4506
4507         header = &rbd_dev->header;
4508         ceph_put_snap_context(header->snapc);
4509         kfree(header->snap_sizes);
4510         kfree(header->snap_names);
4511         kfree(header->object_prefix);
4512         memset(header, 0, sizeof (*header));
4513 }
4514
4515 static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
4516 {
4517         int ret;
4518
4519         /* Populate rbd image metadata */
4520
4521         ret = rbd_read_header(rbd_dev, &rbd_dev->header);
4522         if (ret < 0)
4523                 goto out_err;
4524
4525         /* Version 1 images have no parent (no layering) */
4526
4527         rbd_dev->parent_spec = NULL;
4528         rbd_dev->parent_overlap = 0;
4529
4530         dout("discovered version 1 image, header name is %s\n",
4531                 rbd_dev->header_name);
4532
4533         return 0;
4534
4535 out_err:
4536         kfree(rbd_dev->header_name);
4537         rbd_dev->header_name = NULL;
4538         kfree(rbd_dev->spec->image_id);
4539         rbd_dev->spec->image_id = NULL;
4540
4541         return ret;
4542 }
4543
4544 static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
4545 {
4546         int ret;
4547
4548         ret = rbd_dev_v2_image_size(rbd_dev);
4549         if (ret)
4550                 goto out_err;
4551
4552         /* Get the object prefix (a.k.a. block_name) for the image */
4553
4554         ret = rbd_dev_v2_object_prefix(rbd_dev);
4555         if (ret)
4556                 goto out_err;
4557
        /* Get and check the features for the image */
4559
4560         ret = rbd_dev_v2_features(rbd_dev);
4561         if (ret)
4562                 goto out_err;
4563
4564         /* If the image supports layering, get the parent info */
4565
4566         if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
4567                 ret = rbd_dev_v2_parent_info(rbd_dev);
4568                 if (ret)
4569                         goto out_err;
4570                 /*
4571                  * Print a warning if this image has a parent.
4572                  * Don't print it if the image now being probed
4573                  * is itself a parent.  We can tell at this point
4574                  * because we won't know its pool name yet (just its
4575                  * pool id).
4576                  */
4577                 if (rbd_dev->parent_spec && rbd_dev->spec->pool_name)
4578                         rbd_warn(rbd_dev, "WARNING: kernel layering "
4579                                         "is EXPERIMENTAL!");
4580         }
4581
4582         /* If the image supports fancy striping, get its parameters */
4583
4584         if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4585                 ret = rbd_dev_v2_striping_info(rbd_dev);
4586                 if (ret < 0)
4587                         goto out_err;
4588         }
4589
4590         /* crypto and compression type aren't (yet) supported for v2 images */
4591
4592         rbd_dev->header.crypt_type = 0;
4593         rbd_dev->header.comp_type = 0;
4594
        /* Get the snapshot context for the image */
4596
4597         ret = rbd_dev_v2_snap_context(rbd_dev);
4598         if (ret)
4599                 goto out_err;
4600
4601         dout("discovered version 2 image, header name is %s\n",
4602                 rbd_dev->header_name);
4603
4604         return 0;
4605 out_err:
4606         rbd_dev->parent_overlap = 0;
4607         rbd_spec_put(rbd_dev->parent_spec);
4608         rbd_dev->parent_spec = NULL;
4609         kfree(rbd_dev->header_name);
4610         rbd_dev->header_name = NULL;
4611         kfree(rbd_dev->header.object_prefix);
4612         rbd_dev->header.object_prefix = NULL;
4613
4614         return ret;
4615 }
4616
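/*
 * If the image has a parent, create an rbd_device for it and probe
 * that image too.  Since the parent may itself be a clone, this
 * recurses (via rbd_dev_image_probe()) until an image with no
 * parent is reached.
 */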
4617 static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
4618 {
4619         struct rbd_device *parent = NULL;
4620         struct rbd_spec *parent_spec;
4621         struct rbd_client *rbdc;
4622         int ret;
4623
4624         if (!rbd_dev->parent_spec)
4625                 return 0;
4626         /*
4627          * We need to pass a reference to the client and the parent
4628          * spec when creating the parent rbd_dev.  Images related by
4629          * parent/child relationships always share both.
4630          */
4631         parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4632         rbdc = __rbd_get_client(rbd_dev->rbd_client);
4633
4634         ret = -ENOMEM;
4635         parent = rbd_dev_create(rbdc, parent_spec);
4636         if (!parent)
4637                 goto out_err;
4638
4639         ret = rbd_dev_image_probe(parent);
4640         if (ret < 0)
4641                 goto out_err;
4642         rbd_dev->parent = parent;
4643
4644         return 0;
4645 out_err:
4646         if (parent) {
                /*
                 * rbd_dev_destroy() drops the references the parent
                 * holds on the client and the parent spec; rbd_dev's
                 * own parent state is cleaned up by our caller.
                 */
4649                 rbd_dev_destroy(parent);
4650         } else {
4651                 rbd_put_client(rbdc);
4652                 rbd_spec_put(parent_spec);
4653         }
4654
4655         return ret;
4656 }
4657
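/*
 * Take a successfully probed image through the steps needed to make
 * it visible as a block device: assign a unique id and device name,
 * register a block major, set up the gendisk, add the sysfs device,
 * and finally announce the disk's capacity and existence.
 */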
4658 static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
4659 {
4660         int ret;
4661
4662         ret = rbd_dev_mapping_set(rbd_dev);
4663         if (ret)
4664                 return ret;
4665
4666         /* generate unique id: find highest unique id, add one */
4667         rbd_dev_id_get(rbd_dev);
4668
4669         /* Fill in the device name, now that we have its id. */
4670         BUILD_BUG_ON(DEV_NAME_LEN
4671                         < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
4672         sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
4673
4674         /* Get our block major device number. */
4675
4676         ret = register_blkdev(0, rbd_dev->name);
4677         if (ret < 0)
4678                 goto err_out_id;
4679         rbd_dev->major = ret;
4680
4681         /* Set up the blkdev mapping. */
4682
4683         ret = rbd_init_disk(rbd_dev);
4684         if (ret)
4685                 goto err_out_blkdev;
4686
4687         ret = rbd_bus_add_dev(rbd_dev);
4688         if (ret)
4689                 goto err_out_disk;
4690
4691         /* Everything's ready.  Announce the disk to the world. */
4692
4693         set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
4694         set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4695         add_disk(rbd_dev->disk);
4696
4697         pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
4698                 (unsigned long long) rbd_dev->mapping.size);
4699
4700         return ret;
4701
4702 err_out_disk:
4703         rbd_free_disk(rbd_dev);
4704 err_out_blkdev:
4705         unregister_blkdev(rbd_dev->major, rbd_dev->name);
4706 err_out_id:
4707         rbd_dev_id_put(rbd_dev);
4708         rbd_dev_mapping_clear(rbd_dev);
4709
4710         return ret;
4711 }
4712
4713 static int rbd_dev_header_name(struct rbd_device *rbd_dev)
4714 {
4715         struct rbd_spec *spec = rbd_dev->spec;
4716         size_t size;
4717
4718         /* Record the header object name for this rbd image. */
4719
4720         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4721
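        /*
         * A format 1 image keeps its header in an object named
         * "<image_name>" RBD_SUFFIX (e.g. "foo.rbd"); a format 2
         * image uses RBD_HEADER_PREFIX "<image_id>" (e.g.
         * "rbd_header.<id>").
         */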
4722         if (rbd_dev->image_format == 1)
4723                 size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
4724         else
4725                 size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
4726
4727         rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4728         if (!rbd_dev->header_name)
4729                 return -ENOMEM;
4730
4731         if (rbd_dev->image_format == 1)
4732                 sprintf(rbd_dev->header_name, "%s%s",
4733                         spec->image_name, RBD_SUFFIX);
4734         else
4735                 sprintf(rbd_dev->header_name, "%s%s",
4736                         RBD_HEADER_PREFIX, spec->image_id);
4737         return 0;
4738 }
4739
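/*
 * Undo everything image probe set up: drop any parent chain, free
 * the header fields, cancel the header watch, and finally destroy
 * the device itself.
 */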
4740 static void rbd_dev_image_release(struct rbd_device *rbd_dev)
4741 {
4742         int ret;
4743
4744         rbd_dev_unprobe(rbd_dev);
4745         ret = rbd_dev_header_watch_sync(rbd_dev, 0);
4746         if (ret)
4747                 rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
4748         kfree(rbd_dev->header_name);
4749         rbd_dev->header_name = NULL;
4750         rbd_dev->image_format = 0;
4751         kfree(rbd_dev->spec->image_id);
4752         rbd_dev->spec->image_id = NULL;
4753
4754         rbd_dev_destroy(rbd_dev);
4755 }
4756
4757 /*
4758  * Probe for the existence of the header object for the given rbd
4759  * device.  For format 2 images this includes determining the image
4760  * id.
4761  */
4762 static int rbd_dev_image_probe(struct rbd_device *rbd_dev)
4763 {
4764         int ret;
4765         int tmp;
4766
4767         /*
4768          * Get the id from the image id object.  If it's not a
4769          * format 2 image, we'll get ENOENT back, and we'll assume
4770          * it's a format 1 image.
4771          */
4772         ret = rbd_dev_image_id(rbd_dev);
4773         if (ret)
4774                 return ret;
4775         rbd_assert(rbd_dev->spec->image_id);
4776         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4777
4778         ret = rbd_dev_header_name(rbd_dev);
4779         if (ret)
4780                 goto err_out_format;
4781
4782         ret = rbd_dev_header_watch_sync(rbd_dev, 1);
4783         if (ret)
4784                 goto out_header_name;
4785
4786         if (rbd_dev->image_format == 1)
4787                 ret = rbd_dev_v1_probe(rbd_dev);
4788         else
4789                 ret = rbd_dev_v2_probe(rbd_dev);
4790         if (ret)
4791                 goto err_out_watch;
4792
4793         ret = rbd_dev_spec_update(rbd_dev);
4794         if (ret)
4795                 goto err_out_probe;
4796
4797         ret = rbd_dev_probe_parent(rbd_dev);
4798         if (!ret)
4799                 return 0;
4800
4801 err_out_probe:
4802         rbd_dev_unprobe(rbd_dev);
4803 err_out_watch:
4804         tmp = rbd_dev_header_watch_sync(rbd_dev, 0);
4805         if (tmp)
4806                 rbd_warn(rbd_dev, "unable to tear down watch request\n");
4807 out_header_name:
4808         kfree(rbd_dev->header_name);
4809         rbd_dev->header_name = NULL;
4810 err_out_format:
4811         rbd_dev->image_format = 0;
4812         kfree(rbd_dev->spec->image_id);
4813         rbd_dev->spec->image_id = NULL;
4814
4815         dout("probe failed, returning %d\n", ret);
4816
4817         return ret;
4818 }
4819
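/*
 * Handle a write to /sys/bus/rbd/add.  The buffer carries the
 * monitor addresses, options, pool name, image name and (optional)
 * snapshot name, e.g. (a sketch; see
 * Documentation/ABI/testing/sysfs-bus-rbd for the authoritative
 * format):
 *
 *   $ echo "1.2.3.4 name=admin rbd foo" > /sys/bus/rbd/add
 */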
4820 static ssize_t rbd_add(struct bus_type *bus,
4821                        const char *buf,
4822                        size_t count)
4823 {
4824         struct rbd_device *rbd_dev = NULL;
4825         struct ceph_options *ceph_opts = NULL;
4826         struct rbd_options *rbd_opts = NULL;
4827         struct rbd_spec *spec = NULL;
4828         struct rbd_client *rbdc;
4829         struct ceph_osd_client *osdc;
4830         int rc = -ENOMEM;
4831
4832         if (!try_module_get(THIS_MODULE))
4833                 return -ENODEV;
4834
4835         /* parse add command */
4836         rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4837         if (rc < 0)
4838                 goto err_out_module;
4839
4840         rbdc = rbd_get_client(ceph_opts);
4841         if (IS_ERR(rbdc)) {
4842                 rc = PTR_ERR(rbdc);
4843                 goto err_out_args;
4844         }
4845         ceph_opts = NULL;       /* rbd_dev client now owns this */
4846
4847         /* pick the pool */
4848         osdc = &rbdc->client->osdc;
4849         rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4850         if (rc < 0)
4851                 goto err_out_client;
4852         spec->pool_id = (u64)rc;
4853
        /* The ceph file layout needs the pool id to fit in 32 bits */
4855
4856         if (spec->pool_id > (u64)U32_MAX) {
4857                 rbd_warn(NULL, "pool id too large (%llu > %u)\n",
4858                                 (unsigned long long)spec->pool_id, U32_MAX);
4859                 rc = -EIO;
4860                 goto err_out_client;
4861         }
4862
4863         rbd_dev = rbd_dev_create(rbdc, spec);
        if (!rbd_dev) {
                rc = -ENOMEM;   /* rc still holds the pool id here */
                goto err_out_client;
        }
4866         rbdc = NULL;            /* rbd_dev now owns this */
4867         spec = NULL;            /* rbd_dev now owns this */
4868
4869         rbd_dev->mapping.read_only = rbd_opts->read_only;
4870         kfree(rbd_opts);
4871         rbd_opts = NULL;        /* done with this */
4872
4873         rc = rbd_dev_image_probe(rbd_dev);
4874         if (rc < 0)
4875                 goto err_out_rbd_dev;
4876
4877         rc = rbd_dev_device_setup(rbd_dev);
4878         if (!rc)
4879                 return count;
4880
4881         rbd_dev_image_release(rbd_dev);
4882 err_out_rbd_dev:
4883         rbd_dev_destroy(rbd_dev);
4884 err_out_client:
4885         rbd_put_client(rbdc);
4886 err_out_args:
4887         if (ceph_opts)
4888                 ceph_destroy_options(ceph_opts);
4889         kfree(rbd_opts);
4890         rbd_spec_put(spec);
4891 err_out_module:
4892         module_put(THIS_MODULE);
4893
4894         dout("Error adding device %s\n", buf);
4895
4896         return (ssize_t)rc;
4897 }
4898
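/*
 * Look up an rbd device by its id in the global device list.
 * Returns NULL if no match is found.  No reference is taken, so the
 * caller must otherwise keep the device from going away (rbd_remove()
 * does so by holding ctl_mutex).
 */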
4899 static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4900 {
4901         struct list_head *tmp;
4902         struct rbd_device *rbd_dev;
4903
4904         spin_lock(&rbd_dev_list_lock);
4905         list_for_each(tmp, &rbd_dev_list) {
4906                 rbd_dev = list_entry(tmp, struct rbd_device, node);
4907                 if (rbd_dev->dev_id == dev_id) {
4908                         spin_unlock(&rbd_dev_list_lock);
4909                         return rbd_dev;
4910                 }
4911         }
4912         spin_unlock(&rbd_dev_list_lock);
4913         return NULL;
4914 }
4915
4916 static void rbd_dev_device_release(struct device *dev)
4917 {
4918         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4919
4920         rbd_free_disk(rbd_dev);
4921         clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4922         rbd_dev_clear_mapping(rbd_dev);
4923         unregister_blkdev(rbd_dev->major, rbd_dev->name);
4924         rbd_dev->major = 0;
4925         rbd_dev_id_put(rbd_dev);
4926         rbd_dev_mapping_clear(rbd_dev);
4927 }
4928
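/*
 * Tear down a layered image's chain of ancestors.  Each pass walks
 * to the topmost parent (the one with no parent of its own),
 * releases it, and detaches it from its child, until rbd_dev itself
 * has no parent left.
 */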
4929 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
4930 {
4931         while (rbd_dev->parent) {
4932                 struct rbd_device *first = rbd_dev;
4933                 struct rbd_device *second = first->parent;
4934                 struct rbd_device *third;
4935
4936                 /*
4937                  * Follow to the parent with no grandparent and
4938                  * remove it.
4939                  */
4940                 while (second && (third = second->parent)) {
4941                         first = second;
4942                         second = third;
4943                 }
4944                 rbd_assert(second);
4945                 rbd_dev_image_release(second);
4946                 first->parent = NULL;
4947                 first->parent_overlap = 0;
4948
4949                 rbd_assert(first->parent_spec);
4950                 rbd_spec_put(first->parent_spec);
4951                 first->parent_spec = NULL;
4952         }
4953 }
4954
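/*
 * Handle a write to /sys/bus/rbd/remove.  The buffer holds the
 * decimal id of the device to remove, e.g.:
 *
 *   $ echo 1 > /sys/bus/rbd/remove
 *
 * Removal fails with -EBUSY if the device is still open.
 */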
4955 static ssize_t rbd_remove(struct bus_type *bus,
4956                           const char *buf,
4957                           size_t count)
4958 {
4959         struct rbd_device *rbd_dev = NULL;
4960         int target_id;
4961         unsigned long ul;
4962         int ret;
4963
        ret = kstrtoul(buf, 10, &ul);
4965         if (ret)
4966                 return ret;
4967
4968         /* convert to int; abort if we lost anything in the conversion */
4969         target_id = (int) ul;
4970         if (target_id != ul)
4971                 return -EINVAL;
4972
4973         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4974
4975         rbd_dev = __rbd_get_dev(target_id);
4976         if (!rbd_dev) {
4977                 ret = -ENOENT;
4978                 goto done;
4979         }
4980
4981         spin_lock_irq(&rbd_dev->lock);
4982         if (rbd_dev->open_count)
4983                 ret = -EBUSY;
4984         else
4985                 set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4986         spin_unlock_irq(&rbd_dev->lock);
4987         if (ret < 0)
4988                 goto done;
4989         ret = count;
4990         rbd_bus_del_dev(rbd_dev);
4991         rbd_dev_image_release(rbd_dev);
4992         module_put(THIS_MODULE);
4993 done:
4994         mutex_unlock(&ctl_mutex);
4995
4996         return ret;
4997 }
4998
4999 /*
5000  * create control files in sysfs
5001  * /sys/bus/rbd/...
5002  */
5003 static int rbd_sysfs_init(void)
5004 {
5005         int ret;
5006
5007         ret = device_register(&rbd_root_dev);
5008         if (ret < 0)
5009                 return ret;
5010
5011         ret = bus_register(&rbd_bus_type);
5012         if (ret < 0)
5013                 device_unregister(&rbd_root_dev);
5014
5015         return ret;
5016 }
5017
5018 static void rbd_sysfs_cleanup(void)
5019 {
5020         bus_unregister(&rbd_bus_type);
5021         device_unregister(&rbd_root_dev);
5022 }
5023
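/*
 * Create the slab caches used by the I/O path: one for image
 * requests, one for object requests, and one for segment names.
 * All three must be set up before the driver can be used.
 */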
5024 static int rbd_slab_init(void)
5025 {
5026         rbd_assert(!rbd_img_request_cache);
5027         rbd_img_request_cache = kmem_cache_create("rbd_img_request",
5028                                         sizeof (struct rbd_img_request),
5029                                         __alignof__(struct rbd_img_request),
5030                                         0, NULL);
5031         if (!rbd_img_request_cache)
5032                 return -ENOMEM;
5033
5034         rbd_assert(!rbd_obj_request_cache);
5035         rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5036                                         sizeof (struct rbd_obj_request),
5037                                         __alignof__(struct rbd_obj_request),
5038                                         0, NULL);
5039         if (!rbd_obj_request_cache)
5040                 goto out_err;
5041
5042         rbd_assert(!rbd_segment_name_cache);
5043         rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
5044                                         MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL);
5045         if (rbd_segment_name_cache)
5046                 return 0;
5047 out_err:
5048         if (rbd_obj_request_cache) {
5049                 kmem_cache_destroy(rbd_obj_request_cache);
5050                 rbd_obj_request_cache = NULL;
5051         }
5052
5053         kmem_cache_destroy(rbd_img_request_cache);
5054         rbd_img_request_cache = NULL;
5055
5056         return -ENOMEM;
5057 }
5058
5059 static void rbd_slab_exit(void)
5060 {
5061         rbd_assert(rbd_segment_name_cache);
5062         kmem_cache_destroy(rbd_segment_name_cache);
5063         rbd_segment_name_cache = NULL;
5064
5065         rbd_assert(rbd_obj_request_cache);
5066         kmem_cache_destroy(rbd_obj_request_cache);
5067         rbd_obj_request_cache = NULL;
5068
5069         rbd_assert(rbd_img_request_cache);
5070         kmem_cache_destroy(rbd_img_request_cache);
5071         rbd_img_request_cache = NULL;
5072 }
5073
5074 static int __init rbd_init(void)
5075 {
5076         int rc;
5077
5078         if (!libceph_compatible(NULL)) {
5079                 rbd_warn(NULL, "libceph incompatibility (quitting)");
5080
5081                 return -EINVAL;
5082         }
5083         rc = rbd_slab_init();
5084         if (rc)
5085                 return rc;
5086         rc = rbd_sysfs_init();
5087         if (rc)
5088                 rbd_slab_exit();
5089         else
5090                 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
5091
5092         return rc;
5093 }
5094
5095 static void __exit rbd_exit(void)
5096 {
5097         rbd_sysfs_cleanup();
5098         rbd_slab_exit();
5099 }
5100
5101 module_init(rbd_init);
5102 module_exit(rbd_exit);
5103
5104 MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5105 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5106 MODULE_DESCRIPTION("rados block device");
5107
5108 /* following authorship retained from original osdblk.c */
5109 MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5110
5111 MODULE_LICENSE("GPL");