#define SECTOR_SHIFT 9
#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
+/*
+ * Increment the given counter and return its updated value.
+ * If the counter is already 0 it will not be incremented.
+ * If the counter is already at its maximum value returns
+ * -EINVAL without updating it.
+ */
+static int atomic_inc_return_safe(atomic_t *v)
+{
+ unsigned int counter;
+
+ counter = (unsigned int)__atomic_add_unless(v, 1, 0);
+ if (counter <= (unsigned int)INT_MAX)
+ return (int)counter;
+
+ atomic_dec(v);
+
+ return -EINVAL;
+}
+
+/* Decrement the counter. Return the resulting value, or -EINVAL */
+static int atomic_dec_return_safe(atomic_t *v)
+{
+ int counter;
+
+ counter = atomic_dec_return(v);
+ if (counter >= 0)
+ return counter;
+
+ atomic_inc(v);
+
+ return -EINVAL;
+}
+
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"
struct rbd_spec *parent_spec;
u64 parent_overlap;
+ atomic_t parent_ref;
struct rbd_device *parent;
/* protects updating the header */
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
+static void rbd_spec_put(struct rbd_spec *spec);
static struct bus_attribute rbd_bus_attrs[] = {
__ATTR(add, S_IWUSR, NULL, rbd_add),
kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
+static bool img_request_child_test(struct rbd_img_request *img_request);
+static void rbd_parent_request_destroy(struct kref *kref);
static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
rbd_assert(img_request != NULL);
dout("%s: img %p (was %d)\n", __func__, img_request,
atomic_read(&img_request->kref.refcount));
- kref_put(&img_request->kref, rbd_img_request_destroy);
+ if (img_request_child_test(img_request))
+ kref_put(&img_request->kref, rbd_parent_request_destroy);
+ else
+ kref_put(&img_request->kref, rbd_img_request_destroy);
}
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
smp_mb();
}
+static void img_request_child_clear(struct rbd_img_request *img_request)
+{
+ clear_bit(IMG_REQ_CHILD, &img_request->flags);
+ smp_mb();
+}
+
static bool img_request_child_test(struct rbd_img_request *img_request)
{
smp_mb();
smp_mb();
}
+static void img_request_layered_clear(struct rbd_img_request *img_request)
+{
+ clear_bit(IMG_REQ_LAYERED, &img_request->flags);
+ smp_mb();
+}
+
static bool img_request_layered_test(struct rbd_img_request *img_request)
{
smp_mb();
kmem_cache_free(rbd_obj_request_cache, obj_request);
}
+/* It's OK to call this for a device with no parent */
+
+static void rbd_spec_put(struct rbd_spec *spec);
+static void rbd_dev_unparent(struct rbd_device *rbd_dev)
+{
+ rbd_dev_remove_parent(rbd_dev);
+ rbd_spec_put(rbd_dev->parent_spec);
+ rbd_dev->parent_spec = NULL;
+ rbd_dev->parent_overlap = 0;
+}
+
+/*
+ * Parent image reference counting is used to determine when an
+ * image's parent fields can be safely torn down--after there are no
+ * more in-flight requests to the parent image. When the last
+ * reference is dropped, cleaning them up is safe.
+ */
+static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
+{
+ int counter;
+
+ if (!rbd_dev->parent_spec)
+ return;
+
+ counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
+ if (counter > 0)
+ return;
+
+ /* Last reference; clean up parent data structures */
+
+ if (!counter)
+ rbd_dev_unparent(rbd_dev);
+ else
+ rbd_warn(rbd_dev, "parent reference underflow\n");
+}
+
+/*
+ * If an image has a non-zero parent overlap, get a reference to its
+ * parent.
+ *
+ * We must get the reference before checking for the overlap to
+ * coordinate properly with zeroing the parent overlap in
+ * rbd_dev_v2_parent_info() when an image gets flattened. We
+ * drop it again if there is no overlap.
+ *
+ * Returns true if the rbd device has a parent with a non-zero
+ * overlap and a reference for it was successfully taken, or
+ * false otherwise.
+ */
+static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
+{
+ int counter;
+
+ if (!rbd_dev->parent_spec)
+ return false;
+
+ counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
+ if (counter > 0 && rbd_dev->parent_overlap)
+ return true;
+
+ /* Image was flattened, but parent is not yet torn down */
+
+ if (counter < 0)
+ rbd_warn(rbd_dev, "parent reference overflow\n");
+
+ return false;
+}
+
/*
* Caller is responsible for filling in the list of object requests
* that comprises the image request, and the Linux request pointer
static struct rbd_img_request *rbd_img_request_create(
struct rbd_device *rbd_dev,
u64 offset, u64 length,
- bool write_request,
- bool child_request)
+ bool write_request)
{
struct rbd_img_request *img_request;
} else {
img_request->snap_id = rbd_dev->spec->snap_id;
}
- if (child_request)
- img_request_child_set(img_request);
- if (rbd_dev->parent_spec)
+ if (rbd_dev_parent_get(rbd_dev))
img_request_layered_set(img_request);
spin_lock_init(&img_request->completion_lock);
img_request->next_completion = 0;
rbd_img_obj_request_del(img_request, obj_request);
rbd_assert(img_request->obj_request_count == 0);
+ if (img_request_layered_test(img_request)) {
+ img_request_layered_clear(img_request);
+ rbd_dev_parent_put(img_request->rbd_dev);
+ }
+
if (img_request_write_test(img_request))
ceph_put_snap_context(img_request->snapc);
- if (img_request_child_test(img_request))
- rbd_obj_request_put(img_request->obj_request);
-
kmem_cache_free(rbd_img_request_cache, img_request);
}
+static struct rbd_img_request *rbd_parent_request_create(
+ struct rbd_obj_request *obj_request,
+ u64 img_offset, u64 length)
+{
+ struct rbd_img_request *parent_request;
+ struct rbd_device *rbd_dev;
+
+ rbd_assert(obj_request->img_request);
+ rbd_dev = obj_request->img_request->rbd_dev;
+
+ parent_request = rbd_img_request_create(rbd_dev->parent,
+ img_offset, length, false);
+ if (!parent_request)
+ return NULL;
+
+ img_request_child_set(parent_request);
+ rbd_obj_request_get(obj_request);
+ parent_request->obj_request = obj_request;
+
+ return parent_request;
+}
+
+static void rbd_parent_request_destroy(struct kref *kref)
+{
+ struct rbd_img_request *parent_request;
+ struct rbd_obj_request *orig_request;
+
+ parent_request = container_of(kref, struct rbd_img_request, kref);
+ orig_request = parent_request->obj_request;
+
+ parent_request->obj_request = NULL;
+ rbd_obj_request_put(orig_request);
+ img_request_child_clear(parent_request);
+
+ rbd_img_request_destroy(kref);
+}
+
static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
{
struct rbd_img_request *img_request;
u32 page_count;
int result;
u64 parent_length;
+ u64 offset;
+ u64 length;
rbd_assert(img_request_child_test(img_request));
orig_request = img_request->obj_request;
rbd_assert(orig_request != NULL);
- rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
+ rbd_assert(obj_request_type_valid(orig_request->type));
result = img_request->result;
parent_length = img_request->length;
rbd_assert(parent_length == img_request->xferred);
if (result)
goto out_err;
- /* Allocate the new copyup osd request for the original request */
-
+ /*
+ * The original osd request is of no use to use any more.
+ * We need a new one that can hold the two ops in a copyup
+ * request. Allocate the new copyup osd request for the
+ * original request, and release the old one.
+ */
result = -ENOMEM;
- rbd_assert(!orig_request->osd_req);
osd_req = rbd_osd_req_create_copyup(orig_request);
if (!osd_req)
goto out_err;
+ rbd_osd_req_destroy(orig_request->osd_req);
orig_request->osd_req = osd_req;
orig_request->copyup_pages = pages;
orig_request->copyup_page_count = page_count;
/* Then the original write request op */
+ offset = orig_request->offset;
+ length = orig_request->length;
osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
- orig_request->offset,
- orig_request->length, 0, 0);
- osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
- orig_request->length);
+ offset, length, 0, 0);
+ if (orig_request->type == OBJ_REQUEST_BIO)
+ osd_req_op_extent_osd_data_bio(osd_req, 1,
+ orig_request->bio_list, length);
+ else
+ osd_req_op_extent_osd_data_pages(osd_req, 1,
+ orig_request->pages, length,
+ offset & ~PAGE_MASK, false, false);
rbd_osd_req_format_write(orig_request);
int result;
rbd_assert(obj_request_img_data_test(obj_request));
- rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
+ rbd_assert(obj_request_type_valid(obj_request->type));
img_request = obj_request->img_request;
rbd_assert(img_request != NULL);
rbd_dev = img_request->rbd_dev;
rbd_assert(rbd_dev->parent != NULL);
- /*
- * First things first. The original osd request is of no
- * use to use any more, we'll need a new one that can hold
- * the two ops in a copyup request. We'll get that later,
- * but for now we can release the old one.
- */
- rbd_osd_req_destroy(obj_request->osd_req);
- obj_request->osd_req = NULL;
-
/*
* Determine the byte range covered by the object in the
* child image to which the original request was to be sent.
}
result = -ENOMEM;
- parent_request = rbd_img_request_create(rbd_dev->parent,
- img_offset, length,
- false, true);
+ parent_request = rbd_parent_request_create(obj_request,
+ img_offset, length);
if (!parent_request)
goto out_err;
- rbd_obj_request_get(obj_request);
- parent_request->obj_request = obj_request;
result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
if (result)
static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
{
- struct rbd_device *rbd_dev;
struct rbd_img_request *img_request;
int result;
rbd_assert(obj_request->result == (s32) -ENOENT);
rbd_assert(obj_request_type_valid(obj_request->type));
- rbd_dev = obj_request->img_request->rbd_dev;
- rbd_assert(rbd_dev->parent != NULL);
/* rbd_read_finish(obj_request, obj_request->length); */
- img_request = rbd_img_request_create(rbd_dev->parent,
+ img_request = rbd_parent_request_create(obj_request,
obj_request->img_offset,
- obj_request->length,
- false, true);
+ obj_request->length);
result = -ENOMEM;
if (!img_request)
goto out_err;
- rbd_obj_request_get(obj_request);
- img_request->obj_request = obj_request;
-
if (obj_request->type == OBJ_REQUEST_BIO)
result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
obj_request->bio_list);
result = -ENOMEM;
img_request = rbd_img_request_create(rbd_dev, offset, length,
- write_request, false);
+ write_request);
if (!img_request)
goto end_request;
spin_lock_init(&rbd_dev->lock);
rbd_dev->flags = 0;
+ atomic_set(&rbd_dev->parent_ref, 0);
INIT_LIST_HEAD(&rbd_dev->node);
init_rwsem(&rbd_dev->header_rwsem);
__le64 snapid;
void *p;
void *end;
+ u64 pool_id;
char *image_id;
u64 overlap;
int ret;
p = reply_buf;
end = reply_buf + ret;
ret = -ERANGE;
- ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
- if (parent_spec->pool_id == CEPH_NOPOOL)
+ ceph_decode_64_safe(&p, end, pool_id, out_err);
+ if (pool_id == CEPH_NOPOOL) {
+ /*
+ * Either the parent never existed, or we have
+ * record of it but the image got flattened so it no
+ * longer has a parent. When the parent of a
+ * layered image disappears we immediately set the
+ * overlap to 0. The effect of this is that all new
+ * requests will be treated as if the image had no
+ * parent.
+ */
+ if (rbd_dev->parent_overlap) {
+ rbd_dev->parent_overlap = 0;
+ smp_mb();
+ rbd_dev_parent_put(rbd_dev);
+ pr_info("%s: clone image has been flattened\n",
+ rbd_dev->disk->disk_name);
+ }
+
goto out; /* No parent? No problem. */
+ }
/* The ceph file layout needs to fit pool id in 32 bits */
ret = -EIO;
- if (parent_spec->pool_id > (u64)U32_MAX) {
+ if (pool_id > (u64)U32_MAX) {
rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
- (unsigned long long)parent_spec->pool_id, U32_MAX);
+ (unsigned long long)pool_id, U32_MAX);
goto out_err;
}
+ parent_spec->pool_id = pool_id;
image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
if (IS_ERR(image_id)) {
ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
ceph_decode_64_safe(&p, end, overlap, out_err);
- rbd_dev->parent_overlap = overlap;
- rbd_dev->parent_spec = parent_spec;
- parent_spec = NULL; /* rbd_dev now owns this */
+ if (overlap) {
+ rbd_spec_put(rbd_dev->parent_spec);
+ rbd_dev->parent_spec = parent_spec;
+ parent_spec = NULL; /* rbd_dev now owns this */
+ rbd_dev->parent_overlap = overlap;
+ } else {
+ rbd_warn(rbd_dev, "ignoring parent of clone with overlap 0\n");
+ }
out:
ret = 0;
out_err:
goto out;
}
+ /*
+ * If the image supports layering, get the parent info. We
+ * need to probe the first time regardless. Thereafter we
+ * only need to if there's a parent, to see if it has
+ * disappeared due to the mapped image getting flattened.
+ */
+ if (rbd_dev->header.features & RBD_FEATURE_LAYERING &&
+ (first_time || rbd_dev->parent_spec)) {
+ bool warn;
+
+ ret = rbd_dev_v2_parent_info(rbd_dev);
+ if (ret)
+ goto out;
+
+ /*
+ * Print a warning if this is the initial probe and
+ * the image has a parent. Don't print it if the
+ * image now being probed is itself a parent. We
+ * can tell at this point because we won't know its
+ * pool name yet (just its pool id).
+ */
+ warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name;
+ if (first_time && warn)
+ rbd_warn(rbd_dev, "WARNING: kernel layering "
+ "is EXPERIMENTAL!");
+ }
+
ret = rbd_dev_v2_image_size(rbd_dev);
if (ret)
goto out;
+
if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
if (rbd_dev->mapping.size != rbd_dev->header.image_size)
rbd_dev->mapping.size = rbd_dev->header.image_size;
ret = rbd_dev_v2_snap_context(rbd_dev);
dout("rbd_dev_v2_snap_context returned %d\n", ret);
- if (ret)
- goto out;
out:
up_write(&rbd_dev->header_rwsem);
{
struct rbd_image_header *header;
- rbd_dev_remove_parent(rbd_dev);
- rbd_spec_put(rbd_dev->parent_spec);
- rbd_dev->parent_spec = NULL;
- rbd_dev->parent_overlap = 0;
+ /* Drop parent reference unless it's already been done (or none) */
+
+ if (rbd_dev->parent_overlap)
+ rbd_dev_parent_put(rbd_dev);
/* Free dynamic fields from the header, then zero it out */
if (ret)
goto out_err;
- /* If the image supports layering, get the parent info */
-
- if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
- ret = rbd_dev_v2_parent_info(rbd_dev);
- if (ret)
- goto out_err;
- /*
- * Print a warning if this image has a parent.
- * Don't print it if the image now being probed
- * is itself a parent. We can tell at this point
- * because we won't know its pool name yet (just its
- * pool id).
- */
- if (rbd_dev->parent_spec && rbd_dev->spec->pool_name)
- rbd_warn(rbd_dev, "WARNING: kernel layering "
- "is EXPERIMENTAL!");
- }
-
/* If the image supports fancy striping, get its parameters */
if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
return 0;
out_err:
- rbd_dev->parent_overlap = 0;
- rbd_spec_put(rbd_dev->parent_spec);
- rbd_dev->parent_spec = NULL;
- kfree(rbd_dev->header_name);
- rbd_dev->header_name = NULL;
+ rbd_dev->header.features = 0;
kfree(rbd_dev->header.object_prefix);
rbd_dev->header.object_prefix = NULL;
if (ret < 0)
goto out_err;
rbd_dev->parent = parent;
+ atomic_set(&rbd_dev->parent_ref, 1);
return 0;
out_err:
if (parent) {
- rbd_spec_put(rbd_dev->parent_spec);
+ rbd_dev_unparent(rbd_dev);
kfree(rbd_dev->header_name);
rbd_dev_destroy(parent);
} else {