do_div(tmp, HZ);
bytes_allowed = tmp;
- if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
+ if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
if (wait)
*wait = 0;
return 1;
}
/* Calc approx time to dispatch */
- extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
+ extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
if (!jiffy_wait)
bool rw = bio_data_dir(bio);
/* Charge the bio to the group */
- tg->bytes_disp[rw] += bio->bi_size;
+ tg->bytes_disp[rw] += bio->bi_iter.bi_size;
tg->io_disp[rw]++;
/*
*/
if (!(bio->bi_rw & REQ_THROTTLED)) {
bio->bi_rw |= REQ_THROTTLED;
- throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size,
- bio->bi_rw);
+ throtl_update_dispatch_stats(tg_to_blkg(tg),
+ bio->bi_iter.bi_size, bio->bi_rw);
}
}
return __blkg_prfill_rwstat(sf, pd, &rwstat);
}
-static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *sf)
+static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
{
- struct blkcg *blkcg = css_to_blkcg(css);
-
- blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
- cft->private, true);
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
+ &blkcg_policy_throtl, seq_cft(sf)->private, true);
return 0;
}
return __blkg_prfill_u64(sf, pd, v);
}
-static int tg_print_conf_u64(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *sf)
+static int tg_print_conf_u64(struct seq_file *sf, void *v)
{
- blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64,
- &blkcg_policy_throtl, cft->private, false);
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
+ &blkcg_policy_throtl, seq_cft(sf)->private, false);
return 0;
}
-static int tg_print_conf_uint(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *sf)
+static int tg_print_conf_uint(struct seq_file *sf, void *v)
{
- blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint,
- &blkcg_policy_throtl, cft->private, false);
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
+ &blkcg_policy_throtl, seq_cft(sf)->private, false);
return 0;
}
{
.name = "throttle.read_bps_device",
.private = offsetof(struct throtl_grp, bps[READ]),
- .read_seq_string = tg_print_conf_u64,
+ .seq_show = tg_print_conf_u64,
.write_string = tg_set_conf_u64,
.max_write_len = 256,
},
{
.name = "throttle.write_bps_device",
.private = offsetof(struct throtl_grp, bps[WRITE]),
- .read_seq_string = tg_print_conf_u64,
+ .seq_show = tg_print_conf_u64,
.write_string = tg_set_conf_u64,
.max_write_len = 256,
},
{
.name = "throttle.read_iops_device",
.private = offsetof(struct throtl_grp, iops[READ]),
- .read_seq_string = tg_print_conf_uint,
+ .seq_show = tg_print_conf_uint,
.write_string = tg_set_conf_uint,
.max_write_len = 256,
},
{
.name = "throttle.write_iops_device",
.private = offsetof(struct throtl_grp, iops[WRITE]),
- .read_seq_string = tg_print_conf_uint,
+ .seq_show = tg_print_conf_uint,
.write_string = tg_set_conf_uint,
.max_write_len = 256,
},
{
.name = "throttle.io_service_bytes",
.private = offsetof(struct tg_stats_cpu, service_bytes),
- .read_seq_string = tg_print_cpu_rwstat,
+ .seq_show = tg_print_cpu_rwstat,
},
{
.name = "throttle.io_serviced",
.private = offsetof(struct tg_stats_cpu, serviced),
- .read_seq_string = tg_print_cpu_rwstat,
+ .seq_show = tg_print_cpu_rwstat,
},
{ } /* terminate */
};
if (tg) {
if (!tg->has_rules[rw]) {
throtl_update_dispatch_stats(tg_to_blkg(tg),
- bio->bi_size, bio->bi_rw);
+ bio->bi_iter.bi_size, bio->bi_rw);
goto out_unlock_rcu;
}
}
/* out-of-limit, queue to @tg */
throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
rw == READ ? 'R' : 'W',
- tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
+ tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw],
tg->io_disp[rw], tg->iops[rw],
sq->nr_queued[READ], sq->nr_queued[WRITE]);
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
+#include <linux/idr.h>
#include "rbd_types.h"
}
#define RBD_DRV_NAME "rbd"
-#define RBD_DRV_NAME_LONG "rbd (rados block device)"
-#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
+#define RBD_MINORS_PER_MAJOR 256
+#define RBD_SINGLE_MAJOR_PART_SHIFT 4
#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
#define RBD_MAX_SNAP_NAME_LEN \
int dev_id; /* blkdev unique id */
int major; /* blkdev assigned major */
+ int minor;
struct gendisk *disk; /* blkdev's gendisk and rq */
u32 image_format; /* Either 1 or 2 */
static struct kmem_cache *rbd_obj_request_cache;
static struct kmem_cache *rbd_segment_name_cache;
+static int rbd_major;
+static DEFINE_IDA(rbd_dev_id_ida);
+
+/*
+ * Default to false for now, as single-major requires >= 0.75 version of
+ * userspace rbd utility.
+ */
+static bool single_major = false;
+module_param(single_major, bool, S_IRUGO);
+MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
+
static int rbd_img_request_submit(struct rbd_img_request *img_request);
static void rbd_dev_device_release(struct device *dev);
size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
size_t count);
+static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
+ size_t count);
+static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
+ size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
static void rbd_spec_put(struct rbd_spec *spec);
+static int rbd_dev_id_to_minor(int dev_id)
+{
+ return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
+}
+
+static int minor_to_rbd_dev_id(int minor)
+{
+ return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
+}
+
static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
+static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
+static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
static struct attribute *rbd_bus_attrs[] = {
&bus_attr_add.attr,
&bus_attr_remove.attr,
+ &bus_attr_add_single_major.attr,
+ &bus_attr_remove_single_major.attr,
NULL,
};
-ATTRIBUTE_GROUPS(rbd_bus);
+
+static umode_t rbd_bus_is_visible(struct kobject *kobj,
+ struct attribute *attr, int index)
+{
+ if (!single_major &&
+ (attr == &bus_attr_add_single_major.attr ||
+ attr == &bus_attr_remove_single_major.attr))
+ return 0;
+
+ return attr->mode;
+}
+
+static const struct attribute_group rbd_bus_group = {
+ .attrs = rbd_bus_attrs,
+ .is_visible = rbd_bus_is_visible,
+};
+__ATTRIBUTE_GROUPS(rbd_bus);
static struct bus_type rbd_bus_type = {
.name = "rbd",
name_format = "%s.%012llx";
if (rbd_dev->image_format == 2)
name_format = "%s.%016llx";
- ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, name_format,
+ ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
rbd_dev->header.object_prefix, segment);
- if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
+ if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
pr_err("error formatting segment name for #%llu (%d)\n",
segment, ret);
kfree(name);
*/
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
- struct bio_vec *bv;
+ struct bio_vec bv;
+ struct bvec_iter iter;
unsigned long flags;
void *buf;
- int i;
int pos = 0;
while (chain) {
- bio_for_each_segment(bv, chain, i) {
- if (pos + bv->bv_len > start_ofs) {
+ bio_for_each_segment(bv, chain, iter) {
+ if (pos + bv.bv_len > start_ofs) {
int remainder = max(start_ofs - pos, 0);
- buf = bvec_kmap_irq(bv, &flags);
+ buf = bvec_kmap_irq(&bv, &flags);
memset(buf + remainder, 0,
- bv->bv_len - remainder);
- flush_dcache_page(bv->bv_page);
+ bv.bv_len - remainder);
+ flush_dcache_page(bv.bv_page);
bvec_kunmap_irq(buf, &flags);
}
- pos += bv->bv_len;
+ pos += bv.bv_len;
}
chain = chain->bi_next;
unsigned int len,
gfp_t gfpmask)
{
- struct bio_vec *bv;
- unsigned int resid;
- unsigned short idx;
- unsigned int voff;
- unsigned short end_idx;
- unsigned short vcnt;
struct bio *bio;
- /* Handle the easy case for the caller */
-
- if (!offset && len == bio_src->bi_size)
- return bio_clone(bio_src, gfpmask);
-
- if (WARN_ON_ONCE(!len))
- return NULL;
- if (WARN_ON_ONCE(len > bio_src->bi_size))
- return NULL;
- if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
- return NULL;
-
- /* Find first affected segment... */
-
- resid = offset;
- bio_for_each_segment(bv, bio_src, idx) {
- if (resid < bv->bv_len)
- break;
- resid -= bv->bv_len;
- }
- voff = resid;
-
- /* ...and the last affected segment */
-
- resid += len;
- __bio_for_each_segment(bv, bio_src, end_idx, idx) {
- if (resid <= bv->bv_len)
- break;
- resid -= bv->bv_len;
- }
- vcnt = end_idx - idx + 1;
-
- /* Build the clone */
-
- bio = bio_alloc(gfpmask, (unsigned int) vcnt);
+ bio = bio_clone(bio_src, gfpmask);
if (!bio)
return NULL; /* ENOMEM */
- bio->bi_bdev = bio_src->bi_bdev;
- bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
- bio->bi_rw = bio_src->bi_rw;
- bio->bi_flags |= 1 << BIO_CLONED;
-
- /*
- * Copy over our part of the bio_vec, then update the first
- * and last (or only) entries.
- */
- memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
- vcnt * sizeof (struct bio_vec));
- bio->bi_io_vec[0].bv_offset += voff;
- if (vcnt > 1) {
- bio->bi_io_vec[0].bv_len -= voff;
- bio->bi_io_vec[vcnt - 1].bv_len = resid;
- } else {
- bio->bi_io_vec[0].bv_len = len;
- }
-
- bio->bi_vcnt = vcnt;
- bio->bi_size = len;
- bio->bi_idx = 0;
+ bio_advance(bio, offset);
+ bio->bi_iter.bi_size = len;
return bio;
}
/* Build up a chain of clone bios up to the limit */
- if (!bi || off >= bi->bi_size || !len)
+ if (!bi || off >= bi->bi_iter.bi_size || !len)
return NULL; /* Nothing to clone */
end = &chain;
rbd_warn(NULL, "bio_chain exhausted with %u left", len);
goto out_err; /* EINVAL; ran out of bio's */
}
- bi_size = min_t(unsigned int, bi->bi_size - off, len);
+ bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
bio = bio_clone_range(bi, off, bi_size, gfpmask);
if (!bio)
goto out_err; /* ENOMEM */
end = &bio->bi_next;
off += bi_size;
- if (off == bi->bi_size) {
+ if (off == bi->bi_iter.bi_size) {
bi = bi->bi_next;
off = 0;
}
osd_req->r_callback = rbd_osd_req_callback;
osd_req->r_priv = obj_request;
- osd_req->r_oid_len = strlen(obj_request->object_name);
- rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
- memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
-
- osd_req->r_file_layout = rbd_dev->layout; /* struct */
+ osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+ ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
return osd_req;
}
osd_req->r_callback = rbd_osd_req_callback;
osd_req->r_priv = obj_request;
- osd_req->r_oid_len = strlen(obj_request->object_name);
- rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
- memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
-
- osd_req->r_file_layout = rbd_dev->layout; /* struct */
+ osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+ ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
return osd_req;
}
if (type == OBJ_REQUEST_BIO) {
bio_list = data_desc;
- rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
+ rbd_assert(img_offset ==
+ bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
} else {
rbd_assert(type == OBJ_REQUEST_PAGES);
pages = data_desc;
* Request sync osd watch/unwatch. The value of "start" determines
* whether a watch request is being initiated or torn down.
*/
-static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
+static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
{
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
struct rbd_obj_request *obj_request;
return ret;
}
+static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
+{
+ return __rbd_dev_header_watch_sync(rbd_dev, true);
+}
+
+static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
+{
+ int ret;
+
+ ret = __rbd_dev_header_watch_sync(rbd_dev, false);
+ if (ret) {
+ rbd_warn(rbd_dev, "unable to tear down watch request: %d\n",
+ ret);
+ }
+}
+
/*
* Synchronous osd object method call. Returns the number of bytes
* returned in the outbound buffer, or a negative error code.
u64 segment_size;
/* create gendisk info */
- disk = alloc_disk(RBD_MINORS_PER_MAJOR);
+ disk = alloc_disk(single_major ?
+ (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
+ RBD_MINORS_PER_MAJOR);
if (!disk)
return -ENOMEM;
snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
rbd_dev->dev_id);
disk->major = rbd_dev->major;
- disk->first_minor = 0;
+ disk->first_minor = rbd_dev->minor;
+ if (single_major)
+ disk->flags |= GENHD_FL_EXT_DEVT;
disk->fops = &rbd_bd_ops;
disk->private_data = rbd_dev;
return sprintf(buf, "%d\n", rbd_dev->major);
return sprintf(buf, "(none)\n");
+}
+static ssize_t rbd_minor_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+ return sprintf(buf, "%d\n", rbd_dev->minor);
}
static ssize_t rbd_client_id_show(struct device *dev,
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
+static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
&dev_attr_size.attr,
&dev_attr_features.attr,
&dev_attr_major.attr,
+ &dev_attr_minor.attr,
&dev_attr_client_id.attr,
&dev_attr_pool.attr,
&dev_attr_pool_id.attr,
device_unregister(&rbd_dev->dev);
}
-static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
-
/*
* Get a unique rbd identifier for the given new rbd_dev, and add
- * the rbd_dev to the global list. The minimum rbd id is 1.
+ * the rbd_dev to the global list.
*/
-static void rbd_dev_id_get(struct rbd_device *rbd_dev)
+static int rbd_dev_id_get(struct rbd_device *rbd_dev)
{
- rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
+ int new_dev_id;
+
+ new_dev_id = ida_simple_get(&rbd_dev_id_ida,
+ 0, minor_to_rbd_dev_id(1 << MINORBITS),
+ GFP_KERNEL);
+ if (new_dev_id < 0)
+ return new_dev_id;
+
+ rbd_dev->dev_id = new_dev_id;
spin_lock(&rbd_dev_list_lock);
list_add_tail(&rbd_dev->node, &rbd_dev_list);
spin_unlock(&rbd_dev_list_lock);
- dout("rbd_dev %p given dev id %llu\n", rbd_dev,
- (unsigned long long) rbd_dev->dev_id);
+
+ dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id);
+
+ return 0;
}
/*
*/
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
- struct list_head *tmp;
- int rbd_id = rbd_dev->dev_id;
- int max_id;
-
- rbd_assert(rbd_id > 0);
-
- dout("rbd_dev %p released dev id %llu\n", rbd_dev,
- (unsigned long long) rbd_dev->dev_id);
spin_lock(&rbd_dev_list_lock);
list_del_init(&rbd_dev->node);
-
- /*
- * If the id being "put" is not the current maximum, there
- * is nothing special we need to do.
- */
- if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
- spin_unlock(&rbd_dev_list_lock);
- return;
- }
-
- /*
- * We need to update the current maximum id. Search the
- * list to find out what it is. We're more likely to find
- * the maximum at the end, so search the list backward.
- */
- max_id = 0;
- list_for_each_prev(tmp, &rbd_dev_list) {
- struct rbd_device *rbd_dev;
-
- rbd_dev = list_entry(tmp, struct rbd_device, node);
- if (rbd_dev->dev_id > max_id)
- max_id = rbd_dev->dev_id;
- }
spin_unlock(&rbd_dev_list_lock);
- /*
- * The max id could have been updated by rbd_dev_id_get(), in
- * which case it now accurately reflects the new maximum.
- * Be careful not to overwrite the maximum value in that
- * case.
- */
- atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
- dout(" max dev id has been reset\n");
+ ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
+
+ dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id);
}
/*
{
int ret;
- /* generate unique id: find highest unique id, add one */
- rbd_dev_id_get(rbd_dev);
+ /* Get an id and fill in device name. */
+
+ ret = rbd_dev_id_get(rbd_dev);
+ if (ret)
+ return ret;
- /* Fill in the device name, now that we have its id. */
BUILD_BUG_ON(DEV_NAME_LEN
< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
- /* Get our block major device number. */
+ /* Record our major and minor device numbers. */
- ret = register_blkdev(0, rbd_dev->name);
- if (ret < 0)
- goto err_out_id;
- rbd_dev->major = ret;
+ if (!single_major) {
+ ret = register_blkdev(0, rbd_dev->name);
+ if (ret < 0)
+ goto err_out_id;
+
+ rbd_dev->major = ret;
+ rbd_dev->minor = 0;
+ } else {
+ rbd_dev->major = rbd_major;
+ rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
+ }
/* Set up the blkdev mapping. */
err_out_disk:
rbd_free_disk(rbd_dev);
err_out_blkdev:
- unregister_blkdev(rbd_dev->major, rbd_dev->name);
+ if (!single_major)
+ unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
rbd_dev_id_put(rbd_dev);
rbd_dev_mapping_clear(rbd_dev);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
{
int ret;
- int tmp;
/*
* Get the id from the image id object. Unless there's an
goto err_out_format;
if (mapping) {
- ret = rbd_dev_header_watch_sync(rbd_dev, true);
+ ret = rbd_dev_header_watch_sync(rbd_dev);
if (ret)
goto out_header_name;
}
err_out_probe:
rbd_dev_unprobe(rbd_dev);
err_out_watch:
- if (mapping) {
- tmp = rbd_dev_header_watch_sync(rbd_dev, false);
- if (tmp)
- rbd_warn(rbd_dev, "unable to tear down "
- "watch request (%d)\n", tmp);
- }
+ if (mapping)
+ rbd_dev_header_unwatch_sync(rbd_dev);
out_header_name:
kfree(rbd_dev->header_name);
rbd_dev->header_name = NULL;
return ret;
}
-static ssize_t rbd_add(struct bus_type *bus,
- const char *buf,
- size_t count)
+static ssize_t do_rbd_add(struct bus_type *bus,
+ const char *buf,
+ size_t count)
{
struct rbd_device *rbd_dev = NULL;
struct ceph_options *ceph_opts = NULL;
rc = rbd_dev_device_setup(rbd_dev);
if (rc) {
+ /*
+ * rbd_dev_header_unwatch_sync() can't be moved into
+ * rbd_dev_image_release() without refactoring, see
+ * commit 1f3ef78861ac.
+ */
+ rbd_dev_header_unwatch_sync(rbd_dev);
rbd_dev_image_release(rbd_dev);
goto err_out_module;
}
return (ssize_t)rc;
}
+static ssize_t rbd_add(struct bus_type *bus,
+ const char *buf,
+ size_t count)
+{
+ if (single_major)
+ return -EINVAL;
+
+ return do_rbd_add(bus, buf, count);
+}
+
+static ssize_t rbd_add_single_major(struct bus_type *bus,
+ const char *buf,
+ size_t count)
+{
+ return do_rbd_add(bus, buf, count);
+}
+
static void rbd_dev_device_release(struct device *dev)
{
struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
rbd_free_disk(rbd_dev);
clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
rbd_dev_mapping_clear(rbd_dev);
- unregister_blkdev(rbd_dev->major, rbd_dev->name);
- rbd_dev->major = 0;
+ if (!single_major)
+ unregister_blkdev(rbd_dev->major, rbd_dev->name);
rbd_dev_id_put(rbd_dev);
rbd_dev_mapping_clear(rbd_dev);
}
}
}
-static ssize_t rbd_remove(struct bus_type *bus,
- const char *buf,
- size_t count)
+static ssize_t do_rbd_remove(struct bus_type *bus,
+ const char *buf,
+ size_t count)
{
struct rbd_device *rbd_dev = NULL;
struct list_head *tmp;
if (ret < 0 || already)
return ret;
- ret = rbd_dev_header_watch_sync(rbd_dev, false);
- if (ret)
- rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
-
+ rbd_dev_header_unwatch_sync(rbd_dev);
/*
* flush remaining watch callbacks - these must be complete
* before the osd_client is shutdown
*/
dout("%s: flushing notifies", __func__);
ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
+
/*
* Don't free anything from rbd_dev->disk until after all
* notifies are completely processed. Otherwise
return count;
}
+static ssize_t rbd_remove(struct bus_type *bus,
+ const char *buf,
+ size_t count)
+{
+ if (single_major)
+ return -EINVAL;
+
+ return do_rbd_remove(bus, buf, count);
+}
+
+static ssize_t rbd_remove_single_major(struct bus_type *bus,
+ const char *buf,
+ size_t count)
+{
+ return do_rbd_remove(bus, buf, count);
+}
+
/*
* create control files in sysfs
* /sys/bus/rbd/...
rbd_assert(!rbd_segment_name_cache);
rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
- MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL);
+ CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
if (rbd_segment_name_cache)
return 0;
out_err:
if (!libceph_compatible(NULL)) {
rbd_warn(NULL, "libceph incompatibility (quitting)");
-
return -EINVAL;
}
+
rc = rbd_slab_init();
if (rc)
return rc;
+
+ if (single_major) {
+ rbd_major = register_blkdev(0, RBD_DRV_NAME);
+ if (rbd_major < 0) {
+ rc = rbd_major;
+ goto err_out_slab;
+ }
+ }
+
rc = rbd_sysfs_init();
if (rc)
- rbd_slab_exit();
+ goto err_out_blkdev;
+
+ if (single_major)
+ pr_info("loaded (major %d)\n", rbd_major);
else
- pr_info("loaded " RBD_DRV_NAME_LONG "\n");
+ pr_info("loaded\n");
+
+ return 0;
+err_out_blkdev:
+ if (single_major)
+ unregister_blkdev(rbd_major, RBD_DRV_NAME);
+err_out_slab:
+ rbd_slab_exit();
return rc;
}
static void __exit rbd_exit(void)
{
rbd_sysfs_cleanup();
+ if (single_major)
+ unregister_blkdev(rbd_major, RBD_DRV_NAME);
rbd_slab_exit();
}
MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
-MODULE_DESCRIPTION("rados block device");
-
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
+MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");
char *type;
int len;
/* no unplug has been done: do not hook devices != xen vbds */
- if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) {
+ if (xen_has_pv_and_legacy_disk_devices()) {
int major;
if (!VDEV_IS_EXTENDED(vdevice))
for (i = 0; i < pending; i++) {
offset = (i * segs * PAGE_SIZE) >> 9;
size = min((unsigned int)(segs * PAGE_SIZE) >> 9,
- (unsigned int)(bio->bi_size >> 9) - offset);
+ (unsigned int)bio_sectors(bio) - offset);
cloned_bio = bio_clone(bio, GFP_NOIO);
BUG_ON(cloned_bio == NULL);
bio_trim(cloned_bio, offset, size);
if (!xen_domain())
return -ENODEV;
- if (xen_hvm_domain() && !xen_platform_pci_unplug)
+ if (!xen_has_pv_disk_devices())
return -ENODEV;
if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
static void bcachecg_destroy(struct cgroup *cgroup)
{
struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
- free_css_id(&bcache_subsys, &cg->css);
kfree(cg);
}
static void bio_csum(struct bio *bio, struct bkey *k)
{
- struct bio_vec *bv;
+ struct bio_vec bv;
+ struct bvec_iter iter;
uint64_t csum = 0;
- int i;
- bio_for_each_segment(bv, bio, i) {
- void *d = kmap(bv->bv_page) + bv->bv_offset;
- csum = bch_crc64_update(csum, d, bv->bv_len);
- kunmap(bv->bv_page);
+ bio_for_each_segment(bv, bio, iter) {
+ void *d = kmap(bv.bv_page) + bv.bv_offset;
+ csum = bch_crc64_update(csum, d, bv.bv_len);
+ kunmap(bv.bv_page);
}
k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
struct bio *bio = op->bio;
pr_debug("invalidating %i sectors from %llu",
- bio_sectors(bio), (uint64_t) bio->bi_sector);
+ bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector);
while (bio_sectors(bio)) {
unsigned sectors = min(bio_sectors(bio),
if (bch_keylist_realloc(&op->insert_keys, 0, op->c))
goto out;
- bio->bi_sector += sectors;
- bio->bi_size -= sectors << 9;
+ bio->bi_iter.bi_sector += sectors;
+ bio->bi_iter.bi_size -= sectors << 9;
bch_keylist_add(&op->insert_keys,
- &KEY(op->inode, bio->bi_sector, sectors));
+ &KEY(op->inode, bio->bi_iter.bi_sector, sectors));
}
op->insert_data_done = true;
k = op->insert_keys.top;
bkey_init(k);
SET_KEY_INODE(k, op->inode);
- SET_KEY_OFFSET(k, bio->bi_sector);
+ SET_KEY_OFFSET(k, bio->bi_iter.bi_sector);
if (!bch_alloc_sectors(op->c, k, bio_sectors(bio),
op->write_point, op->write_prio,
op->writeback))
goto err;
- n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
+ n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);
n->bi_end_io = bch_data_insert_endio;
n->bi_private = cl;
(bio->bi_rw & REQ_WRITE)))
goto skip;
- if (bio->bi_sector & (c->sb.block_size - 1) ||
+ if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
bio_sectors(bio) & (c->sb.block_size - 1)) {
pr_debug("skipping unaligned io");
goto skip;
spin_lock(&dc->io_lock);
- hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
- if (i->last == bio->bi_sector &&
+ hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
+ if (i->last == bio->bi_iter.bi_sector &&
time_before(jiffies, i->jiffies))
goto found;
add_sequential(task);
i->sequential = 0;
found:
- if (i->sequential + bio->bi_size > i->sequential)
- i->sequential += bio->bi_size;
+ if (i->sequential + bio->bi_iter.bi_size > i->sequential)
+ i->sequential += bio->bi_iter.bi_size;
i->last = bio_end_sector(bio);
i->jiffies = jiffies + msecs_to_jiffies(5000);
unsigned insert_bio_sectors;
unsigned recoverable:1;
- unsigned unaligned_bvec:1;
unsigned write:1;
unsigned read_dirty_data:1;
struct bkey *bio_key;
unsigned ptr;
- if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_sector, 0)) <= 0)
+ if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0)
return MAP_CONTINUE;
if (KEY_INODE(k) != s->iop.inode ||
- KEY_START(k) > bio->bi_sector) {
+ KEY_START(k) > bio->bi_iter.bi_sector) {
unsigned bio_sectors = bio_sectors(bio);
unsigned sectors = KEY_INODE(k) == s->iop.inode
? min_t(uint64_t, INT_MAX,
- KEY_START(k) - bio->bi_sector)
+ KEY_START(k) - bio->bi_iter.bi_sector)
: INT_MAX;
int ret = s->d->cache_miss(b, s, bio, sectors);
if (KEY_DIRTY(k))
s->read_dirty_data = true;
- n = bch_bio_split(bio, min_t(uint64_t, INT_MAX,
- KEY_OFFSET(k) - bio->bi_sector),
- GFP_NOIO, s->d->bio_split);
+ n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
+ KEY_OFFSET(k) - bio->bi_iter.bi_sector),
+ GFP_NOIO, s->d->bio_split);
bio_key = &container_of(n, struct bbio, bio)->key;
bch_bkey_copy_single_ptr(bio_key, k, ptr);
- bch_cut_front(&KEY(s->iop.inode, n->bi_sector, 0), bio_key);
+ bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key);
bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);
n->bi_end_io = bch_cache_read_endio;
struct bio *bio = &s->bio.bio;
int ret = bch_btree_map_keys(&s->op, s->iop.c,
- &KEY(s->iop.inode, bio->bi_sector, 0),
+ &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
cache_lookup_fn, MAP_END_KEY);
if (ret == -EAGAIN)
continue_at(cl, cache_lookup, bcache_wq);
static void do_bio_hook(struct search *s)
{
struct bio *bio = &s->bio.bio;
- memcpy(bio, s->orig_bio, sizeof(struct bio));
+ bio_init(bio);
+ __bio_clone_fast(bio, s->orig_bio);
bio->bi_end_io = request_endio;
bio->bi_private = &s->cl;
+
atomic_set(&bio->bi_cnt, 3);
}
if (s->iop.bio)
bio_put(s->iop.bio);
- if (s->unaligned_bvec)
- mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec);
-
closure_debug_destroy(cl);
mempool_free(s, s->d->c->search);
}
static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
{
struct search *s;
- struct bio_vec *bv;
s = mempool_alloc(d->c->search, GFP_NOIO);
memset(s, 0, offsetof(struct search, iop.insert_keys));
s->start_time = jiffies;
do_bio_hook(s);
- if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) {
- bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO);
- memcpy(bv, bio_iovec(bio),
- sizeof(struct bio_vec) * bio_segments(bio));
-
- s->bio.bio.bi_io_vec = bv;
- s->unaligned_bvec = 1;
- }
-
return s;
}
{
struct search *s = container_of(cl, struct search, cl);
struct bio *bio = &s->bio.bio;
- struct bio_vec *bv;
- int i;
if (s->recoverable) {
/* Retry from the backing device: */
trace_bcache_read_retry(s->orig_bio);
s->iop.error = 0;
- bv = s->bio.bio.bi_io_vec;
do_bio_hook(s);
- s->bio.bio.bi_io_vec = bv;
-
- if (!s->unaligned_bvec)
- bio_for_each_segment(bv, s->orig_bio, i)
- bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
- else
- memcpy(s->bio.bio.bi_io_vec,
- bio_iovec(s->orig_bio),
- sizeof(struct bio_vec) *
- bio_segments(s->orig_bio));
/* XXX: invalidate cache */
if (s->iop.bio) {
bio_reset(s->iop.bio);
- s->iop.bio->bi_sector = s->cache_miss->bi_sector;
+ s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector;
s->iop.bio->bi_bdev = s->cache_miss->bi_bdev;
- s->iop.bio->bi_size = s->insert_bio_sectors << 9;
+ s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
bch_bio_map(s->iop.bio, NULL);
bio_copy_data(s->cache_miss, s->iop.bio);
s->cache_miss = NULL;
}
- if (verify(dc, &s->bio.bio) && s->recoverable &&
- !s->unaligned_bvec && !s->read_dirty_data)
+ if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data)
bch_data_verify(dc, s->orig_bio);
bio_complete(s);
struct bio *miss, *cache_bio;
if (s->cache_miss || s->iop.bypass) {
- miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+ miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
goto out_submit;
}
s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada);
s->iop.replace_key = KEY(s->iop.inode,
- bio->bi_sector + s->insert_bio_sectors,
+ bio->bi_iter.bi_sector + s->insert_bio_sectors,
s->insert_bio_sectors);
ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key);
s->iop.replace = true;
- miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+ miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
/* btree_search_recurse()'s btree iterator is no good anymore */
ret = miss == bio ? MAP_DONE : -EINTR;
if (!cache_bio)
goto out_submit;
- cache_bio->bi_sector = miss->bi_sector;
- cache_bio->bi_bdev = miss->bi_bdev;
- cache_bio->bi_size = s->insert_bio_sectors << 9;
+ cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector;
+ cache_bio->bi_bdev = miss->bi_bdev;
+ cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
cache_bio->bi_end_io = request_endio;
cache_bio->bi_private = &s->cl;
{
struct closure *cl = &s->cl;
struct bio *bio = &s->bio.bio;
- struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0);
+ struct bkey start = KEY(dc->disk.id, bio->bi_iter.bi_sector, 0);
struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0);
bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end);
closure_bio_submit(flush, cl, s->d);
}
} else {
- s->iop.bio = bio_clone_bioset(bio, GFP_NOIO,
- dc->disk.bio_split);
+ s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split);
closure_bio_submit(bio, cl, s->d);
}
part_stat_unlock();
bio->bi_bdev = dc->bdev;
- bio->bi_sector += dc->sb.data_offset;
+ bio->bi_iter.bi_sector += dc->sb.data_offset;
if (cached_dev_get(dc)) {
s = search_alloc(bio, d);
trace_bcache_request_start(s->d, bio);
- if (!bio->bi_size) {
+ if (!bio->bi_iter.bi_size) {
/*
* can't call bch_journal_meta from under
* generic_make_request
static int flash_dev_cache_miss(struct btree *b, struct search *s,
struct bio *bio, unsigned sectors)
{
- struct bio_vec *bv;
- int i;
+ struct bio_vec bv;
+ struct bvec_iter iter;
/* Zero fill bio */
- bio_for_each_segment(bv, bio, i) {
- unsigned j = min(bv->bv_len >> 9, sectors);
+ bio_for_each_segment(bv, bio, iter) {
+ unsigned j = min(bv.bv_len >> 9, sectors);
- void *p = kmap(bv->bv_page);
- memset(p + bv->bv_offset, 0, j << 9);
- kunmap(bv->bv_page);
+ void *p = kmap(bv.bv_page);
+ memset(p + bv.bv_offset, 0, j << 9);
+ kunmap(bv.bv_page);
sectors -= j;
}
- bio_advance(bio, min(sectors << 9, bio->bi_size));
+ bio_advance(bio, min(sectors << 9, bio->bi_iter.bi_size));
- if (!bio->bi_size)
+ if (!bio->bi_iter.bi_size)
return MAP_DONE;
return MAP_CONTINUE;
trace_bcache_request_start(s->d, bio);
- if (!bio->bi_size) {
+ if (!bio->bi_iter.bi_size) {
/*
* can't call bch_journal_meta from under
* generic_make_request
bcache_wq);
} else if (rw) {
bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
- &KEY(d->id, bio->bi_sector, 0),
+ &KEY(d->id, bio->bi_iter.bi_sector, 0),
&KEY(d->id, bio_end_sector(bio), 0));
s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0;
struct list_head reserved_buffers;
unsigned need_reserved_buffers;
+ unsigned minimum_buffers;
+
struct hlist_head *cache_hash;
wait_queue_head_t free_buffer_wait;
bio_init(&b->bio);
b->bio.bi_io_vec = b->bio_vec;
b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
- b->bio.bi_sector = block << b->c->sectors_per_block_bits;
+ b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
b->bio.bi_bdev = b->c->bdev;
b->bio.bi_end_io = end_io;
buffers = dm_bufio_cache_size_per_client >>
(c->sectors_per_block_bits + SECTOR_SHIFT);
- if (buffers < DM_BUFIO_MIN_BUFFERS)
- buffers = DM_BUFIO_MIN_BUFFERS;
+ if (buffers < c->minimum_buffers)
+ buffers = c->minimum_buffers;
*limit_buffers = buffers;
*threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
}
EXPORT_SYMBOL_GPL(dm_bufio_release_move);
+/*
+ * Free the given buffer.
+ *
+ * This is just a hint, if the buffer is in use or dirty, this function
+ * does nothing.
+ */
+void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
+{
+ struct dm_buffer *b;
+
+ dm_bufio_lock(c);
+
+ b = __find(c, block);
+ if (b && likely(!b->hold_count) && likely(!b->state)) {
+ __unlink_buffer(b);
+ __free_buffer_wake(b);
+ }
+
+ dm_bufio_unlock(c);
+}
+EXPORT_SYMBOL(dm_bufio_forget);
+
+void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
+{
+ c->minimum_buffers = n;
+}
+EXPORT_SYMBOL(dm_bufio_set_minimum_buffers);
+
unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
{
return c->block_size;
INIT_LIST_HEAD(&c->reserved_buffers);
c->need_reserved_buffers = reserved_buffers;
+ c->minimum_buffers = DM_BUFIO_MIN_BUFFERS;
+
init_waitqueue_head(&c->free_buffer_wait);
c->async_write_error = 0;
static void iot_update_stats(struct io_tracker *t, struct bio *bio)
{
- if (bio->bi_sector == from_oblock(t->last_end_oblock) + 1)
+ if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1)
t->nr_seq_samples++;
else {
/*
t->nr_rand_samples++;
}
- t->last_end_oblock = to_oblock(bio->bi_sector + bio_sectors(bio) - 1);
+ t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1);
}
static void iot_check_for_pattern_switch(struct io_tracker *t)
static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
{
struct entry *e = ep->entries + from_cblock(cblock);
- list_del(&e->list);
- INIT_LIST_HEAD(&e->list);
+ list_del_init(&e->list);
INIT_HLIST_NODE(&e->hlist);
ep->nr_allocated++;
*/
unsigned promote_threshold;
+ unsigned discard_promote_adjustment;
+ unsigned read_promote_adjustment;
+ unsigned write_promote_adjustment;
+
/*
* The hash table allows us to quickly find an entry by origin
* block. Both pre_cache and cache entries are in here.
struct hlist_head *table;
};
+#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1
+#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4
+#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8
+
/*----------------------------------------------------------------*/
/*
* We bias towards reads, since they can be demoted at no cost if they
* haven't been dirtied.
*/
-#define DISCARDED_PROMOTE_THRESHOLD 1
-#define READ_PROMOTE_THRESHOLD 4
-#define WRITE_PROMOTE_THRESHOLD 8
-
static unsigned adjusted_promote_threshold(struct mq_policy *mq,
bool discarded_oblock, int data_dir)
{
if (data_dir == READ)
- return mq->promote_threshold + READ_PROMOTE_THRESHOLD;
+ return mq->promote_threshold + mq->read_promote_adjustment;
if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
/*
* We don't need to do any copying at all, so give this a
* very low threshold.
*/
- return DISCARDED_PROMOTE_THRESHOLD;
+ return mq->discard_promote_adjustment;
}
- return mq->promote_threshold + WRITE_PROMOTE_THRESHOLD;
+ return mq->promote_threshold + mq->write_promote_adjustment;
}
static bool should_promote(struct mq_policy *mq, struct entry *e,
bool can_migrate, bool discarded_oblock,
int data_dir, struct policy_result *result)
{
- if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) == 1) {
+ if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
if (can_migrate)
insert_in_cache(mq, oblock, result);
else
const char *key, const char *value)
{
struct mq_policy *mq = to_mq_policy(p);
- enum io_pattern pattern;
unsigned long tmp;
- if (!strcasecmp(key, "random_threshold"))
- pattern = PATTERN_RANDOM;
- else if (!strcasecmp(key, "sequential_threshold"))
- pattern = PATTERN_SEQUENTIAL;
- else
- return -EINVAL;
-
if (kstrtoul(value, 10, &tmp))
return -EINVAL;
- mq->tracker.thresholds[pattern] = tmp;
+ if (!strcasecmp(key, "random_threshold")) {
+ mq->tracker.thresholds[PATTERN_RANDOM] = tmp;
+
+ } else if (!strcasecmp(key, "sequential_threshold")) {
+ mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp;
+
+ } else if (!strcasecmp(key, "discard_promote_adjustment"))
+ mq->discard_promote_adjustment = tmp;
+
+ else if (!strcasecmp(key, "read_promote_adjustment"))
+ mq->read_promote_adjustment = tmp;
+
+ else if (!strcasecmp(key, "write_promote_adjustment"))
+ mq->write_promote_adjustment = tmp;
+
+ else
+ return -EINVAL;
return 0;
}
ssize_t sz = 0;
struct mq_policy *mq = to_mq_policy(p);
- DMEMIT("4 random_threshold %u sequential_threshold %u",
+ DMEMIT("10 random_threshold %u "
+ "sequential_threshold %u "
+ "discard_promote_adjustment %u "
+ "read_promote_adjustment %u "
+ "write_promote_adjustment %u",
mq->tracker.thresholds[PATTERN_RANDOM],
- mq->tracker.thresholds[PATTERN_SEQUENTIAL]);
+ mq->tracker.thresholds[PATTERN_SEQUENTIAL],
+ mq->discard_promote_adjustment,
+ mq->read_promote_adjustment,
+ mq->write_promote_adjustment);
return 0;
}
mq->hit_count = 0;
mq->generation = 0;
mq->promote_threshold = 0;
+ mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT;
+ mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT;
+ mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT;
mutex_init(&mq->lock);
spin_lock_init(&mq->tick_lock);
static struct dm_cache_policy_type mq_policy_type = {
.name = "mq",
- .version = {1, 1, 0},
+ .version = {1, 2, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = mq_create
static struct dm_cache_policy_type default_policy_type = {
.name = "default",
- .version = {1, 1, 0},
+ .version = {1, 2, 0},
.hint_size = 4,
.owner = THIS_MODULE,
- .create = mq_create
+ .create = mq_create,
+ .real = &mq_policy_type
};
static int __init mq_init(void)
{
bio->bi_end_io = h->bi_end_io;
bio->bi_private = h->bi_private;
+
+ /*
+ * Must bump bi_remaining to allow bio to complete with
+ * restored bi_end_io.
+ */
+ atomic_inc(&bio->bi_remaining);
}
/*----------------------------------------------------------------*/
static void remap_to_cache(struct cache *cache, struct bio *bio,
dm_cblock_t cblock)
{
- sector_t bi_sector = bio->bi_sector;
+ sector_t bi_sector = bio->bi_iter.bi_sector;
bio->bi_bdev = cache->cache_dev->bdev;
if (!block_size_is_power_of_two(cache))
- bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
- sector_div(bi_sector, cache->sectors_per_block);
+ bio->bi_iter.bi_sector =
+ (from_cblock(cblock) * cache->sectors_per_block) +
+ sector_div(bi_sector, cache->sectors_per_block);
else
- bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
- (bi_sector & (cache->sectors_per_block - 1));
+ bio->bi_iter.bi_sector =
+ (from_cblock(cblock) << cache->sectors_per_block_shift) |
+ (bi_sector & (cache->sectors_per_block - 1));
}
static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
{
- sector_t block_nr = bio->bi_sector;
+ sector_t block_nr = bio->bi_iter.bi_sector;
if (!block_size_is_power_of_two(cache))
(void) sector_div(block_nr, cache->sectors_per_block);
static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
{
return (bio_data_dir(bio) == WRITE) &&
- (bio->bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
+ (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
}
static void avoid_copy(struct dm_cache_migration *mg)
size_t pb_data_size = get_per_bio_data_size(cache);
struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
- BUG_ON(bio->bi_size);
+ BUG_ON(bio->bi_iter.bi_size);
if (!pb->req_nr)
remap_to_origin(cache, bio);
else
*/
static void process_discard_bio(struct cache *cache, struct bio *bio)
{
- dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
+ dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector,
cache->discard_block_size);
- dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
+ dm_block_t end_block = bio_end_sector(bio);
dm_block_t b;
end_block = block_div(end_block, cache->discard_block_size);
/*
* Status format:
*
- * <#used metadata blocks>/<#total metadata blocks>
+ * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
+ * <cache block size> <#used cache blocks>/<#total cache blocks>
* <#read hits> <#read misses> <#write hits> <#write misses>
- * <#demotions> <#promotions> <#blocks in cache> <#dirty>
+ * <#demotions> <#promotions> <#dirty>
* <#features> <features>*
* <#core args> <core args>
- * <#policy args> <policy args>*
+ * <policy name> <#policy args> <policy args>*
*/
static void cache_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
residency = policy_residency(cache->policy);
- DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
+ DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %llu ",
+ (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
(unsigned long long)nr_blocks_metadata,
+ cache->sectors_per_block,
+ (unsigned long long) from_cblock(residency),
+ (unsigned long long) from_cblock(cache->cache_size),
(unsigned) atomic_read(&cache->stats.read_hit),
(unsigned) atomic_read(&cache->stats.read_miss),
(unsigned) atomic_read(&cache->stats.write_hit),
(unsigned) atomic_read(&cache->stats.write_miss),
(unsigned) atomic_read(&cache->stats.demotion),
(unsigned) atomic_read(&cache->stats.promotion),
- (unsigned long long) from_cblock(residency),
- cache->nr_dirty);
+ (unsigned long long) from_cblock(cache->nr_dirty));
if (writethrough_mode(&cache->features))
DMEMIT("1 writethrough ");
}
DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
+
+ DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
if (sz < maxlen) {
r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
if (r)
static struct target_type cache_target = {
.name = "cache",
- .version = {1, 2, 0},
+ .version = {1, 3, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
struct work_struct flush_expired_bios;
struct list_head delayed_bios;
atomic_t may_delay;
- mempool_t *delayed_pool;
struct dm_dev *dev_read;
sector_t start_read;
struct dm_delay_info {
struct delay_c *context;
struct list_head list;
- struct bio *bio;
unsigned long expires;
};
static DEFINE_MUTEX(delayed_bios_lock);
-static struct kmem_cache *delayed_cache;
-
static void handle_delayed_timer(unsigned long data)
{
struct delay_c *dc = (struct delay_c *)data;
mutex_lock(&delayed_bios_lock);
list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
if (flush_all || time_after_eq(jiffies, delayed->expires)) {
+ struct bio *bio = dm_bio_from_per_bio_data(delayed,
+ sizeof(struct dm_delay_info));
list_del(&delayed->list);
- bio_list_add(&flush_bios, delayed->bio);
- if ((bio_data_dir(delayed->bio) == WRITE))
+ bio_list_add(&flush_bios, bio);
+ if ((bio_data_dir(bio) == WRITE))
delayed->context->writes--;
else
delayed->context->reads--;
- mempool_free(delayed, dc->delayed_pool);
continue;
}
}
out:
- dc->delayed_pool = mempool_create_slab_pool(128, delayed_cache);
- if (!dc->delayed_pool) {
- DMERR("Couldn't create delayed bio pool.");
- goto bad_dev_write;
- }
-
dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
if (!dc->kdelayd_wq) {
DMERR("Couldn't start kdelayd");
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
+ ti->per_bio_data_size = sizeof(struct dm_delay_info);
ti->private = dc;
return 0;
bad_queue:
- mempool_destroy(dc->delayed_pool);
-bad_dev_write:
if (dc->dev_write)
dm_put_device(ti, dc->dev_write);
bad_dev_read:
if (dc->dev_write)
dm_put_device(ti, dc->dev_write);
- mempool_destroy(dc->delayed_pool);
kfree(dc);
}
if (!delay || !atomic_read(&dc->may_delay))
return 1;
- delayed = mempool_alloc(dc->delayed_pool, GFP_NOIO);
+ delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
delayed->context = dc;
- delayed->bio = bio;
delayed->expires = expires = jiffies + (delay * HZ / 1000);
mutex_lock(&delayed_bios_lock);
if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) {
bio->bi_bdev = dc->dev_write->bdev;
if (bio_sectors(bio))
- bio->bi_sector = dc->start_write +
- dm_target_offset(ti, bio->bi_sector);
+ bio->bi_iter.bi_sector = dc->start_write +
+ dm_target_offset(ti, bio->bi_iter.bi_sector);
return delay_bio(dc, dc->write_delay, bio);
}
bio->bi_bdev = dc->dev_read->bdev;
- bio->bi_sector = dc->start_read + dm_target_offset(ti, bio->bi_sector);
+ bio->bi_iter.bi_sector = dc->start_read +
+ dm_target_offset(ti, bio->bi_iter.bi_sector);
return delay_bio(dc, dc->read_delay, bio);
}
static int __init dm_delay_init(void)
{
- int r = -ENOMEM;
-
- delayed_cache = KMEM_CACHE(dm_delay_info, 0);
- if (!delayed_cache) {
- DMERR("Couldn't create delayed bio cache.");
- goto bad_memcache;
- }
+ int r;
r = dm_register_target(&delay_target);
if (r < 0) {
return 0;
bad_register:
- kmem_cache_destroy(delayed_cache);
-bad_memcache:
return r;
}
static void __exit dm_delay_exit(void)
{
dm_unregister_target(&delay_target);
- kmem_cache_destroy(delayed_cache);
}
/* Module hooks */
return NULL;
}
-static struct dm_exception *alloc_completed_exception(void)
+static struct dm_exception *alloc_completed_exception(gfp_t gfp)
{
struct dm_exception *e;
- e = kmem_cache_alloc(exception_cache, GFP_NOIO);
- if (!e)
+ e = kmem_cache_alloc(exception_cache, gfp);
+ if (!e && gfp == GFP_NOIO)
e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
return e;
struct dm_snapshot *s = context;
struct dm_exception *e;
- e = alloc_completed_exception();
+ e = alloc_completed_exception(GFP_KERNEL);
if (!e)
return -ENOMEM;
goto out;
}
- e = alloc_completed_exception();
+ e = alloc_completed_exception(GFP_NOIO);
if (!e) {
down_write(&s->lock);
__invalidate_snapshot(s, -ENOMEM);
if (full_bio) {
full_bio->bi_end_io = pe->full_bio_end_io;
full_bio->bi_private = pe->full_bio_private;
+ atomic_inc(&full_bio->bi_remaining);
}
free_pending_exception(pe);
struct bio *bio, chunk_t chunk)
{
bio->bi_bdev = s->cow->bdev;
- bio->bi_sector = chunk_to_sector(s->store,
- dm_chunk_number(e->new_chunk) +
- (chunk - e->old_chunk)) +
- (bio->bi_sector &
- s->store->chunk_mask);
+ bio->bi_iter.bi_sector =
+ chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) +
+ (chunk - e->old_chunk)) +
+ (bio->bi_iter.bi_sector & s->store->chunk_mask);
}
static int snapshot_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}
- chunk = sector_to_chunk(s->store, bio->bi_sector);
+ chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
/* Full snapshots are not usable */
/* To get here the table must be live so s->active is always set. */
r = DM_MAPIO_SUBMITTED;
if (!pe->started &&
- bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) {
+ bio->bi_iter.bi_size ==
+ (s->store->chunk_size << SECTOR_SHIFT)) {
pe->started = 1;
up_write(&s->lock);
start_full_bio(pe, bio);
return DM_MAPIO_REMAPPED;
}
- chunk = sector_to_chunk(s->store, bio->bi_sector);
+ chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
down_write(&s->lock);
down_read(&_origins_lock);
o = __lookup_origin(origin->bdev);
if (o)
- r = __origin_write(&o->snapshots, bio->bi_sector, bio);
+ r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
up_read(&_origins_lock);
return r;
bool zero_new_blocks:1;
bool discard_enabled:1;
bool discard_passdown:1;
+ bool error_if_no_space:1;
};
struct thin_c;
int sectors_per_block_shift;
struct pool_features pf;
- unsigned low_water_triggered:1; /* A dm event has been sent */
- unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
+ bool low_water_triggered:1; /* A dm event has been sent */
struct dm_bio_prison *prison;
struct dm_kcopyd_client *copier;
};
static enum pool_mode get_pool_mode(struct pool *pool);
-static void set_pool_mode(struct pool *pool, enum pool_mode mode);
+static void out_of_data_space(struct pool *pool);
+static void metadata_operation_failed(struct pool *pool, const char *op, int r);
/*
* Target context for a pool.
static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
struct pool *pool = tc->pool;
- sector_t block_nr = bio->bi_sector;
+ sector_t block_nr = bio->bi_iter.bi_sector;
if (block_size_is_power_of_two(pool))
block_nr >>= pool->sectors_per_block_shift;
static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
struct pool *pool = tc->pool;
- sector_t bi_sector = bio->bi_sector;
+ sector_t bi_sector = bio->bi_iter.bi_sector;
bio->bi_bdev = tc->pool_dev->bdev;
if (block_size_is_power_of_two(pool))
- bio->bi_sector = (block << pool->sectors_per_block_shift) |
- (bi_sector & (pool->sectors_per_block - 1));
+ bio->bi_iter.bi_sector =
+ (block << pool->sectors_per_block_shift) |
+ (bi_sector & (pool->sectors_per_block - 1));
else
- bio->bi_sector = (block * pool->sectors_per_block) +
+ bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
sector_div(bi_sector, pool->sectors_per_block);
}
struct dm_thin_new_mapping {
struct list_head list;
- unsigned quiesced:1;
- unsigned prepared:1;
- unsigned pass_discard:1;
+ bool quiesced:1;
+ bool prepared:1;
+ bool pass_discard:1;
+ bool definitely_not_shared:1;
+ int err;
struct thin_c *tc;
dm_block_t virt_block;
dm_block_t data_block;
struct dm_bio_prison_cell *cell, *cell2;
- int err;
/*
* If the bio covers the whole area of a block then we can avoid
struct pool *pool = m->tc->pool;
if (m->quiesced && m->prepared) {
- list_add(&m->list, &pool->prepared_mappings);
+ list_add_tail(&m->list, &pool->prepared_mappings);
wake_worker(pool);
}
}
m->err = read_err || write_err ? -EIO : 0;
spin_lock_irqsave(&pool->lock, flags);
- m->prepared = 1;
+ m->prepared = true;
__maybe_add_mapping(m);
spin_unlock_irqrestore(&pool->lock, flags);
}
m->err = err;
spin_lock_irqsave(&pool->lock, flags);
- m->prepared = 1;
+ m->prepared = true;
__maybe_add_mapping(m);
spin_unlock_irqrestore(&pool->lock, flags);
}
static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
{
- if (m->bio)
+ if (m->bio) {
m->bio->bi_end_io = m->saved_bi_end_io;
+ atomic_inc(&m->bio->bi_remaining);
+ }
cell_error(m->tc->pool, m->cell);
list_del(&m->list);
mempool_free(m, m->tc->pool->mapping_pool);
int r;
bio = m->bio;
- if (bio)
+ if (bio) {
bio->bi_end_io = m->saved_bi_end_io;
+ atomic_inc(&bio->bi_remaining);
+ }
if (m->err) {
cell_error(pool, m->cell);
*/
r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
if (r) {
- DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d",
- dm_device_name(pool->pool_md), r);
- set_pool_mode(pool, PM_READ_ONLY);
+ metadata_operation_failed(pool, "dm_thin_insert_block", r);
cell_error(pool, m->cell);
goto out;
}
cell_defer_no_holder(tc, m->cell2);
if (m->pass_discard)
- remap_and_issue(tc, m->bio, m->data_block);
+ if (m->definitely_not_shared)
+ remap_and_issue(tc, m->bio, m->data_block);
+ else {
+ bool used = false;
+ if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
+ bio_endio(m->bio, 0);
+ else
+ remap_and_issue(tc, m->bio, m->data_block);
+ }
else
bio_endio(m->bio, 0);
*/
static int io_overlaps_block(struct pool *pool, struct bio *bio)
{
- return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
+ return bio->bi_iter.bi_size ==
+ (pool->sectors_per_block << SECTOR_SHIFT);
}
static int io_overwrites_block(struct pool *pool, struct bio *bio)
static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
{
- struct dm_thin_new_mapping *r = pool->next_mapping;
+ struct dm_thin_new_mapping *m = pool->next_mapping;
BUG_ON(!pool->next_mapping);
+ memset(m, 0, sizeof(struct dm_thin_new_mapping));
+ INIT_LIST_HEAD(&m->list);
+ m->bio = NULL;
+
pool->next_mapping = NULL;
- return r;
+ return m;
}
static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
struct pool *pool = tc->pool;
struct dm_thin_new_mapping *m = get_next_mapping(pool);
- INIT_LIST_HEAD(&m->list);
- m->quiesced = 0;
- m->prepared = 0;
m->tc = tc;
m->virt_block = virt_block;
m->data_block = data_dest;
m->cell = cell;
- m->err = 0;
- m->bio = NULL;
if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
- m->quiesced = 1;
+ m->quiesced = true;
/*
* IO to pool_dev remaps to the pool target's data_dev.
struct pool *pool = tc->pool;
struct dm_thin_new_mapping *m = get_next_mapping(pool);
- INIT_LIST_HEAD(&m->list);
- m->quiesced = 1;
- m->prepared = 0;
+ m->quiesced = true;
+ m->prepared = false;
m->tc = tc;
m->virt_block = virt_block;
m->data_block = data_block;
m->cell = cell;
- m->err = 0;
- m->bio = NULL;
/*
* If the whole block of data is being overwritten or we are not
return -EINVAL;
r = dm_pool_commit_metadata(pool->pmd);
- if (r) {
- DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d",
- dm_device_name(pool->pool_md), r);
- set_pool_mode(pool, PM_READ_ONLY);
- }
+ if (r)
+ metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
return r;
}
-static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
+static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
{
- int r;
- dm_block_t free_blocks;
unsigned long flags;
- struct pool *pool = tc->pool;
-
- /*
- * Once no_free_space is set we must not allow allocation to succeed.
- * Otherwise it is difficult to explain, debug, test and support.
- */
- if (pool->no_free_space)
- return -ENOSPC;
-
- r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
- if (r)
- return r;
if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
DMWARN("%s: reached low water mark for data device: sending event.",
dm_device_name(pool->pool_md));
spin_lock_irqsave(&pool->lock, flags);
- pool->low_water_triggered = 1;
+ pool->low_water_triggered = true;
spin_unlock_irqrestore(&pool->lock, flags);
dm_table_event(pool->ti->table);
}
+}
+
+static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
+{
+ int r;
+ dm_block_t free_blocks;
+ struct pool *pool = tc->pool;
+
+ if (get_pool_mode(pool) != PM_WRITE)
+ return -EINVAL;
+
+ r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
+ if (r) {
+ metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
+ return r;
+ }
+
+ check_low_water_mark(pool, free_blocks);
if (!free_blocks) {
/*
return r;
r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
- if (r)
+ if (r) {
+ metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
return r;
+ }
- /*
- * If we still have no space we set a flag to avoid
- * doing all this checking and return -ENOSPC. This
- * flag serves as a latch that disallows allocations from
- * this pool until the admin takes action (e.g. resize or
- * table reload).
- */
if (!free_blocks) {
- DMWARN("%s: no free data space available.",
- dm_device_name(pool->pool_md));
- spin_lock_irqsave(&pool->lock, flags);
- pool->no_free_space = 1;
- spin_unlock_irqrestore(&pool->lock, flags);
+ out_of_data_space(pool);
return -ENOSPC;
}
}
r = dm_pool_alloc_data_block(pool->pmd, result);
if (r) {
- if (r == -ENOSPC &&
- !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
- !free_blocks) {
- DMWARN("%s: no free metadata space available.",
- dm_device_name(pool->pool_md));
- set_pool_mode(pool, PM_READ_ONLY);
- }
+ metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
return r;
}
spin_unlock_irqrestore(&pool->lock, flags);
}
-static void no_space(struct pool *pool, struct dm_bio_prison_cell *cell)
+static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
+{
+ /*
+ * When pool is read-only, no cell locking is needed because
+ * nothing is changing.
+ */
+ WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY);
+
+ if (pool->pf.error_if_no_space)
+ bio_io_error(bio);
+ else
+ retry_on_resume(bio);
+}
+
+static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
{
struct bio *bio;
struct bio_list bios;
cell_release(pool, cell, &bios);
while ((bio = bio_list_pop(&bios)))
- retry_on_resume(bio);
+ handle_unserviceable_bio(pool, bio);
}
static void process_discard(struct thin_c *tc, struct bio *bio)
*/
m = get_next_mapping(pool);
m->tc = tc;
- m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
+ m->pass_discard = pool->pf.discard_passdown;
+ m->definitely_not_shared = !lookup_result.shared;
m->virt_block = block;
m->data_block = lookup_result.block;
m->cell = cell;
m->cell2 = cell2;
- m->err = 0;
m->bio = bio;
if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
spin_lock_irqsave(&pool->lock, flags);
- list_add(&m->list, &pool->prepared_discards);
+ list_add_tail(&m->list, &pool->prepared_discards);
spin_unlock_irqrestore(&pool->lock, flags);
wake_worker(pool);
}
break;
case -ENOSPC:
- no_space(pool, cell);
+ retry_bios_on_resume(pool, cell);
break;
default:
DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
__func__, r);
- set_pool_mode(pool, PM_READ_ONLY);
cell_error(pool, cell);
break;
}
if (bio_detain(pool, &key, bio, &cell))
return;
- if (bio_data_dir(bio) == WRITE && bio->bi_size)
+ if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size)
break_sharing(tc, bio, block, &key, lookup_result, cell);
else {
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
/*
* Remap empty bios (flushes) immediately, without provisioning.
*/
- if (!bio->bi_size) {
+ if (!bio->bi_iter.bi_size) {
inc_all_io_entry(pool, bio);
cell_defer_no_holder(tc, cell);
break;
case -ENOSPC:
- no_space(pool, cell);
+ retry_bios_on_resume(pool, cell);
break;
default:
DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
__func__, r);
- set_pool_mode(pool, PM_READ_ONLY);
cell_error(pool, cell);
break;
}
r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
switch (r) {
case 0:
- if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
+ if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size)
- bio_io_error(bio);
+ handle_unserviceable_bio(tc->pool, bio);
else {
inc_all_io_entry(tc->pool, bio);
remap_and_issue(tc, bio, lookup_result.block);
case -ENODATA:
if (rw != READ) {
- bio_io_error(bio);
+ handle_unserviceable_bio(tc->pool, bio);
break;
}
return pool->pf.mode;
}
-static void set_pool_mode(struct pool *pool, enum pool_mode mode)
+static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
{
int r;
+ enum pool_mode old_mode = pool->pf.mode;
- pool->pf.mode = mode;
-
- switch (mode) {
+ switch (new_mode) {
case PM_FAIL:
- DMERR("%s: switching pool to failure mode",
- dm_device_name(pool->pool_md));
+ if (old_mode != new_mode)
+ DMERR("%s: switching pool to failure mode",
+ dm_device_name(pool->pool_md));
dm_pool_metadata_read_only(pool->pmd);
pool->process_bio = process_bio_fail;
pool->process_discard = process_bio_fail;
break;
case PM_READ_ONLY:
- DMERR("%s: switching pool to read-only mode",
- dm_device_name(pool->pool_md));
+ if (old_mode != new_mode)
+ DMERR("%s: switching pool to read-only mode",
+ dm_device_name(pool->pool_md));
r = dm_pool_abort_metadata(pool->pmd);
if (r) {
DMERR("%s: aborting transaction failed",
dm_device_name(pool->pool_md));
- set_pool_mode(pool, PM_FAIL);
+ new_mode = PM_FAIL;
+ set_pool_mode(pool, new_mode);
} else {
dm_pool_metadata_read_only(pool->pmd);
pool->process_bio = process_bio_read_only;
break;
case PM_WRITE:
+ if (old_mode != new_mode)
+ DMINFO("%s: switching pool to write mode",
+ dm_device_name(pool->pool_md));
dm_pool_metadata_read_write(pool->pmd);
pool->process_bio = process_bio;
pool->process_discard = process_discard;
pool->process_prepared_discard = process_prepared_discard;
break;
}
+
+ pool->pf.mode = new_mode;
+}
+
+/*
+ * Rather than calling set_pool_mode directly, use these which describe the
+ * reason for mode degradation.
+ */
+static void out_of_data_space(struct pool *pool)
+{
+ DMERR_LIMIT("%s: no free data space available.",
+ dm_device_name(pool->pool_md));
+ set_pool_mode(pool, PM_READ_ONLY);
+}
+
+static void metadata_operation_failed(struct pool *pool, const char *op, int r)
+{
+ dm_block_t free_blocks;
+
+ DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
+ dm_device_name(pool->pool_md), op, r);
+
+ if (r == -ENOSPC &&
+ !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
+ !free_blocks)
+ DMERR_LIMIT("%s: no free metadata space available.",
+ dm_device_name(pool->pool_md));
+
+ set_pool_mode(pool, PM_READ_ONLY);
}
/*----------------------------------------------------------------*/
if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
/*
* This block isn't provisioned, and we have no way
- * of doing so. Just error it.
+ * of doing so.
*/
- bio_io_error(bio);
+ handle_unserviceable_bio(tc->pool, bio);
return DM_MAPIO_SUBMITTED;
}
/* fall through */
enum pool_mode old_mode = pool->pf.mode;
enum pool_mode new_mode = pt->adjusted_pf.mode;
+ /*
+ * Don't change the pool's mode until set_pool_mode() below.
+ * Otherwise the pool's process_* function pointers may
+ * not match the desired pool mode.
+ */
+ pt->adjusted_pf.mode = old_mode;
+
+ pool->ti = ti;
+ pool->pf = pt->adjusted_pf;
+ pool->low_water_blocks = pt->low_water_blocks;
+
/*
* If we were in PM_FAIL mode, rollback of metadata failed. We're
* not going to recover without a thin_repair. So we never let the
if (old_mode == PM_FAIL)
new_mode = old_mode;
- pool->ti = ti;
- pool->low_water_blocks = pt->low_water_blocks;
- pool->pf = pt->adjusted_pf;
-
set_pool_mode(pool, new_mode);
return 0;
pf->zero_new_blocks = true;
pf->discard_enabled = true;
pf->discard_passdown = true;
+ pf->error_if_no_space = false;
}
static void __pool_destroy(struct pool *pool)
bio_list_init(&pool->deferred_flush_bios);
INIT_LIST_HEAD(&pool->prepared_mappings);
INIT_LIST_HEAD(&pool->prepared_discards);
- pool->low_water_triggered = 0;
- pool->no_free_space = 0;
+ pool->low_water_triggered = false;
bio_list_init(&pool->retry_on_resume_list);
pool->shared_read_ds = dm_deferred_set_create();
const char *arg_name;
static struct dm_arg _args[] = {
- {0, 3, "Invalid number of pool feature arguments"},
+ {0, 4, "Invalid number of pool feature arguments"},
};
/*
else if (!strcasecmp(arg_name, "read_only"))
pf->mode = PM_READ_ONLY;
+ else if (!strcasecmp(arg_name, "error_if_no_space"))
+ pf->error_if_no_space = true;
+
else {
ti->error = "Unrecognised pool feature requested";
r = -EINVAL;
* skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
* ignore_discard: disable discard
* no_discard_passdown: don't pass discards down to the data device
+ * read_only: Don't allow any changes to be made to the pool metadata.
+ * error_if_no_space: error IOs, instead of queueing, if no space.
*/
static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
return -EINVAL;
} else if (data_size > sb_data_size) {
+ if (sb_data_size)
+ DMINFO("%s: growing the data device from %llu to %llu blocks",
+ dm_device_name(pool->pool_md),
+ sb_data_size, (unsigned long long)data_size);
r = dm_pool_resize_data_dev(pool->pmd, data_size);
if (r) {
- DMERR("%s: failed to resize data device",
- dm_device_name(pool->pool_md));
- set_pool_mode(pool, PM_READ_ONLY);
+ metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
return r;
}
return -EINVAL;
} else if (metadata_dev_size > sb_metadata_dev_size) {
+ DMINFO("%s: growing the metadata device from %llu to %llu blocks",
+ dm_device_name(pool->pool_md),
+ sb_metadata_dev_size, metadata_dev_size);
r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
if (r) {
- DMERR("%s: failed to resize metadata device",
- dm_device_name(pool->pool_md));
+ metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
return r;
}
unsigned long flags;
spin_lock_irqsave(&pool->lock, flags);
- pool->low_water_triggered = 0;
- pool->no_free_space = 0;
+ pool->low_water_triggered = false;
__requeue_bios(pool);
spin_unlock_irqrestore(&pool->lock, flags);
unsigned sz, unsigned maxlen)
{
unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
- !pf->discard_passdown + (pf->mode == PM_READ_ONLY);
+ !pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
+ pf->error_if_no_space;
DMEMIT("%u ", count);
if (!pf->zero_new_blocks)
if (pf->mode == PM_READ_ONLY)
DMEMIT("read_only ");
+
+ if (pf->error_if_no_space)
+ DMEMIT("error_if_no_space ");
}
/*
DMEMIT("rw ");
if (!pool->pf.discard_enabled)
- DMEMIT("ignore_discard");
+ DMEMIT("ignore_discard ");
else if (pool->pf.discard_passdown)
- DMEMIT("discard_passdown");
+ DMEMIT("discard_passdown ");
+ else
+ DMEMIT("no_discard_passdown ");
+
+ if (pool->pf.error_if_no_space)
+ DMEMIT("error_if_no_space ");
else
- DMEMIT("no_discard_passdown");
+ DMEMIT("queue_if_no_space ");
break;
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 9, 0},
+ .version = {1, 10, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
static int thin_map(struct dm_target *ti, struct bio *bio)
{
- bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
+ bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
return thin_bio_map(ti, bio);
}
spin_lock_irqsave(&pool->lock, flags);
list_for_each_entry_safe(m, tmp, &work, list) {
list_del(&m->list);
- m->quiesced = 1;
+ m->quiesced = true;
__maybe_add_mapping(m);
}
spin_unlock_irqrestore(&pool->lock, flags);
if (!list_empty(&work)) {
spin_lock_irqsave(&pool->lock, flags);
list_for_each_entry_safe(m, tmp, &work, list)
- list_add(&m->list, &pool->prepared_discards);
+ list_add_tail(&m->list, &pool->prepared_discards);
spin_unlock_irqrestore(&pool->lock, flags);
wake_worker(pool);
}
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 9, 0},
+ .version = {1, 10, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
/* forced geometry settings */
struct hd_geometry geometry;
- /* sysfs handle */
- struct kobject kobj;
+ /* kobject and completion */
+ struct dm_kobject_holder kobj_holder;
/* zero-length flush that will be cloned and submitted to targets */
struct bio flush_bio;
atomic_inc_return(&md->pending[rw]));
if (unlikely(dm_stats_used(&md->stats)))
- dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
+ dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
bio_sectors(bio), false, 0, &io->stats_aux);
}
part_stat_unlock();
if (unlikely(dm_stats_used(&md->stats)))
- dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
+ dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
bio_sectors(bio), true, duration, &io->stats_aux);
/*
if (io_error == DM_ENDIO_REQUEUE)
return;
- if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
+ if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) {
/*
* Preflush done for flush with data, reissue
* without REQ_FLUSH.
struct dm_rq_clone_bio_info *info = clone->bi_private;
struct dm_rq_target_io *tio = info->tio;
struct bio *bio = info->orig;
- unsigned int nr_bytes = info->orig->bi_size;
+ unsigned int nr_bytes = info->orig->bi_iter.bi_size;
bio_put(clone);
* this io.
*/
atomic_inc(&tio->io->io_count);
- sector = clone->bi_sector;
+ sector = clone->bi_iter.bi_sector;
r = ti->type->map(ti, clone);
if (r == DM_MAPIO_REMAPPED) {
/* the bio has been remapped so dispatch it */
struct dm_io *io;
sector_t sector;
sector_t sector_count;
- unsigned short idx;
};
static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len)
{
- bio->bi_sector = sector;
- bio->bi_size = to_bytes(len);
- }
-
- static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count)
- {
- bio->bi_idx = idx;
- bio->bi_vcnt = idx + bv_count;
- bio->bi_flags &= ~(1 << BIO_SEG_VALID);
- }
-
- static void clone_bio_integrity(struct bio *bio, struct bio *clone,
- unsigned short idx, unsigned len, unsigned offset,
- unsigned trim)
- {
- if (!bio_integrity(bio))
- return;
-
- bio_integrity_clone(clone, bio, GFP_NOIO);
-
- if (trim)
- bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len);
- }
-
- /*
- * Creates a little bio that just does part of a bvec.
- */
- static void clone_split_bio(struct dm_target_io *tio, struct bio *bio,
- sector_t sector, unsigned short idx,
- unsigned offset, unsigned len)
- {
- struct bio *clone = &tio->clone;
- struct bio_vec *bv = bio->bi_io_vec + idx;
-
- *clone->bi_io_vec = *bv;
-
- bio_setup_sector(clone, sector, len);
-
- clone->bi_bdev = bio->bi_bdev;
- clone->bi_rw = bio->bi_rw;
- clone->bi_vcnt = 1;
- clone->bi_io_vec->bv_offset = offset;
- clone->bi_io_vec->bv_len = clone->bi_size;
- clone->bi_flags |= 1 << BIO_CLONED;
-
- clone_bio_integrity(bio, clone, idx, len, offset, 1);
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_iter.bi_size = to_bytes(len);
}
/*
* Creates a bio that consists of range of complete bvecs.
*/
static void clone_bio(struct dm_target_io *tio, struct bio *bio,
- sector_t sector, unsigned short idx,
- unsigned short bv_count, unsigned len)
+ sector_t sector, unsigned len)
{
struct bio *clone = &tio->clone;
- unsigned trim = 0;
- __bio_clone(clone, bio);
- bio_setup_sector(clone, sector, len);
- bio_setup_bv(clone, idx, bv_count);
+ __bio_clone_fast(clone, bio);
+
+ if (bio_integrity(bio))
+ bio_integrity_clone(clone, bio, GFP_NOIO);
- if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
- trim = 1;
- clone_bio_integrity(bio, clone, idx, len, 0, trim);
+ bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
+ clone->bi_iter.bi_size = to_bytes(len);
+
+ if (bio_integrity(bio))
+ bio_integrity_trim(clone, 0, len);
}
static struct dm_target_io *alloc_tio(struct clone_info *ci,
* ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
* and discard, so no need for concern about wasted bvec allocations.
*/
- __bio_clone(clone, ci->bio);
+ __bio_clone_fast(clone, ci->bio);
if (len)
bio_setup_sector(clone, ci->sector, len);
}
static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
- sector_t sector, int nr_iovecs,
- unsigned short idx, unsigned short bv_count,
- unsigned offset, unsigned len,
- unsigned split_bvec)
+ sector_t sector, unsigned len)
{
struct bio *bio = ci->bio;
struct dm_target_io *tio;
num_target_bios = ti->num_write_bios(ti, bio);
for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
- tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr);
- if (split_bvec)
- clone_split_bio(tio, bio, sector, idx, offset, len);
- else
- clone_bio(tio, bio, sector, idx, bv_count, len);
+ tio = alloc_tio(ci, ti, 0, target_bio_nr);
+ clone_bio(tio, bio, sector, len);
__map_bio(tio);
}
}
return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
}
- /*
- * Find maximum number of sectors / bvecs we can process with a single bio.
- */
- static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx)
- {
- struct bio *bio = ci->bio;
- sector_t bv_len, total_len = 0;
-
- for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) {
- bv_len = to_sector(bio->bi_io_vec[*idx].bv_len);
-
- if (bv_len > max)
- break;
-
- max -= bv_len;
- total_len += bv_len;
- }
-
- return total_len;
- }
-
- static int __split_bvec_across_targets(struct clone_info *ci,
- struct dm_target *ti, sector_t max)
- {
- struct bio *bio = ci->bio;
- struct bio_vec *bv = bio->bi_io_vec + ci->idx;
- sector_t remaining = to_sector(bv->bv_len);
- unsigned offset = 0;
- sector_t len;
-
- do {
- if (offset) {
- ti = dm_table_find_target(ci->map, ci->sector);
- if (!dm_target_is_valid(ti))
- return -EIO;
-
- max = max_io_len(ci->sector, ti);
- }
-
- len = min(remaining, max);
-
- __clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0,
- bv->bv_offset + offset, len, 1);
-
- ci->sector += len;
- ci->sector_count -= len;
- offset += to_bytes(len);
- } while (remaining -= len);
-
- ci->idx++;
-
- return 0;
- }
-
/*
* Select the correct strategy for processing a non-flush bio.
*/
{
struct bio *bio = ci->bio;
struct dm_target *ti;
- sector_t len, max;
- int idx;
+ unsigned len;
if (unlikely(bio->bi_rw & REQ_DISCARD))
return __send_discard(ci);
if (!dm_target_is_valid(ti))
return -EIO;
- max = max_io_len(ci->sector, ti);
-
- /*
- * Optimise for the simple case where we can do all of
- * the remaining io with a single clone.
- */
- if (ci->sector_count <= max) {
- __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
- ci->idx, bio->bi_vcnt - ci->idx, 0,
- ci->sector_count, 0);
- ci->sector_count = 0;
- return 0;
- }
+ len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
- /*
- * There are some bvecs that don't span targets.
- * Do as many of these as possible.
- */
- if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
- len = __len_within_target(ci, max, &idx);
+ __clone_and_map_data_bio(ci, ti, ci->sector, len);
- __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
- ci->idx, idx - ci->idx, 0, len, 0);
+ ci->sector += len;
+ ci->sector_count -= len;
- ci->sector += len;
- ci->sector_count -= len;
- ci->idx = idx;
-
- return 0;
- }
-
- /*
- * Handle a bvec that must be split between two or more targets.
- */
- return __split_bvec_across_targets(ci, ti, max);
+ return 0;
}
/*
ci.io->bio = bio;
ci.io->md = md;
spin_lock_init(&ci.io->endio_lock);
- ci.sector = bio->bi_sector;
- ci.idx = bio->bi_idx;
+ ci.sector = bio->bi_iter.bi_sector;
start_io_acct(ci.io);
init_waitqueue_head(&md->wait);
INIT_WORK(&md->work, dm_wq_work);
init_waitqueue_head(&md->eventq);
+ init_completion(&md->kobj_holder.completion);
md->disk->major = _major;
md->disk->first_minor = minor;
struct kobject *dm_kobject(struct mapped_device *md)
{
- return &md->kobj;
+ return &md->kobj_holder.kobj;
}
-/*
- * struct mapped_device should not be exported outside of dm.c
- * so use this check to verify that kobj is part of md structure
- */
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
struct mapped_device *md;
- md = container_of(kobj, struct mapped_device, kobj);
- if (&md->kobj != kobj)
- return NULL;
+ md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
if (test_bit(DMF_FREEING, &md->flags) ||
dm_deleting_md(md))
struct mddev *mddev = container_of(ws, struct mddev, flush_work);
struct bio *bio = mddev->flush_bio;
- if (bio->bi_size == 0)
+ if (bio->bi_iter.bi_size == 0)
/* an empty barrier - all done */
bio_endio(bio, 0);
else {
struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
- bio->bi_sector = sector;
+ bio->bi_iter.bi_sector = sector;
bio_add_page(bio, page, size, 0);
bio->bi_private = rdev;
bio->bi_end_io = super_written;
struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
int ret;
- rw |= REQ_SYNC;
-
bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
rdev->meta_bdev : rdev->bdev;
if (metadata_op)
- bio->bi_sector = sector + rdev->sb_start;
+ bio->bi_iter.bi_sector = sector + rdev->sb_start;
else if (rdev->mddev->reshape_position != MaxSector &&
(rdev->mddev->reshape_backwards ==
(sector >= rdev->mddev->reshape_position)))
- bio->bi_sector = sector + rdev->new_data_offset;
+ bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
else
- bio->bi_sector = sector + rdev->data_offset;
+ bio->bi_iter.bi_sector = sector + rdev->data_offset;
bio_add_page(bio, page, size, 0);
submit_bio_wait(rw, bio);
rdev->raid_disk = -1;
clear_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags);
+ clear_bit(Bitmap_sync, &rdev->flags);
clear_bit(WriteMostly, &rdev->flags);
if (mddev->raid_disks == 0) {
*/
if (ev1 < mddev->bitmap->events_cleared)
return 0;
+ if (ev1 < mddev->events)
+ set_bit(Bitmap_sync, &rdev->flags);
} else {
if (ev1 < mddev->events)
/* just a hot-add of a new device, leave raid_disk at -1 */
desc->raid_disk < mddev->raid_disks */) {
set_bit(In_sync, &rdev->flags);
rdev->raid_disk = desc->raid_disk;
+ rdev->saved_raid_disk = desc->raid_disk;
} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
/* active but not in sync implies recovery up to
* reshape position. We don't know exactly where
rdev->raid_disk = -1;
clear_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags);
+ clear_bit(Bitmap_sync, &rdev->flags);
clear_bit(WriteMostly, &rdev->flags);
if (mddev->raid_disks == 0) {
*/
if (ev1 < mddev->bitmap->events_cleared)
return 0;
+ if (ev1 < mddev->events)
+ set_bit(Bitmap_sync, &rdev->flags);
} else {
if (ev1 < mddev->events)
/* just a hot-add of a new device, leave raid_disk at -1 */
set_bit(Faulty, &rdev->flags);
break;
default:
+ rdev->saved_raid_disk = role;
if ((le32_to_cpu(sb->feature_map) &
- MD_FEATURE_RECOVERY_OFFSET))
+ MD_FEATURE_RECOVERY_OFFSET)) {
rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
- else
+ if (!(le32_to_cpu(sb->feature_map) &
+ MD_FEATURE_RECOVERY_BITMAP))
+ rdev->saved_raid_disk = -1;
+ } else
set_bit(In_sync, &rdev->flags);
rdev->raid_disk = role;
break;
cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
sb->recovery_offset =
cpu_to_le64(rdev->recovery_offset);
+ if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
+ sb->feature_map |=
+ cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
}
if (test_bit(Replacement, &rdev->flags))
sb->feature_map |=
if (rdev->sb_loaded != 1)
continue; /* no noise on spare devices */
- if (!test_bit(Faulty, &rdev->flags) &&
- rdev->saved_raid_disk == -1) {
+ if (!test_bit(Faulty, &rdev->flags)) {
md_super_write(mddev,rdev,
rdev->sb_start, rdev->sb_size,
rdev->sb_page);
rdev->badblocks.size = 0;
}
- } else if (test_bit(Faulty, &rdev->flags))
+ } else
pr_debug("md: %s (skipping faulty)\n",
bdevname(rdev->bdev, b));
- else
- pr_debug("(skipping incremental s/r ");
if (mddev->level == LEVEL_MULTIPATH)
/* only need to write one superblock... */
* blocked - sets the Blocked flags
* -blocked - clears the Blocked and possibly simulates an error
* insync - sets Insync providing device isn't active
+ * -insync - clear Insync for a device with a slot assigned,
+ * so that it gets rebuilt based on bitmap
* write_error - sets WriteErrorSeen
* -write_error - clears WriteErrorSeen
*/
} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
set_bit(In_sync, &rdev->flags);
err = 0;
+ } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) {
+ clear_bit(In_sync, &rdev->flags);
+ rdev->saved_raid_disk = rdev->raid_disk;
+ rdev->raid_disk = -1;
+ err = 0;
} else if (cmd_match(buf, "write_error")) {
set_bit(WriteErrorSeen, &rdev->flags);
err = 0;
else
rdev->saved_raid_disk = -1;
clear_bit(In_sync, &rdev->flags);
+ clear_bit(Bitmap_sync, &rdev->flags);
err = rdev->mddev->pers->
hot_add_disk(rdev->mddev, rdev);
if (err) {
pers->run(mddev);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
mddev_resume(mddev);
+ if (!mddev->thread)
+ md_update_sb(mddev, 1);
sysfs_notify(&mddev->kobj, NULL, "level");
md_new_event(mddev);
return rv;
info->raid_disk < mddev->raid_disks) {
rdev->raid_disk = info->raid_disk;
set_bit(In_sync, &rdev->flags);
+ clear_bit(Bitmap_sync, &rdev->flags);
} else
rdev->raid_disk = -1;
+ rdev->saved_raid_disk = rdev->raid_disk;
} else
super_types[mddev->major_version].
validate_super(mddev, rdev);
return -EINVAL;
}
- if (test_bit(In_sync, &rdev->flags))
- rdev->saved_raid_disk = rdev->raid_disk;
- else
- rdev->saved_raid_disk = -1;
-
clear_bit(In_sync, &rdev->flags); /* just to be sure */
if (info->state & (1<<MD_DISK_WRITEMOSTLY))
set_bit(WriteMostly, &rdev->flags);
return 0;
}
+static inline bool md_ioctl_valid(unsigned int cmd)
+{
+ switch (cmd) {
+ case ADD_NEW_DISK:
+ case BLKROSET:
+ case GET_ARRAY_INFO:
+ case GET_BITMAP_FILE:
+ case GET_DISK_INFO:
+ case HOT_ADD_DISK:
+ case HOT_REMOVE_DISK:
+ case PRINT_RAID_DEBUG:
+ case RAID_AUTORUN:
+ case RAID_VERSION:
+ case RESTART_ARRAY_RW:
+ case RUN_ARRAY:
+ case SET_ARRAY_INFO:
+ case SET_BITMAP_FILE:
+ case SET_DISK_FAULTY:
+ case STOP_ARRAY:
+ case STOP_ARRAY_RO:
+ return true;
+ default:
+ return false;
+ }
+}
+
static int md_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct mddev *mddev = NULL;
int ro;
+ if (!md_ioctl_valid(cmd))
+ return -ENOTTY;
+
switch (cmd) {
case RAID_VERSION:
case GET_ARRAY_INFO:
if (test_bit(Faulty, &rdev->flags))
continue;
if (mddev->ro &&
- rdev->saved_raid_disk < 0)
+ ! (rdev->saved_raid_disk >= 0 &&
+ !test_bit(Bitmap_sync, &rdev->flags)))
continue;
- rdev->recovery_offset = 0;
+ if (rdev->saved_raid_disk < 0)
+ rdev->recovery_offset = 0;
if (mddev->pers->
hot_add_disk(mddev, rdev) == 0) {
if (sysfs_link_rdev(mddev, rdev))
* As we only add devices that are already in-sync,
* we can activate the spares immediately.
*/
- clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
remove_and_add_spares(mddev, NULL);
- mddev->pers->spare_active(mddev);
+ /* There is no thread, but we need to call
+ * ->spare_active and clear saved_raid_disk
+ */
+ md_reap_sync_thread(mddev);
+ clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
goto unlock;
}
mddev->pers->finish_reshape(mddev);
/* If array is no-longer degraded, then any saved_raid_disk
- * information must be scrapped. Also if any device is now
- * In_sync we must scrape the saved_raid_disk for that device
- * do the superblock for an incrementally recovered device
- * written out.
+ * information must be scrapped.
*/
- rdev_for_each(rdev, mddev)
- if (!mddev->degraded ||
- test_bit(In_sync, &rdev->flags))
+ if (!mddev->degraded)
+ rdev_for_each(rdev, mddev)
rdev->saved_raid_disk = -1;
md_update_sb(mddev, 1);
int done;
struct r1conf *conf = r1_bio->mddev->private;
sector_t start_next_window = r1_bio->start_next_window;
- sector_t bi_sector = bio->bi_sector;
+ sector_t bi_sector = bio->bi_iter.bi_sector;
if (bio->bi_phys_segments) {
unsigned long flags;
if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
pr_debug("raid1: sync end %s on sectors %llu-%llu\n",
(bio_data_dir(bio) == WRITE) ? "write" : "read",
- (unsigned long long) bio->bi_sector,
- (unsigned long long) bio->bi_sector +
- bio_sectors(bio) - 1);
+ (unsigned long long) bio->bi_iter.bi_sector,
+ (unsigned long long) bio_end_sector(bio) - 1);
call_bio_endio(r1_bio);
}
struct bio *mbio = r1_bio->master_bio;
pr_debug("raid1: behind end write sectors"
" %llu-%llu\n",
- (unsigned long long) mbio->bi_sector,
- (unsigned long long) mbio->bi_sector +
- bio_sectors(mbio) - 1);
+ (unsigned long long) mbio->bi_iter.bi_sector,
+ (unsigned long long) bio_end_sector(mbio) - 1);
call_bio_endio(r1_bio);
}
}
else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
>= bio_end_sector(bio)) ||
(conf->next_resync + NEXT_NORMALIO_DISTANCE
- <= bio->bi_sector))
+ <= bio->bi_iter.bi_sector))
wait = false;
else
wait = true;
if (bio && bio_data_dir(bio) == WRITE) {
if (conf->next_resync + NEXT_NORMALIO_DISTANCE
- <= bio->bi_sector) {
+ <= bio->bi_iter.bi_sector) {
if (conf->start_next_window == MaxSector)
conf->start_next_window =
conf->next_resync +
NEXT_NORMALIO_DISTANCE;
if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
- <= bio->bi_sector)
+ <= bio->bi_iter.bi_sector)
conf->next_window_requests++;
else
conf->current_window_requests++;
- }
- if (bio->bi_iter.bi_sector >= conf->start_next_window)
sector = conf->start_next_window;
+ }
}
conf->nr_pending++;
if (bvecs[i].bv_page)
put_page(bvecs[i].bv_page);
kfree(bvecs);
- pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
+ pr_debug("%dB behind alloc failed, doing sync I/O\n",
+ bio->bi_iter.bi_size);
}
struct raid1_plug_cb {
if (bio_data_dir(bio) == WRITE &&
bio_end_sector(bio) > mddev->suspend_lo &&
- bio->bi_sector < mddev->suspend_hi) {
+ bio->bi_iter.bi_sector < mddev->suspend_hi) {
/* As the suspend_* range is controlled by
* userspace, we want an interruptible
* wait.
prepare_to_wait(&conf->wait_barrier,
&w, TASK_INTERRUPTIBLE);
if (bio_end_sector(bio) <= mddev->suspend_lo ||
- bio->bi_sector >= mddev->suspend_hi)
+ bio->bi_iter.bi_sector >= mddev->suspend_hi)
break;
schedule();
}
r1_bio->sectors = bio_sectors(bio);
r1_bio->state = 0;
r1_bio->mddev = mddev;
- r1_bio->sector = bio->bi_sector;
+ r1_bio->sector = bio->bi_iter.bi_sector;
/* We might need to issue multiple reads to different
* devices if there are bad blocks around, so we keep
r1_bio->read_disk = rdisk;
read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
- bio_trim(read_bio, r1_bio->sector - bio->bi_sector,
+ bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
max_sectors);
r1_bio->bios[rdisk] = read_bio;
- read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
+ read_bio->bi_iter.bi_sector = r1_bio->sector +
+ mirror->rdev->data_offset;
read_bio->bi_bdev = mirror->rdev->bdev;
read_bio->bi_end_io = raid1_end_read_request;
read_bio->bi_rw = READ | do_sync;
*/
sectors_handled = (r1_bio->sector + max_sectors
- - bio->bi_sector);
+ - bio->bi_iter.bi_sector);
r1_bio->sectors = max_sectors;
spin_lock_irq(&conf->device_lock);
if (bio->bi_phys_segments == 0)
r1_bio->sectors = bio_sectors(bio) - sectors_handled;
r1_bio->state = 0;
r1_bio->mddev = mddev;
- r1_bio->sector = bio->bi_sector + sectors_handled;
+ r1_bio->sector = bio->bi_iter.bi_sector +
+ sectors_handled;
goto read_again;
} else
generic_make_request(read_bio);
if (r1_bio->bios[j])
rdev_dec_pending(conf->mirrors[j].rdev, mddev);
r1_bio->state = 0;
- allow_barrier(conf, start_next_window, bio->bi_sector);
+ allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector);
md_wait_for_blocked_rdev(blocked_rdev, mddev);
start_next_window = wait_barrier(conf, bio);
/*
bio->bi_phys_segments++;
spin_unlock_irq(&conf->device_lock);
}
- sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector;
+ sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector;
atomic_set(&r1_bio->remaining, 1);
atomic_set(&r1_bio->behind_remaining, 0);
continue;
mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
- bio_trim(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
+ bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors);
if (first_clone) {
/* do behind I/O ?
r1_bio->bios[i] = mbio;
- mbio->bi_sector = (r1_bio->sector +
+ mbio->bi_iter.bi_sector = (r1_bio->sector +
conf->mirrors[i].rdev->data_offset);
mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
mbio->bi_end_io = raid1_end_write_request;
r1_bio->sectors = bio_sectors(bio) - sectors_handled;
r1_bio->state = 0;
r1_bio->mddev = mddev;
- r1_bio->sector = bio->bi_sector + sectors_handled;
+ r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
goto retry_write;
}
/* fixup the bio for reuse */
bio_reset(b);
b->bi_vcnt = vcnt;
- b->bi_size = r1_bio->sectors << 9;
- b->bi_sector = r1_bio->sector +
+ b->bi_iter.bi_size = r1_bio->sectors << 9;
+ b->bi_iter.bi_sector = r1_bio->sector +
conf->mirrors[i].rdev->data_offset;
b->bi_bdev = conf->mirrors[i].rdev->bdev;
b->bi_end_io = end_sync_read;
b->bi_private = r1_bio;
- size = b->bi_size;
+ size = b->bi_iter.bi_size;
for (j = 0; j < vcnt ; j++) {
struct bio_vec *bi;
bi = &b->bi_io_vec[j];
}
wbio->bi_rw = WRITE;
- wbio->bi_sector = r1_bio->sector;
- wbio->bi_size = r1_bio->sectors << 9;
+ wbio->bi_iter.bi_sector = r1_bio->sector;
+ wbio->bi_iter.bi_size = r1_bio->sectors << 9;
bio_trim(wbio, sector - r1_bio->sector, sectors);
- wbio->bi_sector += rdev->data_offset;
+ wbio->bi_iter.bi_sector += rdev->data_offset;
wbio->bi_bdev = rdev->bdev;
if (submit_bio_wait(WRITE, wbio) == 0)
/* failure! */
}
r1_bio->read_disk = disk;
bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
- bio_trim(bio, r1_bio->sector - bio->bi_sector, max_sectors);
+ bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
+ max_sectors);
r1_bio->bios[r1_bio->read_disk] = bio;
rdev = conf->mirrors[disk].rdev;
printk_ratelimited(KERN_ERR
mdname(mddev),
(unsigned long long)r1_bio->sector,
bdevname(rdev->bdev, b));
- bio->bi_sector = r1_bio->sector + rdev->data_offset;
+ bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset;
bio->bi_bdev = rdev->bdev;
bio->bi_end_io = raid1_end_read_request;
bio->bi_rw = READ | do_sync;
/* Drat - have to split this up more */
struct bio *mbio = r1_bio->master_bio;
int sectors_handled = (r1_bio->sector + max_sectors
- - mbio->bi_sector);
+ - mbio->bi_iter.bi_sector);
r1_bio->sectors = max_sectors;
spin_lock_irq(&conf->device_lock);
if (mbio->bi_phys_segments == 0)
r1_bio->state = 0;
set_bit(R1BIO_ReadError, &r1_bio->state);
r1_bio->mddev = mddev;
- r1_bio->sector = mbio->bi_sector + sectors_handled;
+ r1_bio->sector = mbio->bi_iter.bi_sector +
+ sectors_handled;
goto read_more;
} else
}
if (bio->bi_end_io) {
atomic_inc(&rdev->nr_pending);
- bio->bi_sector = sector_nr + rdev->data_offset;
+ bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
bio->bi_bdev = rdev->bdev;
bio->bi_private = r1_bio;
}
continue;
/* remove last page from this bio */
bio->bi_vcnt--;
- bio->bi_size -= len;
+ bio->bi_iter.bi_size -= len;
bio->bi_flags &= ~(1<< BIO_SEG_VALID);
}
goto bio_full;
kfree(plug);
}
- static void make_request(struct mddev *mddev, struct bio * bio)
+ static void __make_request(struct mddev *mddev, struct bio *bio)
{
struct r10conf *conf = mddev->private;
struct r10bio *r10_bio;
struct bio *read_bio;
int i;
- sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
- int chunk_sects = chunk_mask + 1;
const int rw = bio_data_dir(bio);
const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
int max_sectors;
int sectors;
- if (unlikely(bio->bi_rw & REQ_FLUSH)) {
- md_flush_request(mddev, bio);
- return;
- }
-
- /* If this request crosses a chunk boundary, we need to
- * split it. This will only happen for 1 PAGE (or less) requests.
- */
- if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio)
- > chunk_sects
- && (conf->geo.near_copies < conf->geo.raid_disks
- || conf->prev.near_copies < conf->prev.raid_disks))) {
- struct bio_pair *bp;
- /* Sanity check -- queue functions should prevent this happening */
- if (bio_segments(bio) > 1)
- goto bad_map;
- /* This is a one page bio that upper layers
- * refuse to split for us, so we need to split it.
- */
- bp = bio_split(bio,
- chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
-
- /* Each of these 'make_request' calls will call 'wait_barrier'.
- * If the first succeeds but the second blocks due to the resync
- * thread raising the barrier, we will deadlock because the
- * IO to the underlying device will be queued in generic_make_request
- * and will never complete, so will never reduce nr_pending.
- * So increment nr_waiting here so no new raise_barriers will
- * succeed, and so the second wait_barrier cannot block.
- */
- spin_lock_irq(&conf->resync_lock);
- conf->nr_waiting++;
- spin_unlock_irq(&conf->resync_lock);
-
- make_request(mddev, &bp->bio1);
- make_request(mddev, &bp->bio2);
-
- spin_lock_irq(&conf->resync_lock);
- conf->nr_waiting--;
- wake_up(&conf->wait_barrier);
- spin_unlock_irq(&conf->resync_lock);
-
- bio_pair_release(bp);
- return;
- bad_map:
- printk("md/raid10:%s: make_request bug: can't convert block across chunks"
- " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
- (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
-
- bio_io_error(bio);
- return;
- }
-
- md_write_start(mddev, bio);
-
- /*
- * Register the new request and wait if the reconstruction
- * thread has put up a bar for new requests.
- * Continue immediately if no resync is active currently.
- */
- wait_barrier(conf);
-
sectors = bio_sectors(bio);
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- bio->bi_sector < conf->reshape_progress &&
- bio->bi_sector + sectors > conf->reshape_progress) {
+ bio->bi_iter.bi_sector < conf->reshape_progress &&
+ bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
/* IO spans the reshape position. Need to wait for
* reshape to pass
*/
allow_barrier(conf);
wait_event(conf->wait_barrier,
- conf->reshape_progress <= bio->bi_sector ||
- conf->reshape_progress >= bio->bi_sector + sectors);
+ conf->reshape_progress <= bio->bi_iter.bi_sector ||
+ conf->reshape_progress >= bio->bi_iter.bi_sector +
+ sectors);
wait_barrier(conf);
}
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio_data_dir(bio) == WRITE &&
(mddev->reshape_backwards
- ? (bio->bi_sector < conf->reshape_safe &&
- bio->bi_sector + sectors > conf->reshape_progress)
- : (bio->bi_sector + sectors > conf->reshape_safe &&
- bio->bi_sector < conf->reshape_progress))) {
+ ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
+ bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
+ : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
+ bio->bi_iter.bi_sector < conf->reshape_progress))) {
/* Need to update reshape_position in metadata */
mddev->reshape_position = conf->reshape_progress;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
r10_bio->sectors = sectors;
r10_bio->mddev = mddev;
- r10_bio->sector = bio->bi_sector;
+ r10_bio->sector = bio->bi_iter.bi_sector;
r10_bio->state = 0;
/* We might need to issue multiple reads to different
slot = r10_bio->read_slot;
read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
- bio_trim(read_bio, r10_bio->sector - bio->bi_sector,
+ bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
max_sectors);
r10_bio->devs[slot].bio = read_bio;
r10_bio->devs[slot].rdev = rdev;
- read_bio->bi_sector = r10_bio->devs[slot].addr +
+ read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
choose_data_offset(r10_bio, rdev);
read_bio->bi_bdev = rdev->bdev;
read_bio->bi_end_io = raid10_end_read_request;
/* Could not read all from this device, so we will
* need another r10_bio.
*/
- sectors_handled = (r10_bio->sectors + max_sectors
+ sectors_handled = (r10_bio->sector + max_sectors
- - bio->bi_sector);
+ - bio->bi_iter.bi_sector);
r10_bio->sectors = max_sectors;
spin_lock_irq(&conf->device_lock);
if (bio->bi_phys_segments == 0)
bio->bi_phys_segments = 2;
else
bio->bi_phys_segments++;
- spin_unlock(&conf->device_lock);
+ spin_unlock_irq(&conf->device_lock);
/* Cannot call generic_make_request directly
* as that will be queued in __generic_make_request
* and subsequent mempool_alloc might block
r10_bio->sectors = bio_sectors(bio) - sectors_handled;
r10_bio->state = 0;
r10_bio->mddev = mddev;
- r10_bio->sector = bio->bi_sector + sectors_handled;
+ r10_bio->sector = bio->bi_iter.bi_sector +
+ sectors_handled;
goto read_again;
} else
generic_make_request(read_bio);
bio->bi_phys_segments++;
spin_unlock_irq(&conf->device_lock);
}
- sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
+ sectors_handled = r10_bio->sector + max_sectors -
+ bio->bi_iter.bi_sector;
atomic_set(&r10_bio->remaining, 1);
bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
if (r10_bio->devs[i].bio) {
struct md_rdev *rdev = conf->mirrors[d].rdev;
mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
- bio_trim(mbio, r10_bio->sector - bio->bi_sector,
+ bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
max_sectors);
r10_bio->devs[i].bio = mbio;
- mbio->bi_sector = (r10_bio->devs[i].addr+
+ mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
choose_data_offset(r10_bio,
rdev));
mbio->bi_bdev = rdev->bdev;
rdev = conf->mirrors[d].rdev;
}
mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
- bio_trim(mbio, r10_bio->sector - bio->bi_sector,
+ bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
max_sectors);
r10_bio->devs[i].repl_bio = mbio;
- mbio->bi_sector = (r10_bio->devs[i].addr +
+ mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr +
choose_data_offset(
r10_bio, rdev));
mbio->bi_bdev = rdev->bdev;
r10_bio->sectors = bio_sectors(bio) - sectors_handled;
r10_bio->mddev = mddev;
- r10_bio->sector = bio->bi_sector + sectors_handled;
+ r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
r10_bio->state = 0;
goto retry_write;
}
one_write_done(r10_bio);
+ }
+
+ static void make_request(struct mddev *mddev, struct bio *bio)
+ {
+ struct r10conf *conf = mddev->private;
+ sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
+ int chunk_sects = chunk_mask + 1;
+
+ struct bio *split;
+
+ if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+ md_flush_request(mddev, bio);
+ return;
+ }
+
+ md_write_start(mddev, bio);
+
+ /*
+ * Register the new request and wait if the reconstruction
+ * thread has put up a bar for new requests.
+ * Continue immediately if no resync is active currently.
+ */
+ wait_barrier(conf);
+
+ do {
+
+ /*
+ * If this request crosses a chunk boundary, we need to split
+ * it.
+ */
+ if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
+ bio_sectors(bio) > chunk_sects
+ && (conf->geo.near_copies < conf->geo.raid_disks
+ || conf->prev.near_copies <
+ conf->prev.raid_disks))) {
+ split = bio_split(bio, chunk_sects -
+ (bio->bi_iter.bi_sector &
+ (chunk_sects - 1)),
+ GFP_NOIO, fs_bio_set);
+ bio_chain(split, bio);
+ } else {
+ split = bio;
+ }
+
+ __make_request(mddev, split);
+ } while (split != bio);
/* In case raid10d snuck in to freeze_array */
wake_up(&conf->wait_barrier);
bio_reset(tbio);
tbio->bi_vcnt = vcnt;
- tbio->bi_size = r10_bio->sectors << 9;
+ tbio->bi_iter.bi_size = r10_bio->sectors << 9;
tbio->bi_rw = WRITE;
tbio->bi_private = r10_bio;
- tbio->bi_sector = r10_bio->devs[i].addr;
+ tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
for (j=0; j < vcnt ; j++) {
tbio->bi_io_vec[j].bv_offset = 0;
atomic_inc(&r10_bio->remaining);
md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
- tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
+ tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
generic_make_request(tbio);
}
sectors = sect_to_write;
/* Write at 'sector' for 'sectors' */
wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
- bio_trim(wbio, sector - bio->bi_sector, sectors);
- wbio->bi_sector = (r10_bio->devs[i].addr+
+ bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
+ wbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
choose_data_offset(r10_bio, rdev) +
(sector - r10_bio->sector));
wbio->bi_bdev = rdev->bdev;
(unsigned long long)r10_bio->sector);
bio = bio_clone_mddev(r10_bio->master_bio,
GFP_NOIO, mddev);
- bio_trim(bio, r10_bio->sector - bio->bi_sector, max_sectors);
+ bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
r10_bio->devs[slot].bio = bio;
r10_bio->devs[slot].rdev = rdev;
- bio->bi_sector = r10_bio->devs[slot].addr
+ bio->bi_iter.bi_sector = r10_bio->devs[slot].addr
+ choose_data_offset(r10_bio, rdev);
bio->bi_bdev = rdev->bdev;
bio->bi_rw = READ | do_sync;
struct bio *mbio = r10_bio->master_bio;
int sectors_handled =
r10_bio->sector + max_sectors
- - mbio->bi_sector;
+ - mbio->bi_iter.bi_sector;
r10_bio->sectors = max_sectors;
spin_lock_irq(&conf->device_lock);
if (mbio->bi_phys_segments == 0)
set_bit(R10BIO_ReadError,
&r10_bio->state);
r10_bio->mddev = mddev;
- r10_bio->sector = mbio->bi_sector
+ r10_bio->sector = mbio->bi_iter.bi_sector
+ sectors_handled;
goto read_more;
bio->bi_end_io = end_sync_read;
bio->bi_rw = READ;
from_addr = r10_bio->devs[j].addr;
- bio->bi_sector = from_addr + rdev->data_offset;
+ bio->bi_iter.bi_sector = from_addr +
+ rdev->data_offset;
bio->bi_bdev = rdev->bdev;
atomic_inc(&rdev->nr_pending);
/* and we write to 'i' (if not in_sync) */
bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_write;
bio->bi_rw = WRITE;
- bio->bi_sector = to_addr
+ bio->bi_iter.bi_sector = to_addr
+ rdev->data_offset;
bio->bi_bdev = rdev->bdev;
atomic_inc(&r10_bio->remaining);
bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_write;
bio->bi_rw = WRITE;
- bio->bi_sector = to_addr + rdev->data_offset;
+ bio->bi_iter.bi_sector = to_addr +
+ rdev->data_offset;
bio->bi_bdev = rdev->bdev;
atomic_inc(&r10_bio->remaining);
break;
if (j == conf->copies) {
/* Cannot recover, so abort the recovery or
* record a bad block */
- put_buf(r10_bio);
- if (rb2)
- atomic_dec(&rb2->remaining);
- r10_bio = rb2;
if (any_working) {
/* problem is that there are bad blocks
* on other device(s)
mirror->recovery_disabled
= mddev->recovery_disabled;
}
+ put_buf(r10_bio);
+ if (rb2)
+ atomic_dec(&rb2->remaining);
+ r10_bio = rb2;
break;
}
}
bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_read;
bio->bi_rw = READ;
- bio->bi_sector = sector +
+ bio->bi_iter.bi_sector = sector +
conf->mirrors[d].rdev->data_offset;
bio->bi_bdev = conf->mirrors[d].rdev->bdev;
count++;
bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_write;
bio->bi_rw = WRITE;
- bio->bi_sector = sector +
+ bio->bi_iter.bi_sector = sector +
conf->mirrors[d].replacement->data_offset;
bio->bi_bdev = conf->mirrors[d].replacement->bdev;
count++;
bio2 = bio2->bi_next) {
/* remove last page from this bio */
bio2->bi_vcnt--;
- bio2->bi_size -= len;
+ bio2->bi_iter.bi_size -= len;
bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
}
goto bio_full;
!test_bit(In_sync, &disk->rdev->flags)) {
disk->head_position = 0;
mddev->degraded++;
- if (disk->rdev)
+ if (disk->rdev &&
+ disk->rdev->saved_raid_disk < 0)
conf->fullsync = 1;
}
disk->recovery_disabled = mddev->recovery_disabled - 1;
read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
read_bio->bi_bdev = rdev->bdev;
- read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
+ read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
+ rdev->data_offset);
read_bio->bi_private = r10_bio;
read_bio->bi_end_io = end_sync_read;
read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
read_bio->bi_flags |= 1 << BIO_UPTODATE;
read_bio->bi_vcnt = 0;
- read_bio->bi_size = 0;
+ read_bio->bi_iter.bi_size = 0;
r10_bio->master_bio = read_bio;
r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
bio_reset(b);
b->bi_bdev = rdev2->bdev;
- b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
+ b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
+ rdev2->new_data_offset;
b->bi_private = r10_bio;
b->bi_end_io = end_reshape_write;
b->bi_rw = WRITE;
bio2 = bio2->bi_next) {
/* Remove last page from this bio */
bio2->bi_vcnt--;
- bio2->bi_size -= len;
+ bio2->bi_iter.bi_size -= len;
bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
}
goto bio_full;
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
int sectors = bio_sectors(bio);
- if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
+ if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
return bio->bi_next;
else
return NULL;
return_bi = bi->bi_next;
bi->bi_next = NULL;
- bi->bi_size = 0;
+ bi->bi_iter.bi_size = 0;
trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
bi, 0);
bio_endio(bi, 0);
|| !conf->inactive_blocked),
*(conf->hash_locks + hash));
conf->inactive_blocked = 0;
- } else
+ } else {
init_stripe(sh, sector, previous);
+ atomic_inc(&sh->count);
+ }
} else {
spin_lock(&conf->device_lock);
if (atomic_read(&sh->count)) {
} else {
if (!test_bit(STRIPE_HANDLE, &sh->state))
atomic_inc(&conf->active_stripes);
- BUG_ON(list_empty(&sh->lru));
+ BUG_ON(list_empty(&sh->lru) &&
+ !test_bit(STRIPE_EXPANDING, &sh->state));
list_del_init(&sh->lru);
if (sh->group) {
sh->group->stripes_cnt--;
sh->group = NULL;
}
}
+ atomic_inc(&sh->count);
spin_unlock(&conf->device_lock);
}
} while (sh == NULL);
- if (sh)
- atomic_inc(&sh->count);
-
spin_unlock_irq(conf->hash_locks + hash);
return sh;
}
bi->bi_rw, i);
atomic_inc(&sh->count);
if (use_new_offset(conf, sh))
- bi->bi_sector = (sh->sector
+ bi->bi_iter.bi_sector = (sh->sector
+ rdev->new_data_offset);
else
- bi->bi_sector = (sh->sector
+ bi->bi_iter.bi_sector = (sh->sector
+ rdev->data_offset);
if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
bi->bi_rw |= REQ_NOMERGE;
bi->bi_vcnt = 1;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
bi->bi_io_vec[0].bv_offset = 0;
- bi->bi_size = STRIPE_SIZE;
+ bi->bi_iter.bi_size = STRIPE_SIZE;
/*
* If this is discard request, set bi_vcnt 0. We don't
* want to confuse SCSI because SCSI will replace payload
rbi->bi_rw, i);
atomic_inc(&sh->count);
if (use_new_offset(conf, sh))
- rbi->bi_sector = (sh->sector
+ rbi->bi_iter.bi_sector = (sh->sector
+ rrdev->new_data_offset);
else
- rbi->bi_sector = (sh->sector
+ rbi->bi_iter.bi_sector = (sh->sector
+ rrdev->data_offset);
rbi->bi_vcnt = 1;
rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
rbi->bi_io_vec[0].bv_offset = 0;
- rbi->bi_size = STRIPE_SIZE;
+ rbi->bi_iter.bi_size = STRIPE_SIZE;
/*
* If this is discard request, set bi_vcnt 0. We don't
* want to confuse SCSI because SCSI will replace payload
async_copy_data(int frombio, struct bio *bio, struct page *page,
sector_t sector, struct dma_async_tx_descriptor *tx)
{
- struct bio_vec *bvl;
+ struct bio_vec bvl;
+ struct bvec_iter iter;
struct page *bio_page;
- int i;
int page_offset;
struct async_submit_ctl submit;
enum async_tx_flags flags = 0;
- if (bio->bi_sector >= sector)
- page_offset = (signed)(bio->bi_sector - sector) * 512;
+ if (bio->bi_iter.bi_sector >= sector)
+ page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
else
- page_offset = (signed)(sector - bio->bi_sector) * -512;
+ page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
if (frombio)
flags |= ASYNC_TX_FENCE;
init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
- bio_for_each_segment(bvl, bio, i) {
- int len = bvl->bv_len;
+ bio_for_each_segment(bvl, bio, iter) {
+ int len = bvl.bv_len;
int clen;
int b_offset = 0;
clen = len;
if (clen > 0) {
- b_offset += bvl->bv_offset;
- bio_page = bvl->bv_page;
+ b_offset += bvl.bv_offset;
+ bio_page = bvl.bv_page;
if (frombio)
tx = async_memcpy(page, bio_page, page_offset,
b_offset, clen, &submit);
BUG_ON(!dev->read);
rbi = dev->read;
dev->read = NULL;
- while (rbi && rbi->bi_sector <
+ while (rbi && rbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
rbi2 = r5_next_bio(rbi, dev->sector);
if (!raid5_dec_bi_active_stripes(rbi)) {
dev->read = rbi = dev->toread;
dev->toread = NULL;
spin_unlock_irq(&sh->stripe_lock);
- while (rbi && rbi->bi_sector <
+ while (rbi && rbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
tx = async_copy_data(0, rbi, dev->page,
dev->sector, tx);
wbi = dev->written = chosen;
spin_unlock_irq(&sh->stripe_lock);
- while (wbi && wbi->bi_sector <
+ while (wbi && wbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
if (wbi->bi_rw & REQ_FUA)
set_bit(R5_WantFUA, &dev->flags);
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
if (!uptodate) {
+ set_bit(STRIPE_DEGRADED, &sh->state);
set_bit(WriteErrorSeen, &rdev->flags);
set_bit(R5_WriteError, &sh->dev[i].flags);
if (!test_and_set_bit(WantReplacement, &rdev->flags))
int firstwrite=0;
pr_debug("adding bi b#%llu to stripe s#%llu\n",
- (unsigned long long)bi->bi_sector,
+ (unsigned long long)bi->bi_iter.bi_sector,
(unsigned long long)sh->sector);
/*
firstwrite = 1;
} else
bip = &sh->dev[dd_idx].toread;
- while (*bip && (*bip)->bi_sector < bi->bi_sector) {
- if (bio_end_sector(*bip) > bi->bi_sector)
+ while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
+ if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
goto overlap;
bip = & (*bip)->bi_next;
}
- if (*bip && (*bip)->bi_sector < bio_end_sector(bi))
+ if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
goto overlap;
BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
sector_t sector = sh->dev[dd_idx].sector;
for (bi=sh->dev[dd_idx].towrite;
sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
- bi && bi->bi_sector <= sector;
+ bi && bi->bi_iter.bi_sector <= sector;
bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
if (bio_end_sector(bi) >= sector)
sector = bio_end_sector(bi);
}
pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
- (unsigned long long)(*bip)->bi_sector,
+ (unsigned long long)(*bip)->bi_iter.bi_sector,
(unsigned long long)sh->sector, dd_idx);
spin_unlock_irq(&sh->stripe_lock);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
- while (bi && bi->bi_sector <
+ while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
clear_bit(BIO_UPTODATE, &bi->bi_flags);
bi = sh->dev[i].written;
sh->dev[i].written = NULL;
if (bi) bitmap_end = 1;
- while (bi && bi->bi_sector <
+ while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
clear_bit(BIO_UPTODATE, &bi->bi_flags);
spin_unlock_irq(&sh->stripe_lock);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
- while (bi && bi->bi_sector <
+ while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi =
r5_next_bio(bi, sh->dev[i].sector);
clear_bit(R5_UPTODATE, &dev->flags);
wbi = dev->written;
dev->written = NULL;
- while (wbi && wbi->bi_sector <
+ while (wbi && wbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
wbi2 = r5_next_bio(wbi, dev->sector);
if (!raid5_dec_bi_active_stripes(wbi)) {
*/
set_bit(R5_Insync, &dev->flags);
- if (rdev && test_bit(R5_WriteError, &dev->flags)) {
+ if (test_bit(R5_WriteError, &dev->flags)) {
/* This flag does not apply to '.replacement'
* only to .rdev, so make sure to check that*/
struct md_rdev *rdev2 = rcu_dereference(
} else
clear_bit(R5_WriteError, &dev->flags);
}
- if (rdev && test_bit(R5_MadeGood, &dev->flags)) {
+ if (test_bit(R5_MadeGood, &dev->flags)) {
/* This flag does not apply to '.replacement'
* only to .rdev, so make sure to check that*/
struct md_rdev *rdev2 = rcu_dereference(
static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
{
- sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+ sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
unsigned int chunk_sectors = mddev->chunk_sectors;
unsigned int bio_sectors = bio_sectors(bio);
/*
* compute position
*/
- align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector,
- 0,
- &dd_idx, NULL);
+ align_bi->bi_iter.bi_sector =
+ raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
+ 0, &dd_idx, NULL);
end_sector = bio_end_sector(align_bi);
rcu_read_lock();
align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
if (!bio_fits_rdev(align_bi) ||
- is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi),
+ is_badblock(rdev, align_bi->bi_iter.bi_sector,
+ bio_sectors(align_bi),
&first_bad, &bad_sectors)) {
/* too big in some way, or has a known bad block */
bio_put(align_bi);
}
/* No reshape active, so we can trust rdev->data_offset */
- align_bi->bi_sector += rdev->data_offset;
+ align_bi->bi_iter.bi_sector += rdev->data_offset;
spin_lock_irq(&conf->device_lock);
wait_event_lock_irq(conf->wait_for_stripe,
if (mddev->gendisk)
trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
align_bi, disk_devt(mddev->gendisk),
- raid_bio->bi_sector);
+ raid_bio->bi_iter.bi_sector);
generic_make_request(align_bi);
return 1;
} else {
/* Skip discard while reshape is happening */
return;
- logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
- last_sector = bi->bi_sector + (bi->bi_size>>9);
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
bi->bi_next = NULL;
bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
return;
}
- logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
int remaining;
int handled = 0;
- logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ logical_sector = raid_bio->bi_iter.bi_sector &
+ ~((sector_t)STRIPE_SECTORS-1);
sector = raid5_compute_sector(conf, logical_sector,
0, &dd_idx, NULL);
last_sector = bio_end_sector(raid_bio);
static void xpram_make_request(struct request_queue *q, struct bio *bio)
{
xpram_device_t *xdev = bio->bi_bdev->bd_disk->private_data;
- struct bio_vec *bvec;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
unsigned int index;
unsigned long page_addr;
unsigned long bytes;
- int i;
- if ((bio->bi_sector & 7) != 0 || (bio->bi_size & 4095) != 0)
+ if ((bio->bi_iter.bi_sector & 7) != 0 ||
+ (bio->bi_iter.bi_size & 4095) != 0)
/* Request is not page-aligned. */
goto fail;
- if ((bio->bi_size >> 12) > xdev->size)
+ if ((bio->bi_iter.bi_size >> 12) > xdev->size)
/* Request size is no page-aligned. */
goto fail;
- if ((bio->bi_sector >> 3) > 0xffffffffU - xdev->offset)
+ if ((bio->bi_iter.bi_sector >> 3) > 0xffffffffU - xdev->offset)
goto fail;
- index = (bio->bi_sector >> 3) + xdev->offset;
- bio_for_each_segment(bvec, bio, i) {
+ index = (bio->bi_iter.bi_sector >> 3) + xdev->offset;
+ bio_for_each_segment(bvec, bio, iter) {
page_addr = (unsigned long)
- kmap(bvec->bv_page) + bvec->bv_offset;
- bytes = bvec->bv_len;
+ kmap(bvec.bv_page) + bvec.bv_offset;
+ bytes = bvec.bv_len;
if ((page_addr & 4095) != 0 || (bytes & 4095) != 0)
/* More paranoia. */
goto fail;
unsigned long mem_needed;
unsigned long mem_auto;
unsigned long long size;
+ char *sizes_end;
int mem_auto_no;
int i;
mem_auto_no = 0;
for (i = 0; i < xpram_devs; i++) {
if (sizes[i]) {
- size = simple_strtoull(sizes[i], &sizes[i], 0);
- switch (sizes[i][0]) {
+ size = simple_strtoull(sizes[i], &sizes_end, 0);
+ switch (*sizes_end) {
case 'g':
case 'G':
size <<= 20;
static int sd_resume(struct device *);
static void sd_rescan(struct device *);
static int sd_done(struct scsi_cmnd *);
-static int sd_eh_action(struct scsi_cmnd *, unsigned char *, int, int);
+static int sd_eh_action(struct scsi_cmnd *, int);
static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer);
static void scsi_disk_release(struct device *cdev);
static void sd_print_sense_hdr(struct scsi_disk *, struct scsi_sense_hdr *);
if (sdkp->device->no_write_same)
return BLKPREP_KILL;
- BUG_ON(bio_offset(bio) || bio_iovec(bio)->bv_len != sdp->sector_size);
+ BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size);
sector >>= ilog2(sdp->sector_size) - 9;
nr_sectors >>= ilog2(sdp->sector_size) - 9;
/**
* sd_eh_action - error handling callback
* @scmd: sd-issued command that has failed
- * @eh_cmnd: The command that was sent during error handling
- * @eh_cmnd_len: Length of eh_cmnd in bytes
* @eh_disp: The recovery disposition suggested by the midlayer
*
- * This function is called by the SCSI midlayer upon completion of
- * an error handling command (TEST UNIT READY, START STOP UNIT,
- * etc.) The command sent to the device by the error handler is
- * stored in eh_cmnd. The result of sending the eh command is
- * passed in eh_disp.
+ * This function is called by the SCSI midlayer upon completion of an
+ * error test command (currently TEST UNIT READY). The result of sending
+ * the eh command is passed in eh_disp. We're looking for devices that
+ * fail medium access commands but are OK with non access commands like
+ * test unit ready (so wrongly see the device as having a successful
+ * recovery)
**/
-static int sd_eh_action(struct scsi_cmnd *scmd, unsigned char *eh_cmnd,
- int eh_cmnd_len, int eh_disp)
+static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp)
{
struct scsi_disk *sdkp = scsi_disk(scmd->request->rq_disk);
if (!scsi_device_online(scmd->device) ||
- !scsi_medium_access_command(scmd))
+ !scsi_medium_access_command(scmd) ||
+ host_byte(scmd->result) != DID_TIME_OUT ||
+ eh_disp != SUCCESS)
return eh_disp;
/*
* process of recovering or has it suffered an internal failure
* that prevents access to the storage medium.
*/
- if (host_byte(scmd->result) == DID_TIME_OUT && eh_disp == SUCCESS &&
- eh_cmnd_len && eh_cmnd[0] == TEST_UNIT_READY)
- sdkp->medium_access_timed_out++;
+ sdkp->medium_access_timed_out++;
/*
* If the device keeps failing read/write commands but TEST UNIT
end_lba <<= 1;
} else {
/* be careful ... don't want any overflows */
- u64 factor = scmd->device->sector_size / 512;
+ unsigned int factor = scmd->device->sector_size / 512;
do_div(start_lba, factor);
do_div(end_lba, factor);
}
struct cl_object *obj = ll_i2info(inode)->lli_clob;
pgoff_t offset;
int ret;
- int i;
int rw;
obd_count page_count = 0;
- struct bio_vec *bvec;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
struct bio *bio;
ssize_t bytes;
for (bio = head; bio != NULL; bio = bio->bi_next) {
LASSERT(rw == bio->bi_rw);
- offset = (pgoff_t)(bio->bi_sector << 9) + lo->lo_offset;
- bio_for_each_segment(bvec, bio, i) {
- BUG_ON(bvec->bv_offset != 0);
- BUG_ON(bvec->bv_len != PAGE_CACHE_SIZE);
+ offset = (pgoff_t)(bio->bi_iter.bi_sector << 9) + lo->lo_offset;
+ bio_for_each_segment(bvec, bio, iter) {
+ BUG_ON(bvec.bv_offset != 0);
+ BUG_ON(bvec.bv_len != PAGE_CACHE_SIZE);
- pages[page_count] = bvec->bv_page;
+ pages[page_count] = bvec.bv_page;
offsets[page_count] = offset;
page_count++;
- offset += bvec->bv_len;
+ offset += bvec.bv_len;
}
LASSERT(page_count <= LLOOP_MAX_SEGMENTS);
}
bio = &lo->lo_bio;
while (*bio && (*bio)->bi_rw == rw) {
CDEBUG(D_INFO, "bio sector %llu size %u count %u vcnt%u \n",
- (unsigned long long)(*bio)->bi_sector, (*bio)->bi_size,
+ (unsigned long long)(*bio)->bi_iter.bi_sector,
+ (*bio)->bi_iter.bi_size,
page_count, (*bio)->bi_vcnt);
if (page_count + (*bio)->bi_vcnt > LLOOP_MAX_SEGMENTS)
break;
goto err;
CDEBUG(D_INFO, "submit bio sector %llu size %u\n",
- (unsigned long long)old_bio->bi_sector, old_bio->bi_size);
+ (unsigned long long)old_bio->bi_iter.bi_sector,
+ old_bio->bi_iter.bi_size);
spin_lock_irq(&lo->lo_lock);
inactive = (lo->lo_state != LLOOP_BOUND);
loop_add_bio(lo, old_bio);
return;
err:
- cfs_bio_io_error(old_bio, old_bio->bi_size);
+ cfs_bio_io_error(old_bio, old_bio->bi_iter.bi_size);
}
while (bio) {
struct bio *tmp = bio->bi_next;
bio->bi_next = NULL;
- cfs_bio_endio(bio, bio->bi_size, ret);
+ cfs_bio_endio(bio, bio->bi_iter.bi_size, ret);
bio = tmp;
}
}
module_init(lloop_init);
module_exit(lloop_exit);
-CFS_MODULE_PARM(max_loop, "i", int, 0444, "maximum of lloop_device");
+module_param(max_loop, int, 0444);
+MODULE_PARM_DESC(max_loop, "maximum of lloop_device");
MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
MODULE_DESCRIPTION("Lustre virtual block device");
MODULE_LICENSE("GPL");
unsigned long bio_flags)
{
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
- u64 logical = (u64)bio->bi_sector << 9;
+ u64 logical = (u64)bio->bi_iter.bi_sector << 9;
u64 length = 0;
u64 map_length;
int ret;
if (bio_flags & EXTENT_BIO_COMPRESSED)
return 0;
- length = bio->bi_size;
+ length = bio->bi_iter.bi_size;
map_length = length;
ret = btrfs_map_block(root->fs_info, rw, logical,
&map_length, NULL, 0);
* these flags set. For all other operations the VFS set these flags
* explicitly if it wants a timestamp update.
*/
- if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
- inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
+ if (newsize != oldsize) {
+ inode_inc_iversion(inode);
+ if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
+ inode->i_ctime = inode->i_mtime =
+ current_fs_time(inode->i_sb);
+ }
if (newsize > oldsize) {
truncate_pagecache(inode, newsize);
err = btrfs_dirty_inode(inode);
if (!err && attr->ia_valid & ATTR_MODE)
- err = btrfs_acl_chmod(inode);
+ err = posix_acl_chmod(inode, inode->i_mode);
}
return err;
static void btrfs_endio_direct_read(struct bio *bio, int err)
{
struct btrfs_dio_private *dip = bio->bi_private;
- struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
- struct bio_vec *bvec = bio->bi_io_vec;
+ struct bio_vec *bvec;
struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct bio *dio_bio;
u32 *csums = (u32 *)dip->csum;
- int index = 0;
u64 start;
+ int i;
start = dip->logical_offset;
- do {
+ bio_for_each_segment_all(bvec, bio, i) {
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
struct page *page = bvec->bv_page;
char *kaddr;
local_irq_restore(flags);
flush_dcache_page(bvec->bv_page);
- if (csum != csums[index]) {
+ if (csum != csums[i]) {
btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
btrfs_ino(inode), start, csum,
- csums[index]);
+ csums[i]);
err = -EIO;
}
}
start += bvec->bv_len;
- bvec++;
- index++;
- } while (bvec <= bvec_end);
+ }
unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
dip->logical_offset + dip->bytes - 1);
printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu "
"sector %#Lx len %u err no %d\n",
btrfs_ino(dip->inode), bio->bi_rw,
- (unsigned long long)bio->bi_sector, bio->bi_size, err);
+ (unsigned long long)bio->bi_iter.bi_sector,
+ bio->bi_iter.bi_size, err);
dip->errors = 1;
/*
struct bio *bio;
struct bio *orig_bio = dip->orig_bio;
struct bio_vec *bvec = orig_bio->bi_io_vec;
- u64 start_sector = orig_bio->bi_sector;
+ u64 start_sector = orig_bio->bi_iter.bi_sector;
u64 file_offset = dip->logical_offset;
u64 submit_len = 0;
u64 map_length;
int ret = 0;
int async_submit = 0;
- map_length = orig_bio->bi_size;
+ map_length = orig_bio->bi_iter.bi_size;
ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
&map_length, NULL, 0);
if (ret) {
return -EIO;
}
- if (map_length >= orig_bio->bi_size) {
+ if (map_length >= orig_bio->bi_iter.bi_size) {
bio = orig_bio;
goto submit;
}
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
- map_length = orig_bio->bi_size;
+ map_length = orig_bio->bi_iter.bi_size;
ret = btrfs_map_block(root->fs_info, rw,
start_sector << 9,
&map_length, NULL, 0);
if (!skip_sum && !write) {
csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
- sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits;
+ sum_len = dio_bio->bi_iter.bi_size >>
+ inode->i_sb->s_blocksize_bits;
sum_len *= csum_size;
} else {
sum_len = 0;
dip->private = dio_bio->bi_private;
dip->inode = inode;
dip->logical_offset = file_offset;
- dip->bytes = dio_bio->bi_size;
- dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
+ dip->bytes = dio_bio->bi_iter.bi_size;
+ dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
io_bio->bi_private = dip;
dip->errors = 0;
dip->orig_bio = io_bio;
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
.get_acl = btrfs_get_acl,
+ .set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
};
static const struct inode_operations btrfs_dir_ro_inode_operations = {
.lookup = btrfs_lookup,
.permission = btrfs_permission,
.get_acl = btrfs_get_acl,
+ .set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
};
.permission = btrfs_permission,
.fiemap = btrfs_fiemap,
.get_acl = btrfs_get_acl,
+ .set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
};
static const struct inode_operations btrfs_special_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.get_acl = btrfs_get_acl,
+ .set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
};
static const struct inode_operations btrfs_symlink_inode_operations = {
.getxattr = btrfs_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
- .get_acl = btrfs_get_acl,
.update_time = btrfs_update_time,
};
#include "segment.h"
#include <trace/events/f2fs.h>
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+static void f2fs_read_end_io(struct bio *bio, int err)
+{
- do {
++ struct bio_vec *bvec;
++ int i;
+
- if (--bvec >= bio->bi_io_vec)
- prefetchw(&bvec->bv_page->flags);
-
- if (unlikely(!uptodate)) {
++ bio_for_each_segment_all(bvec, bio, i) {
+ struct page *page = bvec->bv_page;
+
- } else {
- SetPageUptodate(page);
++ if (!err) {
++ SetPageUptodate(page);
++ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
- } while (bvec >= bio->bi_io_vec);
-
+ }
+ unlock_page(page);
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
- struct f2fs_sb_info *sbi = F2FS_SB(bvec->bv_page->mapping->host->i_sb);
++ }
+ bio_put(bio);
+}
+
+static void f2fs_write_end_io(struct bio *bio, int err)
+{
- do {
++ struct f2fs_sb_info *sbi = F2FS_SB(bio->bi_io_vec->bv_page->mapping->host->i_sb);
++ struct bio_vec *bvec;
++ int i;
+
- if (--bvec >= bio->bi_io_vec)
- prefetchw(&bvec->bv_page->flags);
-
- if (unlikely(!uptodate)) {
++ bio_for_each_segment_all(bvec, bio, i) {
+ struct page *page = bvec->bv_page;
+
- } while (bvec >= bio->bi_io_vec);
++ if (unlikely(err)) {
+ SetPageError(page);
+ set_bit(AS_EIO, &page->mapping->flags);
+ set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+ sbi->sb->s_flags |= MS_RDONLY;
+ }
+ end_page_writeback(page);
+ dec_page_count(sbi, F2FS_WRITEBACK);
- bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
++ }
+
+ if (bio->bi_private)
+ complete(bio->bi_private);
+
+ if (!get_pages(sbi, F2FS_WRITEBACK) &&
+ !list_empty(&sbi->cp_wait.task_list))
+ wake_up(&sbi->cp_wait);
+
+ bio_put(bio);
+}
+
+/*
+ * Low-level block read/write IO operations.
+ */
+static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
+ int npages, bool is_read)
+{
+ struct bio *bio;
+
+ /* No failure on bio allocation */
+ bio = bio_alloc(GFP_NOIO, npages);
+
+ bio->bi_bdev = sbi->sb->s_bdev;
++ bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
+ bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
+
+ return bio;
+}
+
+static void __submit_merged_bio(struct f2fs_bio_info *io)
+{
+ struct f2fs_io_info *fio = &io->fio;
+ int rw;
+
+ if (!io->bio)
+ return;
+
+ rw = fio->rw;
+
+ if (is_read_io(rw)) {
+ trace_f2fs_submit_read_bio(io->sbi->sb, rw,
+ fio->type, io->bio);
+ submit_bio(rw, io->bio);
+ } else {
+ trace_f2fs_submit_write_bio(io->sbi->sb, rw,
+ fio->type, io->bio);
+ /*
+ * META_FLUSH is only from the checkpoint procedure, and we
+ * should wait this metadata bio for FS consistency.
+ */
+ if (fio->type == META_FLUSH) {
+ DECLARE_COMPLETION_ONSTACK(wait);
+ io->bio->bi_private = &wait;
+ submit_bio(rw, io->bio);
+ wait_for_completion(&wait);
+ } else {
+ submit_bio(rw, io->bio);
+ }
+ }
+
+ io->bio = NULL;
+}
+
+void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
+ enum page_type type, int rw)
+{
+ enum page_type btype = PAGE_TYPE_OF_BIO(type);
+ struct f2fs_bio_info *io;
+
+ io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype];
+
+ mutex_lock(&io->io_mutex);
+
+ /* change META to META_FLUSH in the checkpoint procedure */
+ if (type >= META_FLUSH) {
+ io->fio.type = META_FLUSH;
+ io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
+ }
+ __submit_merged_bio(io);
+ mutex_unlock(&io->io_mutex);
+}
+
+/*
+ * Fill the locked page with data located in the block address.
+ * Return unlocked page.
+ */
+int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
+ block_t blk_addr, int rw)
+{
+ struct bio *bio;
+
+ trace_f2fs_submit_page_bio(page, blk_addr, rw);
+
+ /* Allocate a new bio */
+ bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw));
+
+ if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+ bio_put(bio);
+ f2fs_put_page(page, 1);
+ return -EFAULT;
+ }
+
+ submit_bio(rw, bio);
+ return 0;
+}
+
+void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
+ block_t blk_addr, struct f2fs_io_info *fio)
+{
+ enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
+ struct f2fs_bio_info *io;
+ bool is_read = is_read_io(fio->rw);
+
+ io = is_read ? &sbi->read_io : &sbi->write_io[btype];
+
+ verify_block_addr(sbi, blk_addr);
+
+ mutex_lock(&io->io_mutex);
+
+ if (!is_read)
+ inc_page_count(sbi, F2FS_WRITEBACK);
+
+ if (io->bio && (io->last_block_in_bio != blk_addr - 1 ||
+ io->fio.rw != fio->rw))
+ __submit_merged_bio(io);
+alloc_new:
+ if (io->bio == NULL) {
+ int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+
+ io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
+ io->fio = *fio;
+ }
+
+ if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) <
+ PAGE_CACHE_SIZE) {
+ __submit_merged_bio(io);
+ goto alloc_new;
+ }
+
+ io->last_block_in_bio = blk_addr;
+
+ mutex_unlock(&io->io_mutex);
+ trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr);
+}
+
/*
* Lock ordering for the change of data block address:
* ->data_page
struct page *node_page = dn->node_page;
unsigned int ofs_in_node = dn->ofs_in_node;
- f2fs_wait_on_page_writeback(node_page, NODE, false);
+ f2fs_wait_on_page_writeback(node_page, NODE);
rn = F2FS_NODE(node_page);
{
struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
- if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))
+ if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
return -EPERM;
- if (!inc_valid_block_count(sbi, dn->inode, 1))
+ if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
return -ENOSPC;
trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
__set_data_blkaddr(dn, NEW_ADDR);
dn->data_blkaddr = NEW_ADDR;
+ mark_inode_dirty(dn->inode);
sync_inode_page(dn);
return 0;
}
+int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
+{
+ bool need_put = dn->inode_page ? false : true;
+ int err;
+
+ /* if inode_page exists, index should be zero */
+ f2fs_bug_on(!need_put && index);
+
+ err = get_dnode_of_data(dn, index, ALLOC_NODE);
+ if (err)
+ return err;
+
+ if (dn->data_blkaddr == NULL_ADDR)
+ err = reserve_new_block(dn);
+ if (err || need_put)
+ f2fs_put_dnode(dn);
+ return err;
+}
+
static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
struct buffer_head *bh_result)
{
pgoff_t start_fofs, end_fofs;
block_t start_blkaddr;
+ if (is_inode_flag_set(fi, FI_NO_EXTENT))
+ return 0;
+
read_lock(&fi->ext.ext_lock);
if (fi->ext.len == 0) {
read_unlock(&fi->ext.ext_lock);
struct f2fs_inode_info *fi = F2FS_I(dn->inode);
pgoff_t fofs, start_fofs, end_fofs;
block_t start_blkaddr, end_blkaddr;
+ int need_update = true;
f2fs_bug_on(blk_addr == NEW_ADDR);
fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
/* Update the page address in the parent node */
__set_data_blkaddr(dn, blk_addr);
+ if (is_inode_flag_set(fi, FI_NO_EXTENT))
+ return;
+
write_lock(&fi->ext.ext_lock);
start_fofs = fi->ext.fofs;
fofs - start_fofs + 1;
fi->ext.len -= fofs - start_fofs + 1;
}
- goto end_update;
+ } else {
+ need_update = false;
}
- write_unlock(&fi->ext.ext_lock);
- return;
+ /* Finally, if the extent is very fragmented, let's drop the cache. */
+ if (fi->ext.len < F2FS_MIN_EXTENT_LEN) {
+ fi->ext.len = 0;
+ set_inode_flag(fi, FI_NO_EXTENT);
+ need_update = true;
+ }
end_update:
write_unlock(&fi->ext.ext_lock);
- sync_inode_page(dn);
+ if (need_update)
+ sync_inode_page(dn);
+ return;
}
struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
return ERR_PTR(-ENOENT);
/* By fallocate(), there is no cached page, but with NEW_ADDR */
- if (dn.data_blkaddr == NEW_ADDR)
+ if (unlikely(dn.data_blkaddr == NEW_ADDR))
return ERR_PTR(-EINVAL);
page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
return page;
}
- err = f2fs_readpage(sbi, page, dn.data_blkaddr,
+ err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
sync ? READ_SYNC : READA);
+ if (err)
+ return ERR_PTR(err);
+
if (sync) {
wait_on_page_locked(page);
- if (!PageUptodate(page)) {
+ if (unlikely(!PageUptodate(page))) {
f2fs_put_page(page, 0);
return ERR_PTR(-EIO);
}
}
f2fs_put_dnode(&dn);
- if (dn.data_blkaddr == NULL_ADDR) {
+ if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
f2fs_put_page(page, 1);
return ERR_PTR(-ENOENT);
}
return page;
}
- err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
+ err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC);
if (err)
return ERR_PTR(err);
lock_page(page);
- if (!PageUptodate(page)) {
+ if (unlikely(!PageUptodate(page))) {
f2fs_put_page(page, 1);
return ERR_PTR(-EIO);
}
- if (page->mapping != mapping) {
+ if (unlikely(page->mapping != mapping)) {
f2fs_put_page(page, 1);
goto repeat;
}
* Caller ensures that this data page is never allocated.
* A new zero-filled data page is allocated in the page cache.
*
- * Also, caller should grab and release a mutex by calling mutex_lock_op() and
- * mutex_unlock_op().
- * Note that, npage is set only by make_empty_dir.
+ * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and
+ * f2fs_unlock_op().
+ * Note that, ipage is set only by make_empty_dir.
*/
struct page *get_new_data_page(struct inode *inode,
- struct page *npage, pgoff_t index, bool new_i_size)
+ struct page *ipage, pgoff_t index, bool new_i_size)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
int err;
- set_new_dnode(&dn, inode, npage, npage, 0);
- err = get_dnode_of_data(&dn, index, ALLOC_NODE);
+ set_new_dnode(&dn, inode, ipage, NULL, 0);
+ err = f2fs_reserve_block(&dn, index);
if (err)
return ERR_PTR(err);
-
- if (dn.data_blkaddr == NULL_ADDR) {
- if (reserve_new_block(&dn)) {
- if (!npage)
- f2fs_put_dnode(&dn);
- return ERR_PTR(-ENOSPC);
- }
- }
- if (!npage)
- f2fs_put_dnode(&dn);
repeat:
page = grab_cache_page(mapping, index);
- if (!page)
- return ERR_PTR(-ENOMEM);
+ if (!page) {
+ err = -ENOMEM;
+ goto put_err;
+ }
if (PageUptodate(page))
return page;
zero_user_segment(page, 0, PAGE_CACHE_SIZE);
SetPageUptodate(page);
} else {
- err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
+ err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
+ READ_SYNC);
if (err)
- return ERR_PTR(err);
+ goto put_err;
+
lock_page(page);
- if (!PageUptodate(page)) {
+ if (unlikely(!PageUptodate(page))) {
f2fs_put_page(page, 1);
- return ERR_PTR(-EIO);
+ err = -EIO;
+ goto put_err;
}
- if (page->mapping != mapping) {
+ if (unlikely(page->mapping != mapping)) {
f2fs_put_page(page, 1);
goto repeat;
}
i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
/* Only the directory inode sets new_i_size */
set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
- mark_inode_dirty_sync(inode);
}
return page;
-}
-
-static void read_end_io(struct bio *bio, int err)
-{
- struct bio_vec *bvec;
- int i;
-
- bio_for_each_segment_all(bvec, bio, i) {
- struct page *page = bvec->bv_page;
- if (!err) {
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- unlock_page(page);
- }
- bio_put(bio);
+put_err:
+ f2fs_put_dnode(&dn);
+ return ERR_PTR(err);
}
-/*
- * Fill the locked page with data located in the block address.
- * Return unlocked page.
- */
-int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
- block_t blk_addr, int type)
+static int __allocate_data_block(struct dnode_of_data *dn)
{
- struct block_device *bdev = sbi->sb->s_bdev;
- struct bio *bio;
+ struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_summary sum;
+ block_t new_blkaddr;
+ struct node_info ni;
+ int type;
- trace_f2fs_readpage(page, blk_addr, type);
+ if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
+ return -EPERM;
+ if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
+ return -ENOSPC;
- down_read(&sbi->bio_sem);
+ __set_data_blkaddr(dn, NEW_ADDR);
+ dn->data_blkaddr = NEW_ADDR;
- /* Allocate a new bio */
- bio = f2fs_bio_alloc(bdev, 1);
+ get_node_info(sbi, dn->nid, &ni);
+ set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
- /* Initialize the bio */
- bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
- bio->bi_end_io = read_end_io;
+ type = CURSEG_WARM_DATA;
- if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
- bio_put(bio);
- up_read(&sbi->bio_sem);
- f2fs_put_page(page, 1);
- return -EFAULT;
- }
+ allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type);
- submit_bio(type, bio);
- up_read(&sbi->bio_sem);
+ /* direct IO doesn't use extent cache to maximize the performance */
+ set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
+ update_extent_cache(new_blkaddr, dn);
+ clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
+
+ dn->data_blkaddr = new_blkaddr;
return 0;
}
/*
- * This function should be used by the data read flow only where it
- * does not check the "create" flag that indicates block allocation.
- * The reason for this special functionality is to exploit VFS readahead
- * mechanism.
+ * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh.
+ * If original data blocks are allocated, then give them to blockdev.
+ * Otherwise,
+ * a. preallocate requested block addresses
+ * b. do not use extent cache for better performance
+ * c. give the block addresses to blockdev
*/
-static int get_data_block_ro(struct inode *inode, sector_t iblock,
+static int get_data_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
unsigned int blkbits = inode->i_sb->s_blocksize_bits;
unsigned maxblocks = bh_result->b_size >> blkbits;
struct dnode_of_data dn;
- pgoff_t pgofs;
- int err;
+ int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
+ pgoff_t pgofs, end_offset;
+ int err = 0, ofs = 1;
+ bool allocated = false;
/* Get the page offset from the block offset(iblock) */
pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
- if (check_extent_cache(inode, pgofs, bh_result)) {
- trace_f2fs_get_data_block(inode, iblock, bh_result, 0);
- return 0;
- }
+ if (check_extent_cache(inode, pgofs, bh_result))
+ goto out;
+
+ if (create)
+ f2fs_lock_op(sbi);
/* When reading holes, we need its node page */
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
+ err = get_dnode_of_data(&dn, pgofs, mode);
if (err) {
- trace_f2fs_get_data_block(inode, iblock, bh_result, err);
- return (err == -ENOENT) ? 0 : err;
+ if (err == -ENOENT)
+ err = 0;
+ goto unlock_out;
}
+ if (dn.data_blkaddr == NEW_ADDR)
+ goto put_out;
- /* It does not support data allocation */
- f2fs_bug_on(create);
+ if (dn.data_blkaddr != NULL_ADDR) {
+ map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
+ } else if (create) {
+ err = __allocate_data_block(&dn);
+ if (err)
+ goto put_out;
+ allocated = true;
+ map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
+ } else {
+ goto put_out;
+ }
- if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) {
- int i;
- unsigned int end_offset;
+ end_offset = IS_INODE(dn.node_page) ?
+ ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
+ bh_result->b_size = (((size_t)1) << blkbits);
+ dn.ofs_in_node++;
+ pgofs++;
+
+get_next:
+ if (dn.ofs_in_node >= end_offset) {
+ if (allocated)
+ sync_inode_page(&dn);
+ allocated = false;
+ f2fs_put_dnode(&dn);
- end_offset = IS_INODE(dn.node_page) ?
- ADDRS_PER_INODE(F2FS_I(inode)) :
- ADDRS_PER_BLOCK;
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, pgofs, mode);
+ if (err) {
+ if (err == -ENOENT)
+ err = 0;
+ goto unlock_out;
+ }
+ if (dn.data_blkaddr == NEW_ADDR)
+ goto put_out;
- clear_buffer_new(bh_result);
+ end_offset = IS_INODE(dn.node_page) ?
+ ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
+ }
+ if (maxblocks > (bh_result->b_size >> blkbits)) {
+ block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+ if (blkaddr == NULL_ADDR && create) {
+ err = __allocate_data_block(&dn);
+ if (err)
+ goto sync_out;
+ allocated = true;
+ blkaddr = dn.data_blkaddr;
+ }
/* Give more consecutive addresses for the read ahead */
- for (i = 0; i < end_offset - dn.ofs_in_node; i++)
- if (((datablock_addr(dn.node_page,
- dn.ofs_in_node + i))
- != (dn.data_blkaddr + i)) || maxblocks == i)
- break;
- map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
- bh_result->b_size = (i << blkbits);
+ if (blkaddr == (bh_result->b_blocknr + ofs)) {
+ ofs++;
+ dn.ofs_in_node++;
+ pgofs++;
+ bh_result->b_size += (((size_t)1) << blkbits);
+ goto get_next;
+ }
}
+sync_out:
+ if (allocated)
+ sync_inode_page(&dn);
+put_out:
f2fs_put_dnode(&dn);
- trace_f2fs_get_data_block(inode, iblock, bh_result, 0);
- return 0;
+unlock_out:
+ if (create)
+ f2fs_unlock_op(sbi);
+out:
+ trace_f2fs_get_data_block(inode, iblock, bh_result, err);
+ return err;
}
static int f2fs_read_data_page(struct file *file, struct page *page)
{
- return mpage_readpage(page, get_data_block_ro);
+ struct inode *inode = page->mapping->host;
+ int ret;
+
+ /* If the file has inline data, try to read it directlly */
+ if (f2fs_has_inline_data(inode))
+ ret = f2fs_read_inline_data(inode, page);
+ else
+ ret = mpage_readpage(page, get_data_block);
+
+ return ret;
}
static int f2fs_read_data_pages(struct file *file,
struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
- return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro);
+ struct inode *inode = file->f_mapping->host;
+
+ /* If the file has inline data, skip readpages */
+ if (f2fs_has_inline_data(inode))
+ return 0;
+
+ return mpage_readpages(mapping, pages, nr_pages, get_data_block);
}
-int do_write_data_page(struct page *page)
+int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
{
struct inode *inode = page->mapping->host;
- block_t old_blk_addr, new_blk_addr;
+ block_t old_blkaddr, new_blkaddr;
struct dnode_of_data dn;
int err = 0;
if (err)
return err;
- old_blk_addr = dn.data_blkaddr;
+ old_blkaddr = dn.data_blkaddr;
/* This page is already truncated */
- if (old_blk_addr == NULL_ADDR)
+ if (old_blkaddr == NULL_ADDR)
goto out_writepage;
set_page_writeback(page);
* If current allocation needs SSR,
* it had better in-place writes for updated data.
*/
- if (unlikely(old_blk_addr != NEW_ADDR &&
+ if (unlikely(old_blkaddr != NEW_ADDR &&
!is_cold_data(page) &&
need_inplace_update(inode))) {
- rewrite_data_page(F2FS_SB(inode->i_sb), page,
- old_blk_addr);
+ rewrite_data_page(page, old_blkaddr, fio);
} else {
- write_data_page(inode, page, &dn,
- old_blk_addr, &new_blk_addr);
- update_extent_cache(new_blk_addr, &dn);
+ write_data_page(page, &dn, &new_blkaddr, fio);
+ update_extent_cache(new_blkaddr, &dn);
}
out_writepage:
f2fs_put_dnode(&dn);
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = ((unsigned long long) i_size)
>> PAGE_CACHE_SHIFT;
- unsigned offset;
+ unsigned offset = 0;
bool need_balance_fs = false;
int err = 0;
+ struct f2fs_io_info fio = {
+ .type = DATA,
+ .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
+ };
if (page->index < end_index)
goto write;
zero_user_segment(page, offset, PAGE_CACHE_SIZE);
write:
- if (sbi->por_doing) {
+ if (unlikely(sbi->por_doing)) {
err = AOP_WRITEPAGE_ACTIVATE;
goto redirty_out;
}
if (S_ISDIR(inode->i_mode)) {
dec_page_count(sbi, F2FS_DIRTY_DENTS);
inode_dec_dirty_dents(inode);
- err = do_write_data_page(page);
+ err = do_write_data_page(page, &fio);
} else {
f2fs_lock_op(sbi);
- err = do_write_data_page(page);
+
+ if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) {
+ err = f2fs_write_inline_data(inode, page, offset);
+ f2fs_unlock_op(sbi);
+ goto out;
+ } else {
+ err = do_write_data_page(page, &fio);
+ }
+
f2fs_unlock_op(sbi);
need_balance_fs = true;
}
else if (err)
goto redirty_out;
- if (wbc->for_reclaim)
- f2fs_submit_bio(sbi, DATA, true);
+ if (wbc->for_reclaim) {
+ f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ need_balance_fs = false;
+ }
clear_cold_data(page);
out:
ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
if (locked)
mutex_unlock(&sbi->writepages);
- f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL));
+
+ f2fs_submit_merged_bio(sbi, DATA, WRITE);
remove_dirty_dir_inode(inode);
f2fs_balance_fs(sbi);
repeat:
+ err = f2fs_convert_inline_data(inode, pos + len);
+ if (err)
+ return err;
+
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
- f2fs_lock_op(sbi);
+ if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA)
+ goto inline_data;
+ f2fs_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, index, ALLOC_NODE);
- if (err)
- goto err;
-
- if (dn.data_blkaddr == NULL_ADDR)
- err = reserve_new_block(&dn);
-
- f2fs_put_dnode(&dn);
- if (err)
- goto err;
-
+ err = f2fs_reserve_block(&dn, index);
f2fs_unlock_op(sbi);
+ if (err) {
+ f2fs_put_page(page, 1);
+ return err;
+ }
+inline_data:
if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
return 0;
if (dn.data_blkaddr == NEW_ADDR) {
zero_user_segment(page, 0, PAGE_CACHE_SIZE);
} else {
- err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
+ if (f2fs_has_inline_data(inode))
+ err = f2fs_read_inline_data(inode, page);
+ else
+ err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
+ READ_SYNC);
if (err)
return err;
lock_page(page);
- if (!PageUptodate(page)) {
+ if (unlikely(!PageUptodate(page))) {
f2fs_put_page(page, 1);
return -EIO;
}
- if (page->mapping != mapping) {
+ if (unlikely(page->mapping != mapping)) {
f2fs_put_page(page, 1);
goto repeat;
}
SetPageUptodate(page);
clear_cold_data(page);
return 0;
-
-err:
- f2fs_unlock_op(sbi);
- f2fs_put_page(page, 1);
- return err;
}
static int f2fs_write_end(struct file *file,
update_inode_page(inode);
}
- unlock_page(page);
- page_cache_release(page);
+ f2fs_put_page(page, 1);
return copied;
}
+static int check_direct_IO(struct inode *inode, int rw,
+ const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+{
+ unsigned blocksize_mask = inode->i_sb->s_blocksize - 1;
+ int i;
+
+ if (rw == READ)
+ return 0;
+
+ if (offset & blocksize_mask)
+ return -EINVAL;
+
+ for (i = 0; i < nr_segs; i++)
+ if (iov[i].iov_len & blocksize_mask)
+ return -EINVAL;
+ return 0;
+}
+
static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset, unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- if (rw == WRITE)
+ /* Let buffer I/O handle the inline data case. */
+ if (f2fs_has_inline_data(inode))
+ return 0;
+
+ if (check_direct_IO(inode, rw, iov, offset, nr_segs))
return 0;
- /* Needs synchronization with the cleaner */
return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
- get_data_block_ro);
+ get_data_block);
}
static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
trace_f2fs_set_page_dirty(page, DATA);
SetPageUptodate(page);
+ mark_inode_dirty(inode);
+
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
set_dirty_dir_page(inode, page);
static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
{
- return generic_block_bmap(mapping, block, get_data_block_ro);
+ return generic_block_bmap(mapping, block, get_data_block);
}
const struct address_space_operations f2fs_dblock_aops = {
bd->bd_bh->b_data + bi->bi_offset, bi->bi_len);
clear_bit(GBF_FULL, &bi->bi_flags);
rgd->rd_free_clone = rgd->rd_free;
+ rgd->rd_extfail_pt = rgd->rd_free;
}
/**
nrvecs = max(nrvecs/2, 1U);
}
- bio->bi_sector = blkno * (sb->s_blocksize >> 9);
+ bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9);
bio->bi_bdev = sb->s_bdev;
bio->bi_end_io = gfs2_end_log_write;
bio->bi_private = sdp;
static void gfs2_meta_sync(struct gfs2_glock *gl)
{
struct address_space *mapping = gfs2_glock2aspace(gl);
+ struct gfs2_sbd *sdp = gl->gl_sbd;
int error;
+ if (mapping == NULL)
+ mapping = &sdp->sd_aspace;
+
filemap_fdatawrite(mapping);
error = filemap_fdatawait(mapping);
#include "log.h"
#include "quota.h"
#include "dir.h"
+#include "meta_io.h"
#include "trace_gfs2.h"
#define DO 0
static struct gfs2_sbd *init_sbd(struct super_block *sb)
{
struct gfs2_sbd *sdp;
+ struct address_space *mapping;
sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL);
if (!sdp)
init_waitqueue_head(&sdp->sd_quota_wait);
INIT_LIST_HEAD(&sdp->sd_trunc_list);
spin_lock_init(&sdp->sd_trunc_lock);
+ spin_lock_init(&sdp->sd_bitmap_lock);
+
+ mapping = &sdp->sd_aspace;
+
+ address_space_init_once(mapping);
+ mapping->a_ops = &gfs2_meta_aops;
+ mapping->host = sb->s_bdev->bd_inode;
+ mapping->flags = 0;
+ mapping_set_gfp_mask(mapping, GFP_NOFS);
+ mapping->private_data = NULL;
+ mapping->backing_dev_info = sb->s_bdi;
+ mapping->writeback_index = 0;
spin_lock_init(&sdp->sd_log_lock);
atomic_set(&sdp->sd_log_pinned, 0);
page = alloc_page(GFP_NOFS);
if (unlikely(!page))
- return -ENOBUFS;
+ return -ENOMEM;
ClearPageUptodate(page);
ClearPageDirty(page);
lock_page(page);
bio = bio_alloc(GFP_NOFS, 1);
- bio->bi_sector = sector * (sb->s_blocksize >> 9);
+ bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9);
bio->bi_bdev = sb->s_bdev;
bio_add_page(bio, page, PAGE_SIZE, 0);
return error;
}
-static int init_threads(struct gfs2_sbd *sdp, int undo)
-{
- struct task_struct *p;
- int error = 0;
-
- if (undo)
- goto fail_quotad;
-
- p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
- if (IS_ERR(p)) {
- error = PTR_ERR(p);
- fs_err(sdp, "can't start logd thread: %d\n", error);
- return error;
- }
- sdp->sd_logd_process = p;
-
- p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
- if (IS_ERR(p)) {
- error = PTR_ERR(p);
- fs_err(sdp, "can't start quotad thread: %d\n", error);
- goto fail;
- }
- sdp->sd_quotad_process = p;
-
- return 0;
-
-
-fail_quotad:
- kthread_stop(sdp->sd_quotad_process);
-fail:
- kthread_stop(sdp->sd_logd_process);
- return error;
-}
-
static const match_table_t nolock_tokens = {
{ Opt_jid, "jid=%d\n", },
{ Opt_err, NULL },
goto fail_per_node;
}
- error = init_threads(sdp, DO);
- if (error)
- goto fail_per_node;
-
if (!(sb->s_flags & MS_RDONLY)) {
error = gfs2_make_fs_rw(sdp);
if (error) {
fs_err(sdp, "can't make FS RW: %d\n", error);
- goto fail_threads;
+ goto fail_per_node;
}
}
gfs2_online_uevent(sdp);
return 0;
-fail_threads:
- init_threads(sdp, UNDO);
fail_per_node:
init_per_node(sdp, UNDO);
fail_inodes:
if (IS_ERR(s))
goto error_bdev;
- if (s->s_root)
+ if (s->s_root) {
+ /*
+ * s_umount nests inside bd_mutex during
+ * __invalidate_device(). blkdev_put() acquires
+ * bd_mutex and can't be called under s_umount. Drop
+ * s_umount temporarily. This is safe as we're
+ * holding an active reference.
+ */
+ up_write(&s->s_umount);
blkdev_put(bdev, mode);
+ down_write(&s->s_umount);
+ }
memset(&args, 0, sizeof(args));
args.ar_quota = GFS2_QUOTA_DEFAULT;
struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
ASSERT(bio->bi_private == NULL);
- bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
return bio;
}
lockmode = XFS_ILOCK_EXCL;
xfs_ilock(ip, lockmode);
} else {
- lockmode = xfs_ilock_map_shared(ip);
+ lockmode = xfs_ilock_data_map_shared(ip);
}
ASSERT(offset <= mp->m_super->s_maxbytes);
numbytes = BBTOB(numblks);
/* Check for IOs smaller than the sector size / not sector aligned */
- ASSERT(!(numbytes < (1 << btp->bt_sshift)));
- ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));
+ ASSERT(!(numbytes < btp->bt_meta_sectorsize));
+ ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
/*
* Corrupted block numbers can get through to here, unfortunately, so we
bio = bio_alloc(GFP_NOIO, nr_pages);
bio->bi_bdev = bp->b_target->bt_bdev;
- bio->bi_sector = sector;
+ bio->bi_iter.bi_sector = sector;
bio->bi_end_io = xfs_buf_bio_end_io;
bio->bi_private = bp;
total_nr_pages--;
}
- if (likely(bio->bi_size)) {
+ if (likely(bio->bi_iter.bi_size)) {
if (xfs_buf_is_vmapped(bp)) {
flush_kernel_vmap_range(bp->b_addr,
xfs_buf_vmap_len(bp));
kmem_free(btp);
}
-STATIC int
-xfs_setsize_buftarg_flags(
+int
+xfs_setsize_buftarg(
xfs_buftarg_t *btp,
unsigned int blocksize,
- unsigned int sectorsize,
- int verbose)
+ unsigned int sectorsize)
{
- btp->bt_bsize = blocksize;
- btp->bt_sshift = ffs(sectorsize) - 1;
- btp->bt_smask = sectorsize - 1;
+ /* Set up metadata sector size info */
+ btp->bt_meta_sectorsize = sectorsize;
+ btp->bt_meta_sectormask = sectorsize - 1;
if (set_blocksize(btp->bt_bdev, sectorsize)) {
char name[BDEVNAME_SIZE];
return EINVAL;
}
+ /* Set up device logical sector size mask */
+ btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
+ btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
+
return 0;
}
/*
- * When allocating the initial buffer target we have not yet
- * read in the superblock, so don't know what sized sectors
- * are being used at this early stage. Play safe.
+ * When allocating the initial buffer target we have not yet
+ * read in the superblock, so don't know what sized sectors
+ * are being used at this early stage. Play safe.
*/
STATIC int
xfs_setsize_buftarg_early(
xfs_buftarg_t *btp,
struct block_device *bdev)
{
- return xfs_setsize_buftarg_flags(btp,
- PAGE_SIZE, bdev_logical_block_size(bdev), 0);
-}
-
-int
-xfs_setsize_buftarg(
- xfs_buftarg_t *btp,
- unsigned int blocksize,
- unsigned int sectorsize)
-{
- return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
+ return xfs_setsize_buftarg(btp, PAGE_SIZE,
+ bdev_logical_block_size(bdev));
}
xfs_buftarg_t *
#ifndef __FS_CEPH_MESSENGER_H
#define __FS_CEPH_MESSENGER_H
+ #include <linux/blk_types.h>
#include <linux/kref.h>
#include <linux/mutex.h>
#include <linux/net.h>
u32 global_seq;
spinlock_t global_seq_lock;
- u32 supported_features;
- u32 required_features;
+ u64 supported_features;
+ u64 required_features;
};
enum ceph_msg_data_type {
#ifdef CONFIG_BLOCK
struct { /* bio */
struct bio *bio; /* bio from list */
- unsigned int vector_index; /* vector from bio */
- unsigned int vector_offset; /* bytes from vector */
+ struct bvec_iter bvec_iter;
};
#endif /* CONFIG_BLOCK */
struct { /* pages */
struct list_head list_head; /* links for connection lists */
struct kref kref;
- bool front_is_vmalloc;
bool more_to_follow;
bool needs_out_seq;
- int front_max;
+ int front_alloc_len;
unsigned long ack_stamp; /* tx: when we were acked */
struct ceph_msgpool *pool;
struct ceph_entity_name peer_name; /* peer name */
- unsigned peer_features;
+ u64 peer_features;
u32 connect_seq; /* identify the most recent connection
attempt for this connection, client */
u32 peer_global_seq; /* peer's global seq for this connection */
extern void ceph_messenger_init(struct ceph_messenger *msgr,
struct ceph_entity_addr *myaddr,
- u32 supported_features,
- u32 required_features,
+ u64 supported_features,
+ u64 required_features,
bool nocrc);
extern void ceph_con_init(struct ceph_connection *con, void *private,
{ META, "META" }, \
{ META_FLUSH, "META_FLUSH" })
-#define show_bio_type(type) \
- __print_symbolic(type, \
- { READ, "READ" }, \
- { READA, "READAHEAD" }, \
- { READ_SYNC, "READ_SYNC" }, \
- { WRITE, "WRITE" }, \
- { WRITE_SYNC, "WRITE_SYNC" }, \
- { WRITE_FLUSH, "WRITE_FLUSH" }, \
- { WRITE_FUA, "WRITE_FUA" })
+#define F2FS_BIO_MASK(t) (t & (READA | WRITE_FLUSH_FUA))
+#define F2FS_BIO_EXTRA_MASK(t) (t & (REQ_META | REQ_PRIO))
+
+#define show_bio_type(type) show_bio_base(type), show_bio_extra(type)
+
+#define show_bio_base(type) \
+ __print_symbolic(F2FS_BIO_MASK(type), \
+ { READ, "READ" }, \
+ { READA, "READAHEAD" }, \
+ { READ_SYNC, "READ_SYNC" }, \
+ { WRITE, "WRITE" }, \
+ { WRITE_SYNC, "WRITE_SYNC" }, \
+ { WRITE_FLUSH, "WRITE_FLUSH" }, \
+ { WRITE_FUA, "WRITE_FUA" }, \
+ { WRITE_FLUSH_FUA, "WRITE_FLUSH_FUA" })
+
+#define show_bio_extra(type) \
+ __print_symbolic(F2FS_BIO_EXTRA_MASK(type), \
+ { REQ_META, "(M)" }, \
+ { REQ_PRIO, "(P)" }, \
+ { REQ_META | REQ_PRIO, "(MP)" }, \
+ { 0, " \b" })
#define show_data_type(type) \
__print_symbolic(type, \
__entry->err)
);
-TRACE_EVENT_CONDITION(f2fs_readpage,
+TRACE_EVENT_CONDITION(f2fs_submit_page_bio,
TP_PROTO(struct page *page, sector_t blkaddr, int type),
),
TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, "
- "blkaddr = 0x%llx, bio_type = %s",
+ "blkaddr = 0x%llx, bio_type = %s%s",
show_dev_ino(__entry),
(unsigned long)__entry->index,
(unsigned long long)__entry->blkaddr,
__entry->ofs_in_node)
);
-TRACE_EVENT(f2fs_do_submit_bio,
+DECLARE_EVENT_CLASS(f2fs__submit_bio,
- TP_PROTO(struct super_block *sb, int btype, bool sync, struct bio *bio),
+ TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio),
- TP_ARGS(sb, btype, sync, bio),
+ TP_ARGS(sb, rw, type, bio),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(int, btype)
- __field(bool, sync)
+ __field(int, rw)
+ __field(int, type)
__field(sector_t, sector)
__field(unsigned int, size)
),
TP_fast_assign(
__entry->dev = sb->s_dev;
- __entry->btype = btype;
- __entry->sync = sync;
+ __entry->rw = rw;
+ __entry->type = type;
- __entry->sector = bio->bi_sector;
- __entry->size = bio->bi_size;
+ __entry->sector = bio->bi_iter.bi_sector;
+ __entry->size = bio->bi_iter.bi_size;
),
- TP_printk("dev = (%d,%d), type = %s, io = %s, sector = %lld, size = %u",
+ TP_printk("dev = (%d,%d), %s%s, %s, sector = %lld, size = %u",
show_dev(__entry),
- show_block_type(__entry->btype),
- __entry->sync ? "sync" : "no sync",
+ show_bio_type(__entry->rw),
+ show_block_type(__entry->type),
(unsigned long long)__entry->sector,
__entry->size)
);
+DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_write_bio,
+
+ TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio),
+
+ TP_ARGS(sb, rw, type, bio),
+
+ TP_CONDITION(bio)
+);
+
+DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_read_bio,
+
+ TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio),
+
+ TP_ARGS(sb, rw, type, bio),
+
+ TP_CONDITION(bio)
+);
+
DECLARE_EVENT_CLASS(f2fs__page,
TP_PROTO(struct page *page, int type),
TP_ARGS(page, type)
);
-TRACE_EVENT(f2fs_submit_write_page,
+TRACE_EVENT(f2fs_submit_page_mbio,
- TP_PROTO(struct page *page, block_t blk_addr, int type),
+ TP_PROTO(struct page *page, int rw, int type, block_t blk_addr),
- TP_ARGS(page, blk_addr, type),
+ TP_ARGS(page, rw, type, blk_addr),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(ino_t, ino)
+ __field(int, rw)
__field(int, type)
__field(pgoff_t, index)
__field(block_t, block)
TP_fast_assign(
__entry->dev = page->mapping->host->i_sb->s_dev;
__entry->ino = page->mapping->host->i_ino;
+ __entry->rw = rw;
__entry->type = type;
__entry->index = page->index;
__entry->block = blk_addr;
),
- TP_printk("dev = (%d,%d), ino = %lu, %s, index = %lu, blkaddr = 0x%llx",
+ TP_printk("dev = (%d,%d), ino = %lu, %s%s, %s, index = %lu, blkaddr = 0x%llx",
show_dev_ino(__entry),
+ show_bio_type(__entry->rw),
show_block_type(__entry->type),
(unsigned long)__entry->index,
(unsigned long long)__entry->block)
__entry->msg)
);
+TRACE_EVENT(f2fs_issue_discard,
+
+ TP_PROTO(struct super_block *sb, block_t blkstart, block_t blklen),
+
+ TP_ARGS(sb, blkstart, blklen),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(block_t, blkstart)
+ __field(block_t, blklen)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = sb->s_dev;
+ __entry->blkstart = blkstart;
+ __entry->blklen = blklen;
+ ),
+
+ TP_printk("dev = (%d,%d), blkstart = 0x%llx, blklen = 0x%llx",
+ show_dev(__entry),
+ (unsigned long long)__entry->blkstart,
+ (unsigned long long)__entry->blklen)
+);
#endif /* _TRACE_F2FS_H */
/* This part must be outside protection */
bio = bio_alloc(gfp_flags, 1);
if (bio) {
- bio->bi_sector = map_swap_page(page, &bio->bi_bdev);
- bio->bi_sector <<= PAGE_SHIFT - 9;
+ bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
+ bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
bio->bi_io_vec[0].bv_page = page;
bio->bi_io_vec[0].bv_len = PAGE_SIZE;
bio->bi_io_vec[0].bv_offset = 0;
bio->bi_vcnt = 1;
- bio->bi_size = PAGE_SIZE;
+ bio->bi_iter.bi_size = PAGE_SIZE;
bio->bi_end_io = end_io;
}
return bio;
printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_sector);
+ (unsigned long long)bio->bi_iter.bi_sector);
ClearPageReclaim(page);
}
end_page_writeback(page);
printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_sector);
+ (unsigned long long)bio->bi_iter.bi_sector);
goto out;
}
int ret = 0;
struct swap_info_struct *sis = page_swap_info(page);
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(PageUptodate(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageUptodate(page), page);
if (frontswap_load(page) == 0) {
SetPageUptodate(page);
unlock_page(page);
#include <linux/dns_resolver.h>
#include <net/tcp.h>
+#include <linux/ceph/ceph_features.h>
#include <linux/ceph/libceph.h>
#include <linux/ceph/messenger.h>
#include <linux/ceph/decode.h>
bio = data->bio;
BUG_ON(!bio);
- BUG_ON(!bio->bi_vcnt);
cursor->resid = min(length, data->bio_length);
cursor->bio = bio;
- cursor->vector_index = 0;
- cursor->vector_offset = 0;
- cursor->last_piece = length <= bio->bi_io_vec[0].bv_len;
+ cursor->bvec_iter = bio->bi_iter;
+ cursor->last_piece =
+ cursor->resid <= bio_iter_len(bio, cursor->bvec_iter);
}
static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
{
struct ceph_msg_data *data = cursor->data;
struct bio *bio;
- struct bio_vec *bio_vec;
- unsigned int index;
+ struct bio_vec bio_vec;
BUG_ON(data->type != CEPH_MSG_DATA_BIO);
bio = cursor->bio;
BUG_ON(!bio);
- index = cursor->vector_index;
- BUG_ON(index >= (unsigned int) bio->bi_vcnt);
+ bio_vec = bio_iter_iovec(bio, cursor->bvec_iter);
- bio_vec = &bio->bi_io_vec[index];
- BUG_ON(cursor->vector_offset >= bio_vec->bv_len);
- *page_offset = (size_t) (bio_vec->bv_offset + cursor->vector_offset);
+ *page_offset = (size_t) bio_vec.bv_offset;
BUG_ON(*page_offset >= PAGE_SIZE);
if (cursor->last_piece) /* pagelist offset is always 0 */
*length = cursor->resid;
else
- *length = (size_t) (bio_vec->bv_len - cursor->vector_offset);
+ *length = (size_t) bio_vec.bv_len;
BUG_ON(*length > cursor->resid);
BUG_ON(*page_offset + *length > PAGE_SIZE);
- return bio_vec->bv_page;
+ return bio_vec.bv_page;
}
static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
size_t bytes)
{
struct bio *bio;
- struct bio_vec *bio_vec;
- unsigned int index;
+ struct bio_vec bio_vec;
BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO);
bio = cursor->bio;
BUG_ON(!bio);
- index = cursor->vector_index;
- BUG_ON(index >= (unsigned int) bio->bi_vcnt);
- bio_vec = &bio->bi_io_vec[index];
+ bio_vec = bio_iter_iovec(bio, cursor->bvec_iter);
/* Advance the cursor offset */
BUG_ON(cursor->resid < bytes);
cursor->resid -= bytes;
- cursor->vector_offset += bytes;
- if (cursor->vector_offset < bio_vec->bv_len)
+
+ bio_advance_iter(bio, &cursor->bvec_iter, bytes);
+
+ if (bytes < bio_vec.bv_len)
return false; /* more bytes to process in this segment */
- BUG_ON(cursor->vector_offset != bio_vec->bv_len);
/* Move on to the next segment, and possibly the next bio */
- if (++index == (unsigned int) bio->bi_vcnt) {
+ if (!cursor->bvec_iter.bi_size) {
bio = bio->bi_next;
- index = 0;
+ cursor->bvec_iter = bio->bi_iter;
}
cursor->bio = bio;
- cursor->vector_index = index;
- cursor->vector_offset = 0;
if (!cursor->last_piece) {
BUG_ON(!cursor->resid);
BUG_ON(!bio);
/* A short read is OK, so use <= rather than == */
- if (cursor->resid <= bio->bi_io_vec[index].bv_len)
+ if (cursor->resid <= bio_iter_len(bio, cursor->bvec_iter))
cursor->last_piece = true;
}
port = (port * 10) + (*p - '0');
p++;
}
- if (port > 65535 || port == 0)
+ if (port == 0)
+ port = CEPH_MON_PORT;
+ else if (port > 65535)
goto bad;
} else {
port = CEPH_MON_PORT;
{
u64 sup_feat = con->msgr->supported_features;
u64 req_feat = con->msgr->required_features;
- u64 server_feat = le64_to_cpu(con->in_reply.features);
+ u64 server_feat = ceph_sanitize_features(
+ le64_to_cpu(con->in_reply.features));
int ret;
dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
*/
void ceph_messenger_init(struct ceph_messenger *msgr,
struct ceph_entity_addr *myaddr,
- u32 supported_features,
- u32 required_features,
+ u64 supported_features,
+ u64 required_features,
bool nocrc)
{
msgr->supported_features = supported_features;
INIT_LIST_HEAD(&m->data);
/* front */
- m->front_max = front_len;
if (front_len) {
- if (front_len > PAGE_CACHE_SIZE) {
- m->front.iov_base = __vmalloc(front_len, flags,
- PAGE_KERNEL);
- m->front_is_vmalloc = true;
- } else {
- m->front.iov_base = kmalloc(front_len, flags);
- }
+ m->front.iov_base = ceph_kvmalloc(front_len, flags);
if (m->front.iov_base == NULL) {
dout("ceph_msg_new can't allocate %d bytes\n",
front_len);
} else {
m->front.iov_base = NULL;
}
- m->front.iov_len = front_len;
+ m->front_alloc_len = m->front.iov_len = front_len;
dout("ceph_msg_new %p front %d\n", m, front_len);
return m;
void ceph_msg_kfree(struct ceph_msg *m)
{
dout("msg_kfree %p\n", m);
- if (m->front_is_vmalloc)
- vfree(m->front.iov_base);
- else
- kfree(m->front.iov_base);
+ ceph_kvfree(m->front.iov_base);
kmem_cache_free(ceph_msg_cache, m);
}
void ceph_msg_dump(struct ceph_msg *msg)
{
- pr_debug("msg_dump %p (front_max %d length %zd)\n", msg,
- msg->front_max, msg->data_length);
+ pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg,
+ msg->front_alloc_len, msg->data_length);
print_hex_dump(KERN_DEBUG, "header: ",
DUMP_PREFIX_OFFSET, 16, 1,
&msg->hdr, sizeof(msg->hdr), true);