From: Linus Torvalds
Date: Thu, 30 Jan 2014 19:19:05 +0000 (-0800)
Subject: Merge branch 'for-3.14/core' of git://git.kernel.dk/linux-block
X-Git-Url: http://pileus.org/git/?p=~andy%2Flinux;a=commitdiff_plain;h=f568849edac8611d603e00bd6cbbcfea09395ae6;hp=-c

Merge branch 'for-3.14/core' of git://git.kernel.dk/linux-block

Pull core block IO changes from Jens Axboe:
 "The major piece in here is the immutable bio_vec series from Kent,
  the rest is fairly minor.  It was supposed to go in last round, but
  various issues pushed it to this release instead.

  The pull request contains:

   - Various smaller blk-mq fixes from different folks.  Nothing major
     here, just minor fixes and cleanups.

   - Fix for a memory leak in the error path in the block ioctl code
     from Christian Engelmayer.

   - Header export fix from CaiZhiyong.

   - Finally the immutable biovec changes from Kent Overstreet.  This
     enables some nice future work on making arbitrarily sized bios
     possible, and splitting more efficient.  Related fixes to immutable
     bio_vecs:
        - dm-cache immutable fixup from Mike Snitzer.
        - btrfs immutable fixup from Muthu Kumar.

   - bio-integrity fix from Nic Bellinger, which is also going to stable"

* 'for-3.14/core' of git://git.kernel.dk/linux-block: (44 commits)
  xtensa: fixup simdisk driver to work with immutable bio_vecs
  block/blk-mq-cpu.c: use hotcpu_notifier()
  blk-mq: for_each_* macro correctness
  block: Fix memory leak in rw_copy_check_uvector() handling
  bio-integrity: Fix bio_integrity_verify segment start bug
  block: remove unrelated header files and export symbol
  blk-mq: uses page->list incorrectly
  blk-mq: use __smp_call_function_single directly
  btrfs: fix missing increment of bi_remaining
  Revert "block: Warn and free bio if bi_end_io is not set"
  block: Warn and free bio if bi_end_io is not set
  blk-mq: fix initializing request's start time
  block: blk-mq: don't export blk_mq_free_queue()
  block: blk-mq: make blk_sync_queue support mq
  block: blk-mq: support draining mq queue
  dm cache: increment bi_remaining when bi_end_io is restored
  block: fixup for generic bio chaining
  block: Really silence spurious compiler warnings
  block: Silence spurious compiler warnings
  block: Kill bio_pair_split()
  ...
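The driver conversions repeated throughout the diff below all follow one pattern: call sites stop reading bio->bi_sector/bi_size and stop indexing bio->bi_io_vec directly, and instead go through bio->bi_iter and the bvec_iter-based bio_for_each_segment().  The sketch below is illustrative only and is not taken from this merge; the helper name zero_bio_segments() is made up, while the types and accessors it uses (struct bvec_iter, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_for_each_segment()) are the post-merge block-layer API.

/*
 * Minimal sketch of the immutable-biovec calling convention.
 * bio_for_each_segment() now yields a struct bio_vec by value,
 * driven by a struct bvec_iter, instead of an index into
 * bio->bi_io_vec; sector and size live under bio->bi_iter.
 * The helper name is hypothetical.
 */
#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/printk.h>
#include <linux/string.h>

static void zero_bio_segments(struct bio *bio)
{
	struct bio_vec bv;	/* per-segment copy, not a pointer */
	struct bvec_iter iter;	/* replaces the old integer index */
	void *buf;

	bio_for_each_segment(bv, bio, iter) {
		buf = kmap_atomic(bv.bv_page);
		memset(buf + bv.bv_offset, 0, bv.bv_len);
		kunmap_atomic(buf);
	}

	pr_debug("zeroed %u bytes starting at sector %llu\n",
		 bio->bi_iter.bi_size,
		 (unsigned long long)bio->bi_iter.bi_sector);
}

This is the same shape as the rbd zero_bio_chain() and bcache bio_csum() hunks below, which is why their bio_vec variables change from pointers into bi_io_vec to per-segment copies driven by a bvec_iter.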
--- f568849edac8611d603e00bd6cbbcfea09395ae6 diff --combined block/blk-throttle.c index a760857e6b6,20f82003777..1474c3ab7e7 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@@ -877,14 -877,14 +877,14 @@@ static bool tg_with_in_bps_limit(struc do_div(tmp, HZ); bytes_allowed = tmp; - if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { + if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) { if (wait) *wait = 0; return 1; } /* Calc approx time to dispatch */ - extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed; + extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed; jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); if (!jiffy_wait) @@@ -987,7 -987,7 +987,7 @@@ static void throtl_charge_bio(struct th bool rw = bio_data_dir(bio); /* Charge the bio to the group */ - tg->bytes_disp[rw] += bio->bi_size; + tg->bytes_disp[rw] += bio->bi_iter.bi_size; tg->io_disp[rw]++; /* @@@ -1003,8 -1003,8 +1003,8 @@@ */ if (!(bio->bi_rw & REQ_THROTTLED)) { bio->bi_rw |= REQ_THROTTLED; - throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, - bio->bi_rw); + throtl_update_dispatch_stats(tg_to_blkg(tg), + bio->bi_iter.bi_size, bio->bi_rw); } } @@@ -1303,10 -1303,13 +1303,10 @@@ static u64 tg_prfill_cpu_rwstat(struct return __blkg_prfill_rwstat(sf, pd, &rwstat); } -static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int tg_print_cpu_rwstat(struct seq_file *sf, void *v) { - struct blkcg *blkcg = css_to_blkcg(css); - - blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, - cft->private, true); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat, + &blkcg_policy_throtl, seq_cft(sf)->private, true); return 0; } @@@ -1332,17 -1335,19 +1332,17 @@@ static u64 tg_prfill_conf_uint(struct s return __blkg_prfill_u64(sf, pd, v); } -static int tg_print_conf_u64(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int tg_print_conf_u64(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64, - &blkcg_policy_throtl, cft->private, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64, + &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } -static int tg_print_conf_uint(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int tg_print_conf_uint(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint, - &blkcg_policy_throtl, cft->private, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint, + &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } @@@ -1423,40 -1428,40 +1423,40 @@@ static struct cftype throtl_files[] = { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), - .read_seq_string = tg_print_conf_u64, + .seq_show = tg_print_conf_u64, .write_string = tg_set_conf_u64, .max_write_len = 256, }, { .name = "throttle.write_bps_device", .private = offsetof(struct throtl_grp, bps[WRITE]), - .read_seq_string = tg_print_conf_u64, + .seq_show = tg_print_conf_u64, .write_string = tg_set_conf_u64, .max_write_len = 256, }, { .name = "throttle.read_iops_device", .private = offsetof(struct throtl_grp, iops[READ]), - .read_seq_string = tg_print_conf_uint, + .seq_show = tg_print_conf_uint, .write_string = tg_set_conf_uint, .max_write_len = 256, }, { .name = "throttle.write_iops_device", .private = offsetof(struct throtl_grp, iops[WRITE]), 
- .read_seq_string = tg_print_conf_uint, + .seq_show = tg_print_conf_uint, .write_string = tg_set_conf_uint, .max_write_len = 256, }, { .name = "throttle.io_service_bytes", .private = offsetof(struct tg_stats_cpu, service_bytes), - .read_seq_string = tg_print_cpu_rwstat, + .seq_show = tg_print_cpu_rwstat, }, { .name = "throttle.io_serviced", .private = offsetof(struct tg_stats_cpu, serviced), - .read_seq_string = tg_print_cpu_rwstat, + .seq_show = tg_print_cpu_rwstat, }, { } /* terminate */ }; @@@ -1503,7 -1508,7 +1503,7 @@@ bool blk_throtl_bio(struct request_queu if (tg) { if (!tg->has_rules[rw]) { throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_size, bio->bi_rw); + bio->bi_iter.bi_size, bio->bi_rw); goto out_unlock_rcu; } } @@@ -1559,7 -1564,7 +1559,7 @@@ /* out-of-limit, queue to @tg */ throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", rw == READ ? 'R' : 'W', - tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], + tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw], tg->io_disp[rw], tg->iops[rw], sq->nr_queued[READ], sq->nr_queued[WRITE]); diff --combined drivers/block/rbd.c index 16cab663516,3624368b910..b365e0dfccb --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@@ -41,7 -41,6 +41,7 @@@ #include #include #include +#include #include "rbd_types.h" @@@ -90,9 -89,9 +90,9 @@@ static int atomic_dec_return_safe(atomi } #define RBD_DRV_NAME "rbd" -#define RBD_DRV_NAME_LONG "rbd (rados block device)" -#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ +#define RBD_MINORS_PER_MAJOR 256 +#define RBD_SINGLE_MAJOR_PART_SHIFT 4 #define RBD_SNAP_DEV_NAME_PREFIX "snap_" #define RBD_MAX_SNAP_NAME_LEN \ @@@ -324,7 -323,6 +324,7 @@@ struct rbd_device int dev_id; /* blkdev unique id */ int major; /* blkdev assigned major */ + int minor; struct gendisk *disk; /* blkdev's gendisk and rq */ u32 image_format; /* Either 1 or 2 */ @@@ -388,17 -386,6 +388,17 @@@ static struct kmem_cache *rbd_img_reque static struct kmem_cache *rbd_obj_request_cache; static struct kmem_cache *rbd_segment_name_cache; +static int rbd_major; +static DEFINE_IDA(rbd_dev_id_ida); + +/* + * Default to false for now, as single-major requires >= 0.75 version of + * userspace rbd utility. 
+ */ +static bool single_major = false; +module_param(single_major, bool, S_IRUGO); +MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)"); + static int rbd_img_request_submit(struct rbd_img_request *img_request); static void rbd_dev_device_release(struct device *dev); @@@ -407,52 -394,18 +407,52 @@@ static ssize_t rbd_add(struct bus_type size_t count); static ssize_t rbd_remove(struct bus_type *bus, const char *buf, size_t count); +static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, + size_t count); +static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, + size_t count); static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping); static void rbd_spec_put(struct rbd_spec *spec); +static int rbd_dev_id_to_minor(int dev_id) +{ + return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; +} + +static int minor_to_rbd_dev_id(int minor) +{ + return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; +} + static BUS_ATTR(add, S_IWUSR, NULL, rbd_add); static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove); +static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major); +static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major); static struct attribute *rbd_bus_attrs[] = { &bus_attr_add.attr, &bus_attr_remove.attr, + &bus_attr_add_single_major.attr, + &bus_attr_remove_single_major.attr, NULL, }; -ATTRIBUTE_GROUPS(rbd_bus); + +static umode_t rbd_bus_is_visible(struct kobject *kobj, + struct attribute *attr, int index) +{ + if (!single_major && + (attr == &bus_attr_add_single_major.attr || + attr == &bus_attr_remove_single_major.attr)) + return 0; + + return attr->mode; +} + +static const struct attribute_group rbd_bus_group = { + .attrs = rbd_bus_attrs, + .is_visible = rbd_bus_is_visible, +}; +__ATTRIBUTE_GROUPS(rbd_bus); static struct bus_type rbd_bus_type = { .name = "rbd", @@@ -1088,9 -1041,9 +1088,9 @@@ static const char *rbd_segment_name(str name_format = "%s.%012llx"; if (rbd_dev->image_format == 2) name_format = "%s.%016llx"; - ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, name_format, + ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, rbd_dev->header.object_prefix, segment); - if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { + if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { pr_err("error formatting segment name for #%llu (%d)\n", segment, ret); kfree(name); @@@ -1156,23 -1109,23 +1156,23 @@@ static void bio_chain_put(struct bio *c */ static void zero_bio_chain(struct bio *chain, int start_ofs) { - struct bio_vec *bv; + struct bio_vec bv; + struct bvec_iter iter; unsigned long flags; void *buf; - int i; int pos = 0; while (chain) { - bio_for_each_segment(bv, chain, i) { - if (pos + bv->bv_len > start_ofs) { + bio_for_each_segment(bv, chain, iter) { + if (pos + bv.bv_len > start_ofs) { int remainder = max(start_ofs - pos, 0); - buf = bvec_kmap_irq(bv, &flags); + buf = bvec_kmap_irq(&bv, &flags); memset(buf + remainder, 0, - bv->bv_len - remainder); - flush_dcache_page(bv->bv_page); + bv.bv_len - remainder); + flush_dcache_page(bv.bv_page); bvec_kunmap_irq(buf, &flags); } - pos += bv->bv_len; + pos += bv.bv_len; } chain = chain->bi_next; @@@ -1220,74 -1173,14 +1220,14 @@@ static struct bio *bio_clone_range(stru unsigned int len, gfp_t gfpmask) { - struct bio_vec *bv; - unsigned int resid; - unsigned short idx; - unsigned int voff; - unsigned short end_idx; - unsigned short vcnt; struct bio *bio; - /* Handle the easy case for the caller */ - - if (!offset && len == bio_src->bi_size) - return 
bio_clone(bio_src, gfpmask); - - if (WARN_ON_ONCE(!len)) - return NULL; - if (WARN_ON_ONCE(len > bio_src->bi_size)) - return NULL; - if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) - return NULL; - - /* Find first affected segment... */ - - resid = offset; - bio_for_each_segment(bv, bio_src, idx) { - if (resid < bv->bv_len) - break; - resid -= bv->bv_len; - } - voff = resid; - - /* ...and the last affected segment */ - - resid += len; - __bio_for_each_segment(bv, bio_src, end_idx, idx) { - if (resid <= bv->bv_len) - break; - resid -= bv->bv_len; - } - vcnt = end_idx - idx + 1; - - /* Build the clone */ - - bio = bio_alloc(gfpmask, (unsigned int) vcnt); + bio = bio_clone(bio_src, gfpmask); if (!bio) return NULL; /* ENOMEM */ - bio->bi_bdev = bio_src->bi_bdev; - bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); - bio->bi_rw = bio_src->bi_rw; - bio->bi_flags |= 1 << BIO_CLONED; - - /* - * Copy over our part of the bio_vec, then update the first - * and last (or only) entries. - */ - memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], - vcnt * sizeof (struct bio_vec)); - bio->bi_io_vec[0].bv_offset += voff; - if (vcnt > 1) { - bio->bi_io_vec[0].bv_len -= voff; - bio->bi_io_vec[vcnt - 1].bv_len = resid; - } else { - bio->bi_io_vec[0].bv_len = len; - } - - bio->bi_vcnt = vcnt; - bio->bi_size = len; - bio->bi_idx = 0; + bio_advance(bio, offset); + bio->bi_iter.bi_size = len; return bio; } @@@ -1318,7 -1211,7 +1258,7 @@@ static struct bio *bio_chain_clone_rang /* Build up a chain of clone bios up to the limit */ - if (!bi || off >= bi->bi_size || !len) + if (!bi || off >= bi->bi_iter.bi_size || !len) return NULL; /* Nothing to clone */ end = &chain; @@@ -1330,7 -1223,7 +1270,7 @@@ rbd_warn(NULL, "bio_chain exhausted with %u left", len); goto out_err; /* EINVAL; ran out of bio's */ } - bi_size = min_t(unsigned int, bi->bi_size - off, len); + bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len); bio = bio_clone_range(bi, off, bi_size, gfpmask); if (!bio) goto out_err; /* ENOMEM */ @@@ -1339,7 -1232,7 +1279,7 @@@ end = &bio->bi_next; off += bi_size; - if (off == bi->bi_size) { + if (off == bi->bi_iter.bi_size) { bi = bi->bi_next; off = 0; } @@@ -1808,8 -1701,11 +1748,8 @@@ static struct ceph_osd_request *rbd_osd osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; - osd_req->r_oid_len = strlen(obj_request->object_name); - rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); - memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); - - osd_req->r_file_layout = rbd_dev->layout; /* struct */ + osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); return osd_req; } @@@ -1846,8 -1742,11 +1786,8 @@@ rbd_osd_req_create_copyup(struct rbd_ob osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; - osd_req->r_oid_len = strlen(obj_request->object_name); - rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); - memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); - - osd_req->r_file_layout = rbd_dev->layout; /* struct */ + osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); return osd_req; } @@@ -2227,7 -2126,8 +2167,8 @@@ static int rbd_img_request_fill(struct if (type == OBJ_REQUEST_BIO) { bio_list = data_desc; - rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT); + rbd_assert(img_offset == + bio_list->bi_iter.bi_sector 
<< SECTOR_SHIFT); } else { rbd_assert(type == OBJ_REQUEST_PAGES); pages = data_desc; @@@ -2907,7 -2807,7 +2848,7 @@@ static void rbd_watch_cb(u64 ver, u64 n * Request sync osd watch/unwatch. The value of "start" determines * whether a watch request is being initiated or torn down. */ -static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) +static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; @@@ -2982,22 -2882,6 +2923,22 @@@ out_cancel return ret; } +static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) +{ + return __rbd_dev_header_watch_sync(rbd_dev, true); +} + +static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) +{ + int ret; + + ret = __rbd_dev_header_watch_sync(rbd_dev, false); + if (ret) { + rbd_warn(rbd_dev, "unable to tear down watch request: %d\n", + ret); + } +} + /* * Synchronous osd object method call. Returns the number of bytes * returned in the outbound buffer, or a negative error code. @@@ -3445,18 -3329,14 +3386,18 @@@ static int rbd_init_disk(struct rbd_dev u64 segment_size; /* create gendisk info */ - disk = alloc_disk(RBD_MINORS_PER_MAJOR); + disk = alloc_disk(single_major ? + (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : + RBD_MINORS_PER_MAJOR); if (!disk) return -ENOMEM; snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", rbd_dev->dev_id); disk->major = rbd_dev->major; - disk->first_minor = 0; + disk->first_minor = rbd_dev->minor; + if (single_major) + disk->flags |= GENHD_FL_EXT_DEVT; disk->fops = &rbd_bd_ops; disk->private_data = rbd_dev; @@@ -3528,14 -3408,7 +3469,14 @@@ static ssize_t rbd_major_show(struct de return sprintf(buf, "%d\n", rbd_dev->major); return sprintf(buf, "(none)\n"); +} +static ssize_t rbd_minor_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + return sprintf(buf, "%d\n", rbd_dev->minor); } static ssize_t rbd_client_id_show(struct device *dev, @@@ -3657,7 -3530,6 +3598,7 @@@ static ssize_t rbd_image_refresh(struc static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); +static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL); static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); @@@ -3671,7 -3543,6 +3612,7 @@@ static struct attribute *rbd_attrs[] = &dev_attr_size.attr, &dev_attr_features.attr, &dev_attr_major.attr, + &dev_attr_minor.attr, &dev_attr_client_id.attr, &dev_attr_pool.attr, &dev_attr_pool_id.attr, @@@ -4442,29 -4313,21 +4383,29 @@@ static void rbd_bus_del_dev(struct rbd_ device_unregister(&rbd_dev->dev); } -static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); - /* * Get a unique rbd identifier for the given new rbd_dev, and add - * the rbd_dev to the global list. The minimum rbd id is 1. + * the rbd_dev to the global list. 
*/ -static void rbd_dev_id_get(struct rbd_device *rbd_dev) +static int rbd_dev_id_get(struct rbd_device *rbd_dev) { - rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); + int new_dev_id; + + new_dev_id = ida_simple_get(&rbd_dev_id_ida, + 0, minor_to_rbd_dev_id(1 << MINORBITS), + GFP_KERNEL); + if (new_dev_id < 0) + return new_dev_id; + + rbd_dev->dev_id = new_dev_id; spin_lock(&rbd_dev_list_lock); list_add_tail(&rbd_dev->node, &rbd_dev_list); spin_unlock(&rbd_dev_list_lock); - dout("rbd_dev %p given dev id %llu\n", rbd_dev, - (unsigned long long) rbd_dev->dev_id); + + dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id); + + return 0; } /* @@@ -4473,13 -4336,49 +4414,13 @@@ */ static void rbd_dev_id_put(struct rbd_device *rbd_dev) { - struct list_head *tmp; - int rbd_id = rbd_dev->dev_id; - int max_id; - - rbd_assert(rbd_id > 0); - - dout("rbd_dev %p released dev id %llu\n", rbd_dev, - (unsigned long long) rbd_dev->dev_id); spin_lock(&rbd_dev_list_lock); list_del_init(&rbd_dev->node); - - /* - * If the id being "put" is not the current maximum, there - * is nothing special we need to do. - */ - if (rbd_id != atomic64_read(&rbd_dev_id_max)) { - spin_unlock(&rbd_dev_list_lock); - return; - } - - /* - * We need to update the current maximum id. Search the - * list to find out what it is. We're more likely to find - * the maximum at the end, so search the list backward. - */ - max_id = 0; - list_for_each_prev(tmp, &rbd_dev_list) { - struct rbd_device *rbd_dev; - - rbd_dev = list_entry(tmp, struct rbd_device, node); - if (rbd_dev->dev_id > max_id) - max_id = rbd_dev->dev_id; - } spin_unlock(&rbd_dev_list_lock); - /* - * The max id could have been updated by rbd_dev_id_get(), in - * which case it now accurately reflects the new maximum. - * Be careful not to overwrite the maximum value in that - * case. - */ - atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); - dout(" max dev id has been reset\n"); + ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); + + dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id); } /* @@@ -4902,29 -4801,20 +4843,29 @@@ static int rbd_dev_device_setup(struct { int ret; - /* generate unique id: find highest unique id, add one */ - rbd_dev_id_get(rbd_dev); + /* Get an id and fill in device name. */ + + ret = rbd_dev_id_get(rbd_dev); + if (ret) + return ret; - /* Fill in the device name, now that we have its id. */ BUILD_BUG_ON(DEV_NAME_LEN < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); - /* Get our block major device number. */ + /* Record our major and minor device numbers. */ - ret = register_blkdev(0, rbd_dev->name); - if (ret < 0) - goto err_out_id; - rbd_dev->major = ret; + if (!single_major) { + ret = register_blkdev(0, rbd_dev->name); + if (ret < 0) + goto err_out_id; + + rbd_dev->major = ret; + rbd_dev->minor = 0; + } else { + rbd_dev->major = rbd_major; + rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); + } /* Set up the blkdev mapping. */ @@@ -4956,8 -4846,7 +4897,8 @@@ err_out_mapping err_out_disk: rbd_free_disk(rbd_dev); err_out_blkdev: - unregister_blkdev(rbd_dev->major, rbd_dev->name); + if (!single_major) + unregister_blkdev(rbd_dev->major, rbd_dev->name); err_out_id: rbd_dev_id_put(rbd_dev); rbd_dev_mapping_clear(rbd_dev); @@@ -5013,6 -4902,7 +4954,6 @@@ static void rbd_dev_image_release(struc static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) { int ret; - int tmp; /* * Get the id from the image id object. 
Unless there's an @@@ -5031,7 -4921,7 +4972,7 @@@ goto err_out_format; if (mapping) { - ret = rbd_dev_header_watch_sync(rbd_dev, true); + ret = rbd_dev_header_watch_sync(rbd_dev); if (ret) goto out_header_name; } @@@ -5058,8 -4948,12 +4999,8 @@@ err_out_probe: rbd_dev_unprobe(rbd_dev); err_out_watch: - if (mapping) { - tmp = rbd_dev_header_watch_sync(rbd_dev, false); - if (tmp) - rbd_warn(rbd_dev, "unable to tear down " - "watch request (%d)\n", tmp); - } + if (mapping) + rbd_dev_header_unwatch_sync(rbd_dev); out_header_name: kfree(rbd_dev->header_name); rbd_dev->header_name = NULL; @@@ -5073,9 -4967,9 +5014,9 @@@ err_out_format return ret; } -static ssize_t rbd_add(struct bus_type *bus, - const char *buf, - size_t count) +static ssize_t do_rbd_add(struct bus_type *bus, + const char *buf, + size_t count) { struct rbd_device *rbd_dev = NULL; struct ceph_options *ceph_opts = NULL; @@@ -5137,12 -5031,6 +5078,12 @@@ rc = rbd_dev_device_setup(rbd_dev); if (rc) { + /* + * rbd_dev_header_unwatch_sync() can't be moved into + * rbd_dev_image_release() without refactoring, see + * commit 1f3ef78861ac. + */ + rbd_dev_header_unwatch_sync(rbd_dev); rbd_dev_image_release(rbd_dev); goto err_out_module; } @@@ -5163,23 -5051,6 +5104,23 @@@ err_out_module return (ssize_t)rc; } +static ssize_t rbd_add(struct bus_type *bus, + const char *buf, + size_t count) +{ + if (single_major) + return -EINVAL; + + return do_rbd_add(bus, buf, count); +} + +static ssize_t rbd_add_single_major(struct bus_type *bus, + const char *buf, + size_t count) +{ + return do_rbd_add(bus, buf, count); +} + static void rbd_dev_device_release(struct device *dev) { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); @@@ -5187,8 -5058,8 +5128,8 @@@ rbd_free_disk(rbd_dev); clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); rbd_dev_mapping_clear(rbd_dev); - unregister_blkdev(rbd_dev->major, rbd_dev->name); - rbd_dev->major = 0; + if (!single_major) + unregister_blkdev(rbd_dev->major, rbd_dev->name); rbd_dev_id_put(rbd_dev); rbd_dev_mapping_clear(rbd_dev); } @@@ -5219,9 -5090,9 +5160,9 @@@ static void rbd_dev_remove_parent(struc } } -static ssize_t rbd_remove(struct bus_type *bus, - const char *buf, - size_t count) +static ssize_t do_rbd_remove(struct bus_type *bus, + const char *buf, + size_t count) { struct rbd_device *rbd_dev = NULL; struct list_head *tmp; @@@ -5261,14 -5132,16 +5202,14 @@@ if (ret < 0 || already) return ret; - ret = rbd_dev_header_watch_sync(rbd_dev, false); - if (ret) - rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); - + rbd_dev_header_unwatch_sync(rbd_dev); /* * flush remaining watch callbacks - these must be complete * before the osd_client is shutdown */ dout("%s: flushing notifies", __func__); ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); + /* * Don't free anything from rbd_dev->disk until after all * notifies are completely processed. Otherwise @@@ -5282,23 -5155,6 +5223,23 @@@ return count; } +static ssize_t rbd_remove(struct bus_type *bus, + const char *buf, + size_t count) +{ + if (single_major) + return -EINVAL; + + return do_rbd_remove(bus, buf, count); +} + +static ssize_t rbd_remove_single_major(struct bus_type *bus, + const char *buf, + size_t count) +{ + return do_rbd_remove(bus, buf, count); +} + /* * create control files in sysfs * /sys/bus/rbd/... 
@@@ -5344,7 -5200,7 +5285,7 @@@ static int rbd_slab_init(void rbd_assert(!rbd_segment_name_cache); rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", - MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL); + CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL); if (rbd_segment_name_cache) return 0; out_err: @@@ -5380,45 -5236,24 +5321,45 @@@ static int __init rbd_init(void if (!libceph_compatible(NULL)) { rbd_warn(NULL, "libceph incompatibility (quitting)"); - return -EINVAL; } + rc = rbd_slab_init(); if (rc) return rc; + + if (single_major) { + rbd_major = register_blkdev(0, RBD_DRV_NAME); + if (rbd_major < 0) { + rc = rbd_major; + goto err_out_slab; + } + } + rc = rbd_sysfs_init(); if (rc) - rbd_slab_exit(); + goto err_out_blkdev; + + if (single_major) + pr_info("loaded (major %d)\n", rbd_major); else - pr_info("loaded " RBD_DRV_NAME_LONG "\n"); + pr_info("loaded\n"); + + return 0; +err_out_blkdev: + if (single_major) + unregister_blkdev(rbd_major, RBD_DRV_NAME); +err_out_slab: + rbd_slab_exit(); return rc; } static void __exit rbd_exit(void) { rbd_sysfs_cleanup(); + if (single_major) + unregister_blkdev(rbd_major, RBD_DRV_NAME); rbd_slab_exit(); } @@@ -5428,8 -5263,9 +5369,8 @@@ module_exit(rbd_exit) MODULE_AUTHOR("Alex Elder "); MODULE_AUTHOR("Sage Weil "); MODULE_AUTHOR("Yehuda Sadeh "); -MODULE_DESCRIPTION("rados block device"); - /* following authorship retained from original osdblk.c */ MODULE_AUTHOR("Jeff Garzik "); +MODULE_DESCRIPTION("RADOS Block Device (RBD) driver"); MODULE_LICENSE("GPL"); diff --combined drivers/block/xen-blkfront.c index f9c43f91f03,26ad7923e33..8dcfb54f160 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@@ -1356,7 -1356,7 +1356,7 @@@ static int blkfront_probe(struct xenbus char *type; int len; /* no unplug has been done: do not hook devices != xen vbds */ - if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) { + if (xen_has_pv_and_legacy_disk_devices()) { int major; if (!VDEV_IS_EXTENDED(vdevice)) @@@ -1547,7 -1547,7 +1547,7 @@@ static int blkif_recover(struct blkfron for (i = 0; i < pending; i++) { offset = (i * segs * PAGE_SIZE) >> 9; size = min((unsigned int)(segs * PAGE_SIZE) >> 9, - (unsigned int)(bio->bi_size >> 9) - offset); + (unsigned int)bio_sectors(bio) - offset); cloned_bio = bio_clone(bio, GFP_NOIO); BUG_ON(cloned_bio == NULL); bio_trim(cloned_bio, offset, size); @@@ -2079,7 -2079,7 +2079,7 @@@ static int __init xlblk_init(void if (!xen_domain()) return -ENODEV; - if (xen_hvm_domain() && !xen_platform_pci_unplug) + if (!xen_has_pv_disk_devices()) return -ENODEV; if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) { diff --combined drivers/md/bcache/request.c index 61bcfc21d2a,5878cdb3952..c906571997d --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@@ -163,6 -163,7 +163,6 @@@ static struct cgroup_subsys_state *bcac static void bcachecg_destroy(struct cgroup *cgroup) { struct bch_cgroup *cg = cgroup_to_bcache(cgroup); - free_css_id(&bcache_subsys, &cg->css); kfree(cg); } @@@ -197,14 -198,14 +197,14 @@@ static bool verify(struct cached_dev *d static void bio_csum(struct bio *bio, struct bkey *k) { - struct bio_vec *bv; + struct bio_vec bv; + struct bvec_iter iter; uint64_t csum = 0; - int i; - bio_for_each_segment(bv, bio, i) { - void *d = kmap(bv->bv_page) + bv->bv_offset; - csum = bch_crc64_update(csum, d, bv->bv_len); - kunmap(bv->bv_page); + bio_for_each_segment(bv, bio, iter) { + void *d = kmap(bv.bv_page) + bv.bv_offset; + csum = bch_crc64_update(csum, d, bv.bv_len); + kunmap(bv.bv_page); } k->ptr[KEY_PTRS(k)] = csum & 
(~0ULL >> 1); @@@ -260,7 -261,7 +260,7 @@@ static void bch_data_invalidate(struct struct bio *bio = op->bio; pr_debug("invalidating %i sectors from %llu", - bio_sectors(bio), (uint64_t) bio->bi_sector); + bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector); while (bio_sectors(bio)) { unsigned sectors = min(bio_sectors(bio), @@@ -269,11 -270,11 +269,11 @@@ if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) goto out; - bio->bi_sector += sectors; - bio->bi_size -= sectors << 9; + bio->bi_iter.bi_sector += sectors; + bio->bi_iter.bi_size -= sectors << 9; bch_keylist_add(&op->insert_keys, - &KEY(op->inode, bio->bi_sector, sectors)); + &KEY(op->inode, bio->bi_iter.bi_sector, sectors)); } op->insert_data_done = true; @@@ -363,14 -364,14 +363,14 @@@ static void bch_data_insert_start(struc k = op->insert_keys.top; bkey_init(k); SET_KEY_INODE(k, op->inode); - SET_KEY_OFFSET(k, bio->bi_sector); + SET_KEY_OFFSET(k, bio->bi_iter.bi_sector); if (!bch_alloc_sectors(op->c, k, bio_sectors(bio), op->write_point, op->write_prio, op->writeback)) goto err; - n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); + n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split); n->bi_end_io = bch_data_insert_endio; n->bi_private = cl; @@@ -521,7 -522,7 +521,7 @@@ static bool check_should_bypass(struct (bio->bi_rw & REQ_WRITE))) goto skip; - if (bio->bi_sector & (c->sb.block_size - 1) || + if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || bio_sectors(bio) & (c->sb.block_size - 1)) { pr_debug("skipping unaligned io"); goto skip; @@@ -545,8 -546,8 +545,8 @@@ spin_lock(&dc->io_lock); - hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) - if (i->last == bio->bi_sector && + hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash) + if (i->last == bio->bi_iter.bi_sector && time_before(jiffies, i->jiffies)) goto found; @@@ -555,8 -556,8 +555,8 @@@ add_sequential(task); i->sequential = 0; found: - if (i->sequential + bio->bi_size > i->sequential) - i->sequential += bio->bi_size; + if (i->sequential + bio->bi_iter.bi_size > i->sequential) + i->sequential += bio->bi_iter.bi_size; i->last = bio_end_sector(bio); i->jiffies = jiffies + msecs_to_jiffies(5000); @@@ -605,7 -606,6 +605,6 @@@ struct search unsigned insert_bio_sectors; unsigned recoverable:1; - unsigned unaligned_bvec:1; unsigned write:1; unsigned read_dirty_data:1; @@@ -649,15 -649,15 +648,15 @@@ static int cache_lookup_fn(struct btree struct bkey *bio_key; unsigned ptr; - if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_sector, 0)) <= 0) + if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0) return MAP_CONTINUE; if (KEY_INODE(k) != s->iop.inode || - KEY_START(k) > bio->bi_sector) { + KEY_START(k) > bio->bi_iter.bi_sector) { unsigned bio_sectors = bio_sectors(bio); unsigned sectors = KEY_INODE(k) == s->iop.inode ? 
min_t(uint64_t, INT_MAX, - KEY_START(k) - bio->bi_sector) + KEY_START(k) - bio->bi_iter.bi_sector) : INT_MAX; int ret = s->d->cache_miss(b, s, bio, sectors); @@@ -679,14 -679,14 +678,14 @@@ if (KEY_DIRTY(k)) s->read_dirty_data = true; - n = bch_bio_split(bio, min_t(uint64_t, INT_MAX, - KEY_OFFSET(k) - bio->bi_sector), - GFP_NOIO, s->d->bio_split); + n = bio_next_split(bio, min_t(uint64_t, INT_MAX, + KEY_OFFSET(k) - bio->bi_iter.bi_sector), + GFP_NOIO, s->d->bio_split); bio_key = &container_of(n, struct bbio, bio)->key; bch_bkey_copy_single_ptr(bio_key, k, ptr); - bch_cut_front(&KEY(s->iop.inode, n->bi_sector, 0), bio_key); + bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key); bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key); n->bi_end_io = bch_cache_read_endio; @@@ -713,7 -713,7 +712,7 @@@ static void cache_lookup(struct closur struct bio *bio = &s->bio.bio; int ret = bch_btree_map_keys(&s->op, s->iop.c, - &KEY(s->iop.inode, bio->bi_sector, 0), + &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), cache_lookup_fn, MAP_END_KEY); if (ret == -EAGAIN) continue_at(cl, cache_lookup, bcache_wq); @@@ -758,10 -758,12 +757,12 @@@ static void bio_complete(struct search static void do_bio_hook(struct search *s) { struct bio *bio = &s->bio.bio; - memcpy(bio, s->orig_bio, sizeof(struct bio)); + bio_init(bio); + __bio_clone_fast(bio, s->orig_bio); bio->bi_end_io = request_endio; bio->bi_private = &s->cl; + atomic_set(&bio->bi_cnt, 3); } @@@ -773,9 -775,6 +774,6 @@@ static void search_free(struct closure if (s->iop.bio) bio_put(s->iop.bio); - if (s->unaligned_bvec) - mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); - closure_debug_destroy(cl); mempool_free(s, s->d->c->search); } @@@ -783,7 -782,6 +781,6 @@@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) { struct search *s; - struct bio_vec *bv; s = mempool_alloc(d->c->search, GFP_NOIO); memset(s, 0, offsetof(struct search, iop.insert_keys)); @@@ -802,15 -800,6 +799,6 @@@ s->start_time = jiffies; do_bio_hook(s); - if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) { - bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO); - memcpy(bv, bio_iovec(bio), - sizeof(struct bio_vec) * bio_segments(bio)); - - s->bio.bio.bi_io_vec = bv; - s->unaligned_bvec = 1; - } - return s; } @@@ -849,26 -838,13 +837,13 @@@ static void cached_dev_read_error(struc { struct search *s = container_of(cl, struct search, cl); struct bio *bio = &s->bio.bio; - struct bio_vec *bv; - int i; if (s->recoverable) { /* Retry from the backing device: */ trace_bcache_read_retry(s->orig_bio); s->iop.error = 0; - bv = s->bio.bio.bi_io_vec; do_bio_hook(s); - s->bio.bio.bi_io_vec = bv; - - if (!s->unaligned_bvec) - bio_for_each_segment(bv, s->orig_bio, i) - bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; - else - memcpy(s->bio.bio.bi_io_vec, - bio_iovec(s->orig_bio), - sizeof(struct bio_vec) * - bio_segments(s->orig_bio)); /* XXX: invalidate cache */ @@@ -893,9 -869,9 +868,9 @@@ static void cached_dev_read_done(struc if (s->iop.bio) { bio_reset(s->iop.bio); - s->iop.bio->bi_sector = s->cache_miss->bi_sector; + s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector; s->iop.bio->bi_bdev = s->cache_miss->bi_bdev; - s->iop.bio->bi_size = s->insert_bio_sectors << 9; + s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9; bch_bio_map(s->iop.bio, NULL); bio_copy_data(s->cache_miss, s->iop.bio); @@@ -904,8 -880,7 +879,7 @@@ s->cache_miss = NULL; } - if (verify(dc, &s->bio.bio) && s->recoverable && - !s->unaligned_bvec && 
!s->read_dirty_data) + if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data) bch_data_verify(dc, s->orig_bio); bio_complete(s); @@@ -945,7 -920,7 +919,7 @@@ static int cached_dev_cache_miss(struc struct bio *miss, *cache_bio; if (s->cache_miss || s->iop.bypass) { - miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); + miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); ret = miss == bio ? MAP_DONE : MAP_CONTINUE; goto out_submit; } @@@ -959,7 -934,7 +933,7 @@@ s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); s->iop.replace_key = KEY(s->iop.inode, - bio->bi_sector + s->insert_bio_sectors, + bio->bi_iter.bi_sector + s->insert_bio_sectors, s->insert_bio_sectors); ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key); @@@ -968,7 -943,7 +942,7 @@@ s->iop.replace = true; - miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); + miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); /* btree_search_recurse()'s btree iterator is no good anymore */ ret = miss == bio ? MAP_DONE : -EINTR; @@@ -979,9 -954,9 +953,9 @@@ if (!cache_bio) goto out_submit; - cache_bio->bi_sector = miss->bi_sector; - cache_bio->bi_bdev = miss->bi_bdev; - cache_bio->bi_size = s->insert_bio_sectors << 9; + cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector; + cache_bio->bi_bdev = miss->bi_bdev; + cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; cache_bio->bi_end_io = request_endio; cache_bio->bi_private = &s->cl; @@@ -1031,7 -1006,7 +1005,7 @@@ static void cached_dev_write(struct cac { struct closure *cl = &s->cl; struct bio *bio = &s->bio.bio; - struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0); + struct bkey start = KEY(dc->disk.id, bio->bi_iter.bi_sector, 0); struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0); bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end); @@@ -1087,8 -1062,7 +1061,7 @@@ closure_bio_submit(flush, cl, s->d); } } else { - s->iop.bio = bio_clone_bioset(bio, GFP_NOIO, - dc->disk.bio_split); + s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); closure_bio_submit(bio, cl, s->d); } @@@ -1126,13 -1100,13 +1099,13 @@@ static void cached_dev_make_request(str part_stat_unlock(); bio->bi_bdev = dc->bdev; - bio->bi_sector += dc->sb.data_offset; + bio->bi_iter.bi_sector += dc->sb.data_offset; if (cached_dev_get(dc)) { s = search_alloc(bio, d); trace_bcache_request_start(s->d, bio); - if (!bio->bi_size) { + if (!bio->bi_iter.bi_size) { /* * can't call bch_journal_meta from under * generic_make_request @@@ -1204,24 -1178,24 +1177,24 @@@ void bch_cached_dev_request_init(struc static int flash_dev_cache_miss(struct btree *b, struct search *s, struct bio *bio, unsigned sectors) { - struct bio_vec *bv; - int i; + struct bio_vec bv; + struct bvec_iter iter; /* Zero fill bio */ - bio_for_each_segment(bv, bio, i) { - unsigned j = min(bv->bv_len >> 9, sectors); + bio_for_each_segment(bv, bio, iter) { + unsigned j = min(bv.bv_len >> 9, sectors); - void *p = kmap(bv->bv_page); - memset(p + bv->bv_offset, 0, j << 9); - kunmap(bv->bv_page); + void *p = kmap(bv.bv_page); + memset(p + bv.bv_offset, 0, j << 9); + kunmap(bv.bv_page); sectors -= j; } - bio_advance(bio, min(sectors << 9, bio->bi_size)); + bio_advance(bio, min(sectors << 9, bio->bi_iter.bi_size)); - if (!bio->bi_size) + if (!bio->bi_iter.bi_size) return MAP_DONE; return MAP_CONTINUE; @@@ -1255,7 -1229,7 +1228,7 @@@ static void flash_dev_make_request(stru trace_bcache_request_start(s->d, bio); - if (!bio->bi_size) { + if 
(!bio->bi_iter.bi_size) { /* * can't call bch_journal_meta from under * generic_make_request @@@ -1265,7 -1239,7 +1238,7 @@@ bcache_wq); } else if (rw) { bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, - &KEY(d->id, bio->bi_sector, 0), + &KEY(d->id, bio->bi_iter.bi_sector, 0), &KEY(d->id, bio_end_sector(bio), 0)); s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0; diff --combined drivers/md/dm-bufio.c index 9ed42125514,a1b58a65d8e..66c5d130c8c --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@@ -104,8 -104,6 +104,8 @@@ struct dm_bufio_client struct list_head reserved_buffers; unsigned need_reserved_buffers; + unsigned minimum_buffers; + struct hlist_head *cache_hash; wait_queue_head_t free_buffer_wait; @@@ -540,7 -538,7 +540,7 @@@ static void use_inline_bio(struct dm_bu bio_init(&b->bio); b->bio.bi_io_vec = b->bio_vec; b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS; - b->bio.bi_sector = block << b->c->sectors_per_block_bits; + b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits; b->bio.bi_bdev = b->c->bdev; b->bio.bi_end_io = end_io; @@@ -863,8 -861,8 +863,8 @@@ static void __get_memory_limit(struct d buffers = dm_bufio_cache_size_per_client >> (c->sectors_per_block_bits + SECTOR_SHIFT); - if (buffers < DM_BUFIO_MIN_BUFFERS) - buffers = DM_BUFIO_MIN_BUFFERS; + if (buffers < c->minimum_buffers) + buffers = c->minimum_buffers; *limit_buffers = buffers; *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100; @@@ -1352,34 -1350,6 +1352,34 @@@ retry } EXPORT_SYMBOL_GPL(dm_bufio_release_move); +/* + * Free the given buffer. + * + * This is just a hint, if the buffer is in use or dirty, this function + * does nothing. + */ +void dm_bufio_forget(struct dm_bufio_client *c, sector_t block) +{ + struct dm_buffer *b; + + dm_bufio_lock(c); + + b = __find(c, block); + if (b && likely(!b->hold_count) && likely(!b->state)) { + __unlink_buffer(b); + __free_buffer_wake(b); + } + + dm_bufio_unlock(c); +} +EXPORT_SYMBOL(dm_bufio_forget); + +void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n) +{ + c->minimum_buffers = n; +} +EXPORT_SYMBOL(dm_bufio_set_minimum_buffers); + unsigned dm_bufio_get_block_size(struct dm_bufio_client *c) { return c->block_size; @@@ -1576,8 -1546,6 +1576,8 @@@ struct dm_bufio_client *dm_bufio_client INIT_LIST_HEAD(&c->reserved_buffers); c->need_reserved_buffers = reserved_buffers; + c->minimum_buffers = DM_BUFIO_MIN_BUFFERS; + init_waitqueue_head(&c->free_buffer_wait); c->async_write_error = 0; diff --combined drivers/md/dm-cache-policy-mq.c index 930e8c3d73e,d13a16865d0..1e018e98661 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@@ -72,7 -72,7 +72,7 @@@ static enum io_pattern iot_pattern(stru static void iot_update_stats(struct io_tracker *t, struct bio *bio) { - if (bio->bi_sector == from_oblock(t->last_end_oblock) + 1) + if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1) t->nr_seq_samples++; else { /* @@@ -87,7 -87,7 +87,7 @@@ t->nr_rand_samples++; } - t->last_end_oblock = to_oblock(bio->bi_sector + bio_sectors(bio) - 1); + t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1); } static void iot_check_for_pattern_switch(struct io_tracker *t) @@@ -287,8 -287,9 +287,8 @@@ static struct entry *alloc_entry(struc static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock) { struct entry *e = ep->entries + from_cblock(cblock); - list_del(&e->list); - INIT_LIST_HEAD(&e->list); + list_del_init(&e->list); INIT_HLIST_NODE(&e->hlist); ep->nr_allocated++; @@@ -390,10 
-391,6 +390,10 @@@ struct mq_policy */ unsigned promote_threshold; + unsigned discard_promote_adjustment; + unsigned read_promote_adjustment; + unsigned write_promote_adjustment; + /* * The hash table allows us to quickly find an entry by origin * block. Both pre_cache and cache entries are in here. @@@ -403,10 -400,6 +403,10 @@@ struct hlist_head *table; }; +#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1 +#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4 +#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8 + /*----------------------------------------------------------------*/ /* @@@ -649,21 -642,25 +649,21 @@@ static int demote_cblock(struct mq_poli * We bias towards reads, since they can be demoted at no cost if they * haven't been dirtied. */ -#define DISCARDED_PROMOTE_THRESHOLD 1 -#define READ_PROMOTE_THRESHOLD 4 -#define WRITE_PROMOTE_THRESHOLD 8 - static unsigned adjusted_promote_threshold(struct mq_policy *mq, bool discarded_oblock, int data_dir) { if (data_dir == READ) - return mq->promote_threshold + READ_PROMOTE_THRESHOLD; + return mq->promote_threshold + mq->read_promote_adjustment; if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { /* * We don't need to do any copying at all, so give this a * very low threshold. */ - return DISCARDED_PROMOTE_THRESHOLD; + return mq->discard_promote_adjustment; } - return mq->promote_threshold + WRITE_PROMOTE_THRESHOLD; + return mq->promote_threshold + mq->write_promote_adjustment; } static bool should_promote(struct mq_policy *mq, struct entry *e, @@@ -812,7 -809,7 +812,7 @@@ static int no_entry_found(struct mq_pol bool can_migrate, bool discarded_oblock, int data_dir, struct policy_result *result) { - if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) == 1) { + if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) { if (can_migrate) insert_in_cache(mq, oblock, result); else @@@ -1138,28 -1135,20 +1138,28 @@@ static int mq_set_config_value(struct d const char *key, const char *value) { struct mq_policy *mq = to_mq_policy(p); - enum io_pattern pattern; unsigned long tmp; - if (!strcasecmp(key, "random_threshold")) - pattern = PATTERN_RANDOM; - else if (!strcasecmp(key, "sequential_threshold")) - pattern = PATTERN_SEQUENTIAL; - else - return -EINVAL; - if (kstrtoul(value, 10, &tmp)) return -EINVAL; - mq->tracker.thresholds[pattern] = tmp; + if (!strcasecmp(key, "random_threshold")) { + mq->tracker.thresholds[PATTERN_RANDOM] = tmp; + + } else if (!strcasecmp(key, "sequential_threshold")) { + mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp; + + } else if (!strcasecmp(key, "discard_promote_adjustment")) + mq->discard_promote_adjustment = tmp; + + else if (!strcasecmp(key, "read_promote_adjustment")) + mq->read_promote_adjustment = tmp; + + else if (!strcasecmp(key, "write_promote_adjustment")) + mq->write_promote_adjustment = tmp; + + else + return -EINVAL; return 0; } @@@ -1169,16 -1158,9 +1169,16 @@@ static int mq_emit_config_values(struc ssize_t sz = 0; struct mq_policy *mq = to_mq_policy(p); - DMEMIT("4 random_threshold %u sequential_threshold %u", + DMEMIT("10 random_threshold %u " + "sequential_threshold %u " + "discard_promote_adjustment %u " + "read_promote_adjustment %u " + "write_promote_adjustment %u", mq->tracker.thresholds[PATTERN_RANDOM], - mq->tracker.thresholds[PATTERN_SEQUENTIAL]); + mq->tracker.thresholds[PATTERN_SEQUENTIAL], + mq->discard_promote_adjustment, + mq->read_promote_adjustment, + mq->write_promote_adjustment); return 0; } @@@ -1231,9 -1213,6 +1231,9 @@@ static struct dm_cache_policy 
*mq_creat mq->hit_count = 0; mq->generation = 0; mq->promote_threshold = 0; + mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT; + mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT; + mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT; mutex_init(&mq->lock); spin_lock_init(&mq->tick_lock); @@@ -1265,7 -1244,7 +1265,7 @@@ bad_pre_cache_init static struct dm_cache_policy_type mq_policy_type = { .name = "mq", - .version = {1, 1, 0}, + .version = {1, 2, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create @@@ -1273,11 -1252,10 +1273,11 @@@ static struct dm_cache_policy_type default_policy_type = { .name = "default", - .version = {1, 1, 0}, + .version = {1, 2, 0}, .hint_size = 4, .owner = THIS_MODULE, - .create = mq_create + .create = mq_create, + .real = &mq_policy_type }; static int __init mq_init(void) diff --combined drivers/md/dm-cache-target.c index 09334c275c7,99f91628a33..ffd472e015c --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@@ -85,6 -85,12 +85,12 @@@ static void dm_unhook_bio(struct dm_hoo { bio->bi_end_io = h->bi_end_io; bio->bi_private = h->bi_private; + + /* + * Must bump bi_remaining to allow bio to complete with + * restored bi_end_io. + */ + atomic_inc(&bio->bi_remaining); } /*----------------------------------------------------------------*/ @@@ -664,15 -670,17 +670,17 @@@ static void remap_to_origin(struct cach static void remap_to_cache(struct cache *cache, struct bio *bio, dm_cblock_t cblock) { - sector_t bi_sector = bio->bi_sector; + sector_t bi_sector = bio->bi_iter.bi_sector; bio->bi_bdev = cache->cache_dev->bdev; if (!block_size_is_power_of_two(cache)) - bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) + - sector_div(bi_sector, cache->sectors_per_block); + bio->bi_iter.bi_sector = + (from_cblock(cblock) * cache->sectors_per_block) + + sector_div(bi_sector, cache->sectors_per_block); else - bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) | - (bi_sector & (cache->sectors_per_block - 1)); + bio->bi_iter.bi_sector = + (from_cblock(cblock) << cache->sectors_per_block_shift) | + (bi_sector & (cache->sectors_per_block - 1)); } static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) @@@ -712,7 -720,7 +720,7 @@@ static void remap_to_cache_dirty(struc static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) { - sector_t block_nr = bio->bi_sector; + sector_t block_nr = bio->bi_iter.bi_sector; if (!block_size_is_power_of_two(cache)) (void) sector_div(block_nr, cache->sectors_per_block); @@@ -1027,7 -1035,7 +1035,7 @@@ static void issue_overwrite(struct dm_c static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) { return (bio_data_dir(bio) == WRITE) && - (bio->bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); + (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); } static void avoid_copy(struct dm_cache_migration *mg) @@@ -1252,7 -1260,7 +1260,7 @@@ static void process_flush_bio(struct ca size_t pb_data_size = get_per_bio_data_size(cache); struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - BUG_ON(bio->bi_size); + BUG_ON(bio->bi_iter.bi_size); if (!pb->req_nr) remap_to_origin(cache, bio); else @@@ -1275,9 -1283,9 +1283,9 @@@ */ static void process_discard_bio(struct cache *cache, struct bio *bio) { - dm_block_t start_block = dm_sector_div_up(bio->bi_sector, + dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector, cache->discard_block_size); - dm_block_t end_block = 
bio->bi_sector + bio_sectors(bio); + dm_block_t end_block = bio_end_sector(bio); dm_block_t b; end_block = block_div(end_block, cache->discard_block_size); @@@ -2826,13 -2834,12 +2834,13 @@@ static void cache_resume(struct dm_targ /* * Status format: * - * <#used metadata blocks>/<#total metadata blocks> + * <#used metadata blocks>/<#total metadata blocks> + * <#used cache blocks>/<#total cache blocks> * <#read hits> <#read misses> <#write hits> <#write misses> - * <#demotions> <#promotions> <#blocks in cache> <#dirty> + * <#demotions> <#promotions> <#dirty> * <#features> * * <#core args> - * <#policy args> * + * <#policy args> * */ static void cache_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) @@@ -2870,20 -2877,17 +2878,20 @@@ residency = policy_residency(cache->policy); - DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ", + DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %llu ", + (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT), (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), (unsigned long long)nr_blocks_metadata, + cache->sectors_per_block, + (unsigned long long) from_cblock(residency), + (unsigned long long) from_cblock(cache->cache_size), (unsigned) atomic_read(&cache->stats.read_hit), (unsigned) atomic_read(&cache->stats.read_miss), (unsigned) atomic_read(&cache->stats.write_hit), (unsigned) atomic_read(&cache->stats.write_miss), (unsigned) atomic_read(&cache->stats.demotion), (unsigned) atomic_read(&cache->stats.promotion), - (unsigned long long) from_cblock(residency), - cache->nr_dirty); + (unsigned long long) from_cblock(cache->nr_dirty)); if (writethrough_mode(&cache->features)) DMEMIT("1 writethrough "); @@@ -2900,8 -2904,6 +2908,8 @@@ } DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); + + DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); if (sz < maxlen) { r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz); if (r) @@@ -3135,7 -3137,7 +3143,7 @@@ static void cache_io_hints(struct dm_ta static struct target_type cache_target = { .name = "cache", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --combined drivers/md/dm-delay.c index a8a511c053a,fc8482a65dd..42c3a27a14c --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@@ -24,6 -24,7 +24,6 @@@ struct delay_c struct work_struct flush_expired_bios; struct list_head delayed_bios; atomic_t may_delay; - mempool_t *delayed_pool; struct dm_dev *dev_read; sector_t start_read; @@@ -39,11 -40,14 +39,11 @@@ struct dm_delay_info { struct delay_c *context; struct list_head list; - struct bio *bio; unsigned long expires; }; static DEFINE_MUTEX(delayed_bios_lock); -static struct kmem_cache *delayed_cache; - static void handle_delayed_timer(unsigned long data) { struct delay_c *dc = (struct delay_c *)data; @@@ -83,14 -87,13 +83,14 @@@ static struct bio *flush_delayed_bios(s mutex_lock(&delayed_bios_lock); list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { if (flush_all || time_after_eq(jiffies, delayed->expires)) { + struct bio *bio = dm_bio_from_per_bio_data(delayed, + sizeof(struct dm_delay_info)); list_del(&delayed->list); - bio_list_add(&flush_bios, delayed->bio); - if ((bio_data_dir(delayed->bio) == WRITE)) + bio_list_add(&flush_bios, bio); + if ((bio_data_dir(bio) == WRITE)) delayed->context->writes--; else delayed->context->reads--; - mempool_free(delayed, dc->delayed_pool); continue; } @@@ -182,6 
-185,12 +182,6 @@@ static int delay_ctr(struct dm_target * } out: - dc->delayed_pool = mempool_create_slab_pool(128, delayed_cache); - if (!dc->delayed_pool) { - DMERR("Couldn't create delayed bio pool."); - goto bad_dev_write; - } - dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); if (!dc->kdelayd_wq) { DMERR("Couldn't start kdelayd"); @@@ -197,11 -206,12 +197,11 @@@ ti->num_flush_bios = 1; ti->num_discard_bios = 1; + ti->per_bio_data_size = sizeof(struct dm_delay_info); ti->private = dc; return 0; bad_queue: - mempool_destroy(dc->delayed_pool); -bad_dev_write: if (dc->dev_write) dm_put_device(ti, dc->dev_write); bad_dev_read: @@@ -222,6 -232,7 +222,6 @@@ static void delay_dtr(struct dm_target if (dc->dev_write) dm_put_device(ti, dc->dev_write); - mempool_destroy(dc->delayed_pool); kfree(dc); } @@@ -233,9 -244,10 +233,9 @@@ static int delay_bio(struct delay_c *dc if (!delay || !atomic_read(&dc->may_delay)) return 1; - delayed = mempool_alloc(dc->delayed_pool, GFP_NOIO); + delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info)); delayed->context = dc; - delayed->bio = bio; delayed->expires = expires = jiffies + (delay * HZ / 1000); mutex_lock(&delayed_bios_lock); @@@ -277,14 -289,15 +277,15 @@@ static int delay_map(struct dm_target * if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { bio->bi_bdev = dc->dev_write->bdev; if (bio_sectors(bio)) - bio->bi_sector = dc->start_write + - dm_target_offset(ti, bio->bi_sector); + bio->bi_iter.bi_sector = dc->start_write + + dm_target_offset(ti, bio->bi_iter.bi_sector); return delay_bio(dc, dc->write_delay, bio); } bio->bi_bdev = dc->dev_read->bdev; - bio->bi_sector = dc->start_read + dm_target_offset(ti, bio->bi_sector); + bio->bi_iter.bi_sector = dc->start_read + + dm_target_offset(ti, bio->bi_iter.bi_sector); return delay_bio(dc, dc->read_delay, bio); } @@@ -344,7 -357,13 +345,7 @@@ static struct target_type delay_target static int __init dm_delay_init(void) { - int r = -ENOMEM; - - delayed_cache = KMEM_CACHE(dm_delay_info, 0); - if (!delayed_cache) { - DMERR("Couldn't create delayed bio cache."); - goto bad_memcache; - } + int r; r = dm_register_target(&delay_target); if (r < 0) { @@@ -355,12 -374,15 +356,12 @@@ return 0; bad_register: - kmem_cache_destroy(delayed_cache); -bad_memcache: return r; } static void __exit dm_delay_exit(void) { dm_unregister_target(&delay_target); - kmem_cache_destroy(delayed_cache); } /* Module hooks */ diff --combined drivers/md/dm-snap.c index 717718558bd,01b6a11813f..ebddef5237e --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@@ -610,12 -610,12 +610,12 @@@ static struct dm_exception *dm_lookup_e return NULL; } -static struct dm_exception *alloc_completed_exception(void) +static struct dm_exception *alloc_completed_exception(gfp_t gfp) { struct dm_exception *e; - e = kmem_cache_alloc(exception_cache, GFP_NOIO); - if (!e) + e = kmem_cache_alloc(exception_cache, gfp); + if (!e && gfp == GFP_NOIO) e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); return e; @@@ -697,7 -697,7 +697,7 @@@ static int dm_add_exception(void *conte struct dm_snapshot *s = context; struct dm_exception *e; - e = alloc_completed_exception(); + e = alloc_completed_exception(GFP_KERNEL); if (!e) return -ENOMEM; @@@ -1405,7 -1405,7 +1405,7 @@@ static void pending_complete(struct dm_ goto out; } - e = alloc_completed_exception(); + e = alloc_completed_exception(GFP_NOIO); if (!e) { down_write(&s->lock); __invalidate_snapshot(s, -ENOMEM); @@@ -1438,6 -1438,7 +1438,7 @@@ out if (full_bio) { full_bio->bi_end_io = 
pe->full_bio_end_io; full_bio->bi_private = pe->full_bio_private; + atomic_inc(&full_bio->bi_remaining); } free_pending_exception(pe); @@@ -1619,11 -1620,10 +1620,10 @@@ static void remap_exception(struct dm_s struct bio *bio, chunk_t chunk) { bio->bi_bdev = s->cow->bdev; - bio->bi_sector = chunk_to_sector(s->store, - dm_chunk_number(e->new_chunk) + - (chunk - e->old_chunk)) + - (bio->bi_sector & - s->store->chunk_mask); + bio->bi_iter.bi_sector = + chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) + + (chunk - e->old_chunk)) + + (bio->bi_iter.bi_sector & s->store->chunk_mask); } static int snapshot_map(struct dm_target *ti, struct bio *bio) @@@ -1641,7 -1641,7 +1641,7 @@@ return DM_MAPIO_REMAPPED; } - chunk = sector_to_chunk(s->store, bio->bi_sector); + chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); /* Full snapshots are not usable */ /* To get here the table must be live so s->active is always set. */ @@@ -1702,7 -1702,8 +1702,8 @@@ r = DM_MAPIO_SUBMITTED; if (!pe->started && - bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) { + bio->bi_iter.bi_size == + (s->store->chunk_size << SECTOR_SHIFT)) { pe->started = 1; up_write(&s->lock); start_full_bio(pe, bio); @@@ -1758,7 -1759,7 +1759,7 @@@ static int snapshot_merge_map(struct dm return DM_MAPIO_REMAPPED; } - chunk = sector_to_chunk(s->store, bio->bi_sector); + chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); down_write(&s->lock); @@@ -2095,7 -2096,7 +2096,7 @@@ static int do_origin(struct dm_dev *ori down_read(&_origins_lock); o = __lookup_origin(origin->bdev); if (o) - r = __origin_write(&o->snapshots, bio->bi_sector, bio); + r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio); up_read(&_origins_lock); return r; diff --combined drivers/md/dm-thin.c index 726228b33a0,357eb272dbd..faaf944597a --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@@ -144,7 -144,6 +144,7 @@@ struct pool_features bool zero_new_blocks:1; bool discard_enabled:1; bool discard_passdown:1; + bool error_if_no_space:1; }; struct thin_c; @@@ -164,7 -163,8 +164,7 @@@ struct pool int sectors_per_block_shift; struct pool_features pf; - unsigned low_water_triggered:1; /* A dm event has been sent */ - unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ + bool low_water_triggered:1; /* A dm event has been sent */ struct dm_bio_prison *prison; struct dm_kcopyd_client *copier; @@@ -198,8 -198,7 +198,8 @@@ }; static enum pool_mode get_pool_mode(struct pool *pool); -static void set_pool_mode(struct pool *pool, enum pool_mode mode); +static void out_of_data_space(struct pool *pool); +static void metadata_operation_failed(struct pool *pool, const char *op, int r); /* * Target context for a pool. 
@@@ -414,7 -413,7 +414,7 @@@ static bool block_size_is_power_of_two( static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) { struct pool *pool = tc->pool; - sector_t block_nr = bio->bi_sector; + sector_t block_nr = bio->bi_iter.bi_sector; if (block_size_is_power_of_two(pool)) block_nr >>= pool->sectors_per_block_shift; @@@ -427,14 -426,15 +427,15 @@@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) { struct pool *pool = tc->pool; - sector_t bi_sector = bio->bi_sector; + sector_t bi_sector = bio->bi_iter.bi_sector; bio->bi_bdev = tc->pool_dev->bdev; if (block_size_is_power_of_two(pool)) - bio->bi_sector = (block << pool->sectors_per_block_shift) | - (bi_sector & (pool->sectors_per_block - 1)); + bio->bi_iter.bi_sector = + (block << pool->sectors_per_block_shift) | + (bi_sector & (pool->sectors_per_block - 1)); else - bio->bi_sector = (block * pool->sectors_per_block) + + bio->bi_iter.bi_sector = (block * pool->sectors_per_block) + sector_div(bi_sector, pool->sectors_per_block); } @@@ -510,16 -510,15 +511,16 @@@ static void remap_and_issue(struct thin struct dm_thin_new_mapping { struct list_head list; - unsigned quiesced:1; - unsigned prepared:1; - unsigned pass_discard:1; + bool quiesced:1; + bool prepared:1; + bool pass_discard:1; + bool definitely_not_shared:1; + int err; struct thin_c *tc; dm_block_t virt_block; dm_block_t data_block; struct dm_bio_prison_cell *cell, *cell2; - int err; /* * If the bio covers the whole area of a block then we can avoid @@@ -536,7 -535,7 +537,7 @@@ static void __maybe_add_mapping(struct struct pool *pool = m->tc->pool; if (m->quiesced && m->prepared) { - list_add(&m->list, &pool->prepared_mappings); + list_add_tail(&m->list, &pool->prepared_mappings); wake_worker(pool); } } @@@ -550,7 -549,7 +551,7 @@@ static void copy_complete(int read_err m->err = read_err || write_err ? 
-EIO : 0; spin_lock_irqsave(&pool->lock, flags); - m->prepared = 1; + m->prepared = true; __maybe_add_mapping(m); spin_unlock_irqrestore(&pool->lock, flags); } @@@ -565,7 -564,7 +566,7 @@@ static void overwrite_endio(struct bio m->err = err; spin_lock_irqsave(&pool->lock, flags); - m->prepared = 1; + m->prepared = true; __maybe_add_mapping(m); spin_unlock_irqrestore(&pool->lock, flags); } @@@ -612,8 -611,10 +613,10 @@@ static void cell_defer_no_holder(struc static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) { - if (m->bio) + if (m->bio) { m->bio->bi_end_io = m->saved_bi_end_io; + atomic_inc(&m->bio->bi_remaining); + } cell_error(m->tc->pool, m->cell); list_del(&m->list); mempool_free(m, m->tc->pool->mapping_pool); @@@ -627,8 -628,10 +630,10 @@@ static void process_prepared_mapping(st int r; bio = m->bio; - if (bio) + if (bio) { bio->bi_end_io = m->saved_bi_end_io; + atomic_inc(&bio->bi_remaining); + } if (m->err) { cell_error(pool, m->cell); @@@ -642,7 -645,9 +647,7 @@@ */ r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); if (r) { - DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d", - dm_device_name(pool->pool_md), r); - set_pool_mode(pool, PM_READ_ONLY); + metadata_operation_failed(pool, "dm_thin_insert_block", r); cell_error(pool, m->cell); goto out; } @@@ -683,15 -688,7 +688,15 @@@ static void process_prepared_discard_pa cell_defer_no_holder(tc, m->cell2); if (m->pass_discard) - remap_and_issue(tc, m->bio, m->data_block); + if (m->definitely_not_shared) + remap_and_issue(tc, m->bio, m->data_block); + else { + bool used = false; + if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used) + bio_endio(m->bio, 0); + else + remap_and_issue(tc, m->bio, m->data_block); + } else bio_endio(m->bio, 0); @@@ -731,7 -728,8 +736,8 @@@ static void process_prepared(struct poo */ static int io_overlaps_block(struct pool *pool, struct bio *bio) { - return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT); + return bio->bi_iter.bi_size == + (pool->sectors_per_block << SECTOR_SHIFT); } static int io_overwrites_block(struct pool *pool, struct bio *bio) @@@ -759,17 -757,13 +765,17 @@@ static int ensure_next_mapping(struct p static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) { - struct dm_thin_new_mapping *r = pool->next_mapping; + struct dm_thin_new_mapping *m = pool->next_mapping; BUG_ON(!pool->next_mapping); + memset(m, 0, sizeof(struct dm_thin_new_mapping)); + INIT_LIST_HEAD(&m->list); + m->bio = NULL; + pool->next_mapping = NULL; - return r; + return m; } static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, @@@ -781,13 -775,18 +787,13 @@@ struct pool *pool = tc->pool; struct dm_thin_new_mapping *m = get_next_mapping(pool); - INIT_LIST_HEAD(&m->list); - m->quiesced = 0; - m->prepared = 0; m->tc = tc; m->virt_block = virt_block; m->data_block = data_dest; m->cell = cell; - m->err = 0; - m->bio = NULL; if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list)) - m->quiesced = 1; + m->quiesced = true; /* * IO to pool_dev remaps to the pool target's data_dev. 
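
The get_bio_block() and remap() hunks above now take the position from bio->bi_iter.bi_sector and split it into a block number plus the offset inside that block, using a shift/mask fast path when the block size is a power of two. The arithmetic on its own, as a plain userspace sketch (the names are invented for illustration; the kernel code uses sector_div() for the 64-bit division):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* Split an absolute sector into (block number, offset within block). */
static void split_sector(sector_t sector, uint32_t sectors_per_block,
                         int is_power_of_two, int block_shift,
                         sector_t *block, sector_t *offset)
{
	if (is_power_of_two) {
		*block  = sector >> block_shift;             /* get_bio_block() fast path */
		*offset = sector & (sectors_per_block - 1);  /* remap() fast path */
	} else {
		*block  = sector / sectors_per_block;        /* sector_div() in the kernel */
		*offset = sector % sectors_per_block;
	}
}

int main(void)
{
	sector_t block, offset;

	split_sector(1000003, 128, 1, 7, &block, &offset);
	printf("block %llu offset %llu\n",
	       (unsigned long long)block, (unsigned long long)offset);
	return 0;
}
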
@@@ -847,12 -846,15 +853,12 @@@ static void schedule_zero(struct thin_ struct pool *pool = tc->pool; struct dm_thin_new_mapping *m = get_next_mapping(pool); - INIT_LIST_HEAD(&m->list); - m->quiesced = 1; - m->prepared = 0; + m->quiesced = true; + m->prepared = false; m->tc = tc; m->virt_block = virt_block; m->data_block = data_block; m->cell = cell; - m->err = 0; - m->bio = NULL; /* * If the whole block of data is being overwritten or we are not @@@ -899,42 -901,41 +905,42 @@@ static int commit(struct pool *pool return -EINVAL; r = dm_pool_commit_metadata(pool->pmd); - if (r) { - DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d", - dm_device_name(pool->pool_md), r); - set_pool_mode(pool, PM_READ_ONLY); - } + if (r) + metadata_operation_failed(pool, "dm_pool_commit_metadata", r); return r; } -static int alloc_data_block(struct thin_c *tc, dm_block_t *result) +static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks) { - int r; - dm_block_t free_blocks; unsigned long flags; - struct pool *pool = tc->pool; - - /* - * Once no_free_space is set we must not allow allocation to succeed. - * Otherwise it is difficult to explain, debug, test and support. - */ - if (pool->no_free_space) - return -ENOSPC; - - r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); - if (r) - return r; if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { DMWARN("%s: reached low water mark for data device: sending event.", dm_device_name(pool->pool_md)); spin_lock_irqsave(&pool->lock, flags); - pool->low_water_triggered = 1; + pool->low_water_triggered = true; spin_unlock_irqrestore(&pool->lock, flags); dm_table_event(pool->ti->table); } +} + +static int alloc_data_block(struct thin_c *tc, dm_block_t *result) +{ + int r; + dm_block_t free_blocks; + struct pool *pool = tc->pool; + + if (get_pool_mode(pool) != PM_WRITE) + return -EINVAL; + + r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); + if (r) { + metadata_operation_failed(pool, "dm_pool_get_free_block_count", r); + return r; + } + + check_low_water_mark(pool, free_blocks); if (!free_blocks) { /* @@@ -946,20 -947,35 +952,20 @@@ return r; r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); - if (r) + if (r) { + metadata_operation_failed(pool, "dm_pool_get_free_block_count", r); return r; + } - /* - * If we still have no space we set a flag to avoid - * doing all this checking and return -ENOSPC. This - * flag serves as a latch that disallows allocations from - * this pool until the admin takes action (e.g. resize or - * table reload). 
- */ if (!free_blocks) { - DMWARN("%s: no free data space available.", - dm_device_name(pool->pool_md)); - spin_lock_irqsave(&pool->lock, flags); - pool->no_free_space = 1; - spin_unlock_irqrestore(&pool->lock, flags); + out_of_data_space(pool); return -ENOSPC; } } r = dm_pool_alloc_data_block(pool->pmd, result); if (r) { - if (r == -ENOSPC && - !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) && - !free_blocks) { - DMWARN("%s: no free metadata space available.", - dm_device_name(pool->pool_md)); - set_pool_mode(pool, PM_READ_ONLY); - } + metadata_operation_failed(pool, "dm_pool_alloc_data_block", r); return r; } @@@ -982,21 -998,7 +988,21 @@@ static void retry_on_resume(struct bio spin_unlock_irqrestore(&pool->lock, flags); } -static void no_space(struct pool *pool, struct dm_bio_prison_cell *cell) +static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) +{ + /* + * When pool is read-only, no cell locking is needed because + * nothing is changing. + */ + WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY); + + if (pool->pf.error_if_no_space) + bio_io_error(bio); + else + retry_on_resume(bio); +} + +static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell) { struct bio *bio; struct bio_list bios; @@@ -1005,7 -1007,7 +1011,7 @@@ cell_release(pool, cell, &bios); while ((bio = bio_list_pop(&bios))) - retry_on_resume(bio); + handle_unserviceable_bio(pool, bio); } static void process_discard(struct thin_c *tc, struct bio *bio) @@@ -1044,17 -1046,17 +1050,17 @@@ */ m = get_next_mapping(pool); m->tc = tc; - m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown; + m->pass_discard = pool->pf.discard_passdown; + m->definitely_not_shared = !lookup_result.shared; m->virt_block = block; m->data_block = lookup_result.block; m->cell = cell; m->cell2 = cell2; - m->err = 0; m->bio = bio; if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { spin_lock_irqsave(&pool->lock, flags); - list_add(&m->list, &pool->prepared_discards); + list_add_tail(&m->list, &pool->prepared_discards); spin_unlock_irqrestore(&pool->lock, flags); wake_worker(pool); } @@@ -1109,12 -1111,13 +1115,12 @@@ static void break_sharing(struct thin_ break; case -ENOSPC: - no_space(pool, cell); + retry_bios_on_resume(pool, cell); break; default: DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", __func__, r); - set_pool_mode(pool, PM_READ_ONLY); cell_error(pool, cell); break; } @@@ -1136,7 -1139,7 +1142,7 @@@ static void process_shared_bio(struct t if (bio_detain(pool, &key, bio, &cell)) return; - if (bio_data_dir(bio) == WRITE && bio->bi_size) + if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) break_sharing(tc, bio, block, &key, lookup_result, cell); else { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); @@@ -1159,7 -1162,7 +1165,7 @@@ static void provision_block(struct thin /* * Remap empty bios (flushes) immediately, without provisioning. 
*/ - if (!bio->bi_size) { + if (!bio->bi_iter.bi_size) { inc_all_io_entry(pool, bio); cell_defer_no_holder(tc, cell); @@@ -1187,12 -1190,13 +1193,12 @@@ break; case -ENOSPC: - no_space(pool, cell); + retry_bios_on_resume(pool, cell); break; default: DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", __func__, r); - set_pool_mode(pool, PM_READ_ONLY); cell_error(pool, cell); break; } @@@ -1258,8 -1262,8 +1264,8 @@@ static void process_bio_read_only(struc r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { case 0: - if (lookup_result.shared && (rw == WRITE) && bio->bi_size) + if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) - bio_io_error(bio); + handle_unserviceable_bio(tc->pool, bio); else { inc_all_io_entry(tc->pool, bio); remap_and_issue(tc, bio, lookup_result.block); @@@ -1268,7 -1272,7 +1274,7 @@@ case -ENODATA: if (rw != READ) { - bio_io_error(bio); + handle_unserviceable_bio(tc->pool, bio); break; } @@@ -1392,16 -1396,16 +1398,16 @@@ static enum pool_mode get_pool_mode(str return pool->pf.mode; } -static void set_pool_mode(struct pool *pool, enum pool_mode mode) +static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) { int r; + enum pool_mode old_mode = pool->pf.mode; - pool->pf.mode = mode; - - switch (mode) { + switch (new_mode) { case PM_FAIL: - DMERR("%s: switching pool to failure mode", - dm_device_name(pool->pool_md)); + if (old_mode != new_mode) + DMERR("%s: switching pool to failure mode", + dm_device_name(pool->pool_md)); dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_fail; pool->process_discard = process_bio_fail; @@@ -1410,15 -1414,13 +1416,15 @@@ break; case PM_READ_ONLY: - DMERR("%s: switching pool to read-only mode", - dm_device_name(pool->pool_md)); + if (old_mode != new_mode) + DMERR("%s: switching pool to read-only mode", + dm_device_name(pool->pool_md)); r = dm_pool_abort_metadata(pool->pmd); if (r) { DMERR("%s: aborting transaction failed", dm_device_name(pool->pool_md)); - set_pool_mode(pool, PM_FAIL); + new_mode = PM_FAIL; + set_pool_mode(pool, new_mode); } else { dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_read_only; @@@ -1429,9 -1431,6 +1435,9 @@@ break; case PM_WRITE: + if (old_mode != new_mode) + DMINFO("%s: switching pool to write mode", + dm_device_name(pool->pool_md)); dm_pool_metadata_read_write(pool->pmd); pool->process_bio = process_bio; pool->process_discard = process_discard; @@@ -1439,35 -1438,6 +1445,35 @@@ pool->process_prepared_discard = process_prepared_discard; break; } + + pool->pf.mode = new_mode; +} + +/* + * Rather than calling set_pool_mode directly, use these which describe the + * reason for mode degradation. 
+ */ +static void out_of_data_space(struct pool *pool) +{ + DMERR_LIMIT("%s: no free data space available.", + dm_device_name(pool->pool_md)); + set_pool_mode(pool, PM_READ_ONLY); +} + +static void metadata_operation_failed(struct pool *pool, const char *op, int r) +{ + dm_block_t free_blocks; + + DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", + dm_device_name(pool->pool_md), op, r); + + if (r == -ENOSPC && + !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) && + !free_blocks) + DMERR_LIMIT("%s: no free metadata space available.", + dm_device_name(pool->pool_md)); + + set_pool_mode(pool, PM_READ_ONLY); } /*----------------------------------------------------------------*/ @@@ -1574,9 -1544,9 +1580,9 @@@ static int thin_bio_map(struct dm_targe if (get_pool_mode(tc->pool) == PM_READ_ONLY) { /* * This block isn't provisioned, and we have no way - * of doing so. Just error it. + * of doing so. */ - bio_io_error(bio); + handle_unserviceable_bio(tc->pool, bio); return DM_MAPIO_SUBMITTED; } /* fall through */ @@@ -1683,17 -1653,6 +1689,17 @@@ static int bind_control_target(struct p enum pool_mode old_mode = pool->pf.mode; enum pool_mode new_mode = pt->adjusted_pf.mode; + /* + * Don't change the pool's mode until set_pool_mode() below. + * Otherwise the pool's process_* function pointers may + * not match the desired pool mode. + */ + pt->adjusted_pf.mode = old_mode; + + pool->ti = ti; + pool->pf = pt->adjusted_pf; + pool->low_water_blocks = pt->low_water_blocks; + /* * If we were in PM_FAIL mode, rollback of metadata failed. We're * not going to recover without a thin_repair. So we never let the @@@ -1704,6 -1663,10 +1710,6 @@@ if (old_mode == PM_FAIL) new_mode = old_mode; - pool->ti = ti; - pool->low_water_blocks = pt->low_water_blocks; - pool->pf = pt->adjusted_pf; - set_pool_mode(pool, new_mode); return 0; @@@ -1725,7 -1688,6 +1731,7 @@@ static void pool_features_init(struct p pf->zero_new_blocks = true; pf->discard_enabled = true; pf->discard_passdown = true; + pf->error_if_no_space = false; } static void __pool_destroy(struct pool *pool) @@@ -1816,7 -1778,8 +1822,7 @@@ static struct pool *pool_create(struct bio_list_init(&pool->deferred_flush_bios); INIT_LIST_HEAD(&pool->prepared_mappings); INIT_LIST_HEAD(&pool->prepared_discards); - pool->low_water_triggered = 0; - pool->no_free_space = 0; + pool->low_water_triggered = false; bio_list_init(&pool->retry_on_resume_list); pool->shared_read_ds = dm_deferred_set_create(); @@@ -1941,7 -1904,7 +1947,7 @@@ static int parse_pool_features(struct d const char *arg_name; static struct dm_arg _args[] = { - {0, 3, "Invalid number of pool feature arguments"}, + {0, 4, "Invalid number of pool feature arguments"}, }; /* @@@ -1970,9 -1933,6 +1976,9 @@@ else if (!strcasecmp(arg_name, "read_only")) pf->mode = PM_READ_ONLY; + else if (!strcasecmp(arg_name, "error_if_no_space")) + pf->error_if_no_space = true; + else { ti->error = "Unrecognised pool feature requested"; r = -EINVAL; @@@ -2043,8 -2003,6 +2049,8 @@@ static dm_block_t calc_metadata_thresho * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. * ignore_discard: disable discard * no_discard_passdown: don't pass discards down to the data device + * read_only: Don't allow any changes to be made to the pool metadata. + * error_if_no_space: error IOs, instead of queueing, if no space. 
*/ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) { @@@ -2240,13 -2198,11 +2246,13 @@@ static int maybe_resize_data_dev(struc return -EINVAL; } else if (data_size > sb_data_size) { + if (sb_data_size) + DMINFO("%s: growing the data device from %llu to %llu blocks", + dm_device_name(pool->pool_md), + sb_data_size, (unsigned long long)data_size); r = dm_pool_resize_data_dev(pool->pmd, data_size); if (r) { - DMERR("%s: failed to resize data device", - dm_device_name(pool->pool_md)); - set_pool_mode(pool, PM_READ_ONLY); + metadata_operation_failed(pool, "dm_pool_resize_data_dev", r); return r; } @@@ -2281,12 -2237,10 +2287,12 @@@ static int maybe_resize_metadata_dev(st return -EINVAL; } else if (metadata_dev_size > sb_metadata_dev_size) { + DMINFO("%s: growing the metadata device from %llu to %llu blocks", + dm_device_name(pool->pool_md), + sb_metadata_dev_size, metadata_dev_size); r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); if (r) { - DMERR("%s: failed to resize metadata device", - dm_device_name(pool->pool_md)); + metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r); return r; } @@@ -2342,7 -2296,8 +2348,7 @@@ static void pool_resume(struct dm_targe unsigned long flags; spin_lock_irqsave(&pool->lock, flags); - pool->low_water_triggered = 0; - pool->no_free_space = 0; + pool->low_water_triggered = false; __requeue_bios(pool); spin_unlock_irqrestore(&pool->lock, flags); @@@ -2561,8 -2516,7 +2567,8 @@@ static void emit_flags(struct pool_feat unsigned sz, unsigned maxlen) { unsigned count = !pf->zero_new_blocks + !pf->discard_enabled + - !pf->discard_passdown + (pf->mode == PM_READ_ONLY); + !pf->discard_passdown + (pf->mode == PM_READ_ONLY) + + pf->error_if_no_space; DMEMIT("%u ", count); if (!pf->zero_new_blocks) @@@ -2576,9 -2530,6 +2582,9 @@@ if (pf->mode == PM_READ_ONLY) DMEMIT("read_only "); + + if (pf->error_if_no_space) + DMEMIT("error_if_no_space "); } /* @@@ -2673,16 -2624,11 +2679,16 @@@ static void pool_status(struct dm_targe DMEMIT("rw "); if (!pool->pf.discard_enabled) - DMEMIT("ignore_discard"); + DMEMIT("ignore_discard "); else if (pool->pf.discard_passdown) - DMEMIT("discard_passdown"); + DMEMIT("discard_passdown "); + else + DMEMIT("no_discard_passdown "); + + if (pool->pf.error_if_no_space) + DMEMIT("error_if_no_space "); else - DMEMIT("no_discard_passdown"); + DMEMIT("queue_if_no_space "); break; @@@ -2781,7 -2727,7 +2787,7 @@@ static struct target_type pool_target .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 9, 0}, + .version = {1, 10, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@@ -2939,7 -2885,7 +2945,7 @@@ out_unlock static int thin_map(struct dm_target *ti, struct bio *bio) { - bio->bi_sector = dm_target_offset(ti, bio->bi_sector); + bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); return thin_bio_map(ti, bio); } @@@ -2959,7 -2905,7 +2965,7 @@@ static int thin_endio(struct dm_target spin_lock_irqsave(&pool->lock, flags); list_for_each_entry_safe(m, tmp, &work, list) { list_del(&m->list); - m->quiesced = 1; + m->quiesced = true; __maybe_add_mapping(m); } spin_unlock_irqrestore(&pool->lock, flags); @@@ -2971,7 -2917,7 +2977,7 @@@ if (!list_empty(&work)) { spin_lock_irqsave(&pool->lock, flags); list_for_each_entry_safe(m, tmp, &work, list) - list_add(&m->list, &pool->prepared_discards); + list_add_tail(&m->list, &pool->prepared_discards); spin_unlock_irqrestore(&pool->lock, flags); wake_worker(pool); } @@@ 
-3068,7 -3014,7 +3074,7 @@@ static int thin_iterate_devices(struct static struct target_type thin_target = { .name = "thin", - .version = {1, 9, 0}, + .version = {1, 10, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, diff --combined drivers/md/dm.c index b49c7628424,44a2fa6814c..8c53b09b9a2 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@@ -200,8 -200,8 +200,8 @@@ struct mapped_device /* forced geometry settings */ struct hd_geometry geometry; - /* sysfs handle */ - struct kobject kobj; + /* kobject and completion */ + struct dm_kobject_holder kobj_holder; /* zero-length flush that will be cloned and submitted to targets */ struct bio flush_bio; @@@ -575,7 -575,7 +575,7 @@@ static void start_io_acct(struct dm_io atomic_inc_return(&md->pending[rw])); if (unlikely(dm_stats_used(&md->stats))) - dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector, + dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, bio_sectors(bio), false, 0, &io->stats_aux); } @@@ -593,7 -593,7 +593,7 @@@ static void end_io_acct(struct dm_io *i part_stat_unlock(); if (unlikely(dm_stats_used(&md->stats))) - dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector, + dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, bio_sectors(bio), true, duration, &io->stats_aux); /* @@@ -742,7 -742,7 +742,7 @@@ static void dec_pending(struct dm_io *i if (io_error == DM_ENDIO_REQUEUE) return; - if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { + if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) { /* * Preflush done for flush with data, reissue * without REQ_FLUSH. @@@ -797,7 -797,7 +797,7 @@@ static void end_clone_bio(struct bio *c struct dm_rq_clone_bio_info *info = clone->bi_private; struct dm_rq_target_io *tio = info->tio; struct bio *bio = info->orig; - unsigned int nr_bytes = info->orig->bi_size; + unsigned int nr_bytes = info->orig->bi_iter.bi_size; bio_put(clone); @@@ -1128,7 -1128,7 +1128,7 @@@ static void __map_bio(struct dm_target_ * this io. */ atomic_inc(&tio->io->io_count); - sector = clone->bi_sector; + sector = clone->bi_iter.bi_sector; r = ti->type->map(ti, clone); if (r == DM_MAPIO_REMAPPED) { /* the bio has been remapped so dispatch it */ @@@ -1155,76 -1155,32 +1155,32 @@@ struct clone_info struct dm_io *io; sector_t sector; sector_t sector_count; - unsigned short idx; }; static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len) { - bio->bi_sector = sector; - bio->bi_size = to_bytes(len); - } - - static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count) - { - bio->bi_idx = idx; - bio->bi_vcnt = idx + bv_count; - bio->bi_flags &= ~(1 << BIO_SEG_VALID); - } - - static void clone_bio_integrity(struct bio *bio, struct bio *clone, - unsigned short idx, unsigned len, unsigned offset, - unsigned trim) - { - if (!bio_integrity(bio)) - return; - - bio_integrity_clone(clone, bio, GFP_NOIO); - - if (trim) - bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len); - } - - /* - * Creates a little bio that just does part of a bvec. 
- */ - static void clone_split_bio(struct dm_target_io *tio, struct bio *bio, - sector_t sector, unsigned short idx, - unsigned offset, unsigned len) - { - struct bio *clone = &tio->clone; - struct bio_vec *bv = bio->bi_io_vec + idx; - - *clone->bi_io_vec = *bv; - - bio_setup_sector(clone, sector, len); - - clone->bi_bdev = bio->bi_bdev; - clone->bi_rw = bio->bi_rw; - clone->bi_vcnt = 1; - clone->bi_io_vec->bv_offset = offset; - clone->bi_io_vec->bv_len = clone->bi_size; - clone->bi_flags |= 1 << BIO_CLONED; - - clone_bio_integrity(bio, clone, idx, len, offset, 1); + bio->bi_iter.bi_sector = sector; + bio->bi_iter.bi_size = to_bytes(len); } /* * Creates a bio that consists of range of complete bvecs. */ static void clone_bio(struct dm_target_io *tio, struct bio *bio, - sector_t sector, unsigned short idx, - unsigned short bv_count, unsigned len) + sector_t sector, unsigned len) { struct bio *clone = &tio->clone; - unsigned trim = 0; - __bio_clone(clone, bio); - bio_setup_sector(clone, sector, len); - bio_setup_bv(clone, idx, bv_count); + __bio_clone_fast(clone, bio); + + if (bio_integrity(bio)) + bio_integrity_clone(clone, bio, GFP_NOIO); - if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) - trim = 1; - clone_bio_integrity(bio, clone, idx, len, 0, trim); + bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); + clone->bi_iter.bi_size = to_bytes(len); + + if (bio_integrity(bio)) + bio_integrity_trim(clone, 0, len); } static struct dm_target_io *alloc_tio(struct clone_info *ci, @@@ -1257,7 -1213,7 +1213,7 @@@ static void __clone_and_map_simple_bio( * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush * and discard, so no need for concern about wasted bvec allocations. */ - __bio_clone(clone, ci->bio); + __bio_clone_fast(clone, ci->bio); if (len) bio_setup_sector(clone, ci->sector, len); @@@ -1286,10 -1242,7 +1242,7 @@@ static int __send_empty_flush(struct cl } static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, - sector_t sector, int nr_iovecs, - unsigned short idx, unsigned short bv_count, - unsigned offset, unsigned len, - unsigned split_bvec) + sector_t sector, unsigned len) { struct bio *bio = ci->bio; struct dm_target_io *tio; @@@ -1303,11 -1256,8 +1256,8 @@@ num_target_bios = ti->num_write_bios(ti, bio); for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { - tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr); - if (split_bvec) - clone_split_bio(tio, bio, sector, idx, offset, len); - else - clone_bio(tio, bio, sector, idx, bv_count, len); + tio = alloc_tio(ci, ti, 0, target_bio_nr); + clone_bio(tio, bio, sector, len); __map_bio(tio); } } @@@ -1378,60 -1328,6 +1328,6 @@@ static int __send_write_same(struct clo return __send_changing_extent_only(ci, get_num_write_same_bios, NULL); } - /* - * Find maximum number of sectors / bvecs we can process with a single bio. 
- */ - static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx) - { - struct bio *bio = ci->bio; - sector_t bv_len, total_len = 0; - - for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) { - bv_len = to_sector(bio->bi_io_vec[*idx].bv_len); - - if (bv_len > max) - break; - - max -= bv_len; - total_len += bv_len; - } - - return total_len; - } - - static int __split_bvec_across_targets(struct clone_info *ci, - struct dm_target *ti, sector_t max) - { - struct bio *bio = ci->bio; - struct bio_vec *bv = bio->bi_io_vec + ci->idx; - sector_t remaining = to_sector(bv->bv_len); - unsigned offset = 0; - sector_t len; - - do { - if (offset) { - ti = dm_table_find_target(ci->map, ci->sector); - if (!dm_target_is_valid(ti)) - return -EIO; - - max = max_io_len(ci->sector, ti); - } - - len = min(remaining, max); - - __clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0, - bv->bv_offset + offset, len, 1); - - ci->sector += len; - ci->sector_count -= len; - offset += to_bytes(len); - } while (remaining -= len); - - ci->idx++; - - return 0; - } - /* * Select the correct strategy for processing a non-flush bio. */ @@@ -1439,8 -1335,7 +1335,7 @@@ static int __split_and_process_non_flus { struct bio *bio = ci->bio; struct dm_target *ti; - sector_t len, max; - int idx; + unsigned len; if (unlikely(bio->bi_rw & REQ_DISCARD)) return __send_discard(ci); @@@ -1451,41 -1346,14 +1346,14 @@@ if (!dm_target_is_valid(ti)) return -EIO; - max = max_io_len(ci->sector, ti); - - /* - * Optimise for the simple case where we can do all of - * the remaining io with a single clone. - */ - if (ci->sector_count <= max) { - __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs, - ci->idx, bio->bi_vcnt - ci->idx, 0, - ci->sector_count, 0); - ci->sector_count = 0; - return 0; - } + len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); - /* - * There are some bvecs that don't span targets. - * Do as many of these as possible. - */ - if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { - len = __len_within_target(ci, max, &idx); + __clone_and_map_data_bio(ci, ti, ci->sector, len); - __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs, - ci->idx, idx - ci->idx, 0, len, 0); + ci->sector += len; + ci->sector_count -= len; - ci->sector += len; - ci->sector_count -= len; - ci->idx = idx; - - return 0; - } - - /* - * Handle a bvec that must be split between two or more targets. 
- */ - return __split_bvec_across_targets(ci, ti, max); + return 0; } /* @@@ -1510,8 -1378,7 +1378,7 @@@ static void __split_and_process_bio(str ci.io->bio = bio; ci.io->md = md; spin_lock_init(&ci.io->endio_lock); - ci.sector = bio->bi_sector; - ci.idx = bio->bi_idx; + ci.sector = bio->bi_iter.bi_sector; start_io_acct(ci.io); @@@ -2041,7 -1908,6 +1908,7 @@@ static struct mapped_device *alloc_dev( init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); + init_completion(&md->kobj_holder.completion); md->disk->major = _major; md->disk->first_minor = minor; @@@ -2903,14 -2769,20 +2770,14 @@@ struct gendisk *dm_disk(struct mapped_d struct kobject *dm_kobject(struct mapped_device *md) { - return &md->kobj; + return &md->kobj_holder.kobj; } -/* - * struct mapped_device should not be exported outside of dm.c - * so use this check to verify that kobj is part of md structure - */ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) { struct mapped_device *md; - md = container_of(kobj, struct mapped_device, kobj); - if (&md->kobj != kobj) - return NULL; + md = container_of(kobj, struct mapped_device, kobj_holder.kobj); if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) diff --combined drivers/md/md.c index 40c531359a1,16d84e091e2..4ad5cc4e63e --- a/drivers/md/md.c +++ b/drivers/md/md.c @@@ -393,7 -393,7 +393,7 @@@ static void md_submit_flush_data(struc struct mddev *mddev = container_of(ws, struct mddev, flush_work); struct bio *bio = mddev->flush_bio; - if (bio->bi_size == 0) + if (bio->bi_iter.bi_size == 0) /* an empty barrier - all done */ bio_endio(bio, 0); else { @@@ -754,7 -754,7 +754,7 @@@ void md_super_write(struct mddev *mddev struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio_add_page(bio, page, size, 0); bio->bi_private = rdev; bio->bi_end_io = super_written; @@@ -782,18 -782,16 +782,16 @@@ int sync_page_io(struct md_rdev *rdev, struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); int ret; - rw |= REQ_SYNC; - bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? 
rdev->meta_bdev : rdev->bdev; if (metadata_op) - bio->bi_sector = sector + rdev->sb_start; + bio->bi_iter.bi_sector = sector + rdev->sb_start; else if (rdev->mddev->reshape_position != MaxSector && (rdev->mddev->reshape_backwards == (sector >= rdev->mddev->reshape_position))) - bio->bi_sector = sector + rdev->new_data_offset; + bio->bi_iter.bi_sector = sector + rdev->new_data_offset; else - bio->bi_sector = sector + rdev->data_offset; + bio->bi_iter.bi_sector = sector + rdev->data_offset; bio_add_page(bio, page, size, 0); submit_bio_wait(rw, bio); @@@ -1077,7 -1075,6 +1075,7 @@@ static int super_90_validate(struct mdd rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); if (mddev->raid_disks == 0) { @@@ -1156,8 -1153,6 +1154,8 @@@ */ if (ev1 < mddev->bitmap->events_cleared) return 0; + if (ev1 < mddev->events) + set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ @@@ -1173,7 -1168,6 +1171,7 @@@ desc->raid_disk < mddev->raid_disks */) { set_bit(In_sync, &rdev->flags); rdev->raid_disk = desc->raid_disk; + rdev->saved_raid_disk = desc->raid_disk; } else if (desc->state & (1<raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); if (mddev->raid_disks == 0) { @@@ -1650,8 -1643,6 +1648,8 @@@ */ if (ev1 < mddev->bitmap->events_cleared) return 0; + if (ev1 < mddev->events) + set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ @@@ -1672,14 -1663,10 +1670,14 @@@ set_bit(Faulty, &rdev->flags); break; default: + rdev->saved_raid_disk = role; if ((le32_to_cpu(sb->feature_map) & - MD_FEATURE_RECOVERY_OFFSET)) + MD_FEATURE_RECOVERY_OFFSET)) { rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); - else + if (!(le32_to_cpu(sb->feature_map) & + MD_FEATURE_RECOVERY_BITMAP)) + rdev->saved_raid_disk = -1; + } else set_bit(In_sync, &rdev->flags); rdev->raid_disk = role; break; @@@ -1741,9 -1728,6 +1739,9 @@@ static void super_1_sync(struct mddev * cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); + if (rdev->saved_raid_disk >= 0 && mddev->bitmap) + sb->feature_map |= + cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); } if (test_bit(Replacement, &rdev->flags)) sb->feature_map |= @@@ -2485,7 -2469,8 +2483,7 @@@ repeat if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ - if (!test_bit(Faulty, &rdev->flags) && - rdev->saved_raid_disk == -1) { + if (!test_bit(Faulty, &rdev->flags)) { md_super_write(mddev,rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); @@@ -2501,9 -2486,11 +2499,9 @@@ rdev->badblocks.size = 0; } - } else if (test_bit(Faulty, &rdev->flags)) + } else pr_debug("md: %s (skipping faulty)\n", bdevname(rdev->bdev, b)); - else - pr_debug("(skipping incremental s/r "); if (mddev->level == LEVEL_MULTIPATH) /* only need to write one superblock... 
*/ @@@ -2619,8 -2606,6 +2617,8 @@@ state_store(struct md_rdev *rdev, cons * blocked - sets the Blocked flags * -blocked - clears the Blocked and possibly simulates an error * insync - sets Insync providing device isn't active + * -insync - clear Insync for a device with a slot assigned, + * so that it gets rebuilt based on bitmap * write_error - sets WriteErrorSeen * -write_error - clears WriteErrorSeen */ @@@ -2669,11 -2654,6 +2667,11 @@@ } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { set_bit(In_sync, &rdev->flags); err = 0; + } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) { + clear_bit(In_sync, &rdev->flags); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->raid_disk = -1; + err = 0; } else if (cmd_match(buf, "write_error")) { set_bit(WriteErrorSeen, &rdev->flags); err = 0; @@@ -2806,7 -2786,6 +2804,7 @@@ slot_store(struct md_rdev *rdev, const else rdev->saved_raid_disk = -1; clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); err = rdev->mddev->pers-> hot_add_disk(rdev->mddev, rdev); if (err) { @@@ -3601,8 -3580,6 +3599,8 @@@ level_store(struct mddev *mddev, const pers->run(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); mddev_resume(mddev); + if (!mddev->thread) + md_update_sb(mddev, 1); sysfs_notify(&mddev->kobj, NULL, "level"); md_new_event(mddev); return rv; @@@ -5781,10 -5758,8 +5779,10 @@@ static int add_new_disk(struct mddev * info->raid_disk < mddev->raid_disks) { rdev->raid_disk = info->raid_disk; set_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); } else rdev->raid_disk = -1; + rdev->saved_raid_disk = rdev->raid_disk; } else super_types[mddev->major_version]. validate_super(mddev, rdev); @@@ -5797,6 -5772,11 +5795,6 @@@ return -EINVAL; } - if (test_bit(In_sync, &rdev->flags)) - rdev->saved_raid_disk = rdev->raid_disk; - else - rdev->saved_raid_disk = -1; - clear_bit(In_sync, &rdev->flags); /* just to be sure */ if (info->state & (1<flags); @@@ -6346,32 -6326,6 +6344,32 @@@ static int md_getgeo(struct block_devic return 0; } +static inline bool md_ioctl_valid(unsigned int cmd) +{ + switch (cmd) { + case ADD_NEW_DISK: + case BLKROSET: + case GET_ARRAY_INFO: + case GET_BITMAP_FILE: + case GET_DISK_INFO: + case HOT_ADD_DISK: + case HOT_REMOVE_DISK: + case PRINT_RAID_DEBUG: + case RAID_AUTORUN: + case RAID_VERSION: + case RESTART_ARRAY_RW: + case RUN_ARRAY: + case SET_ARRAY_INFO: + case SET_BITMAP_FILE: + case SET_DISK_FAULTY: + case STOP_ARRAY: + case STOP_ARRAY_RO: + return true; + default: + return false; + } +} + static int md_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@@ -6380,9 -6334,6 +6378,9 @@@ struct mddev *mddev = NULL; int ro; + if (!md_ioctl_valid(cmd)) + return -ENOTTY; + switch (cmd) { case RAID_VERSION: case GET_ARRAY_INFO: @@@ -7753,12 -7704,10 +7751,12 @@@ static int remove_and_add_spares(struc if (test_bit(Faulty, &rdev->flags)) continue; if (mddev->ro && - rdev->saved_raid_disk < 0) + ! (rdev->saved_raid_disk >= 0 && + !test_bit(Bitmap_sync, &rdev->flags))) continue; - rdev->recovery_offset = 0; + if (rdev->saved_raid_disk < 0) + rdev->recovery_offset = 0; if (mddev->pers-> hot_add_disk(mddev, rdev) == 0) { if (sysfs_link_rdev(mddev, rdev)) @@@ -7836,12 -7785,9 +7834,12 @@@ void md_check_recovery(struct mddev *md * As we only add devices that are already in-sync, * we can activate the spares immediately. 
*/ - clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); remove_and_add_spares(mddev, NULL); - mddev->pers->spare_active(mddev); + /* There is no thread, but we need to call + * ->spare_active and clear saved_raid_disk + */ + md_reap_sync_thread(mddev); + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); goto unlock; } @@@ -7978,10 -7924,14 +7976,10 @@@ void md_reap_sync_thread(struct mddev * mddev->pers->finish_reshape(mddev); /* If array is no-longer degraded, then any saved_raid_disk - * information must be scrapped. Also if any device is now - * In_sync we must scrape the saved_raid_disk for that device - * do the superblock for an incrementally recovered device - * written out. + * information must be scrapped. */ - rdev_for_each(rdev, mddev) - if (!mddev->degraded || - test_bit(In_sync, &rdev->flags)) + if (!mddev->degraded) + rdev_for_each(rdev, mddev) rdev->saved_raid_disk = -1; md_update_sb(mddev, 1); diff --combined drivers/md/raid1.c index a49cfcc7a34,db3b9d7314f..fd3a2a14b58 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@@ -229,7 -229,7 +229,7 @@@ static void call_bio_endio(struct r1bi int done; struct r1conf *conf = r1_bio->mddev->private; sector_t start_next_window = r1_bio->start_next_window; - sector_t bi_sector = bio->bi_sector; + sector_t bi_sector = bio->bi_iter.bi_sector; if (bio->bi_phys_segments) { unsigned long flags; @@@ -265,9 -265,8 +265,8 @@@ static void raid_end_bio_io(struct r1bi if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { pr_debug("raid1: sync end %s on sectors %llu-%llu\n", (bio_data_dir(bio) == WRITE) ? "write" : "read", - (unsigned long long) bio->bi_sector, - (unsigned long long) bio->bi_sector + - bio_sectors(bio) - 1); + (unsigned long long) bio->bi_iter.bi_sector, + (unsigned long long) bio_end_sector(bio) - 1); call_bio_endio(r1_bio); } @@@ -466,9 -465,8 +465,8 @@@ static void raid1_end_write_request(str struct bio *mbio = r1_bio->master_bio; pr_debug("raid1: behind end write sectors" " %llu-%llu\n", - (unsigned long long) mbio->bi_sector, - (unsigned long long) mbio->bi_sector + - bio_sectors(mbio) - 1); + (unsigned long long) mbio->bi_iter.bi_sector, + (unsigned long long) bio_end_sector(mbio) - 1); call_bio_endio(r1_bio); } } @@@ -875,7 -873,7 +873,7 @@@ static bool need_to_wait_for_sync(struc else if ((conf->next_resync - RESYNC_WINDOW_SECTORS >= bio_end_sector(bio)) || (conf->next_resync + NEXT_NORMALIO_DISTANCE - <= bio->bi_sector)) + <= bio->bi_iter.bi_sector)) wait = false; else wait = true; @@@ -913,19 -911,20 +911,19 @@@ static sector_t wait_barrier(struct r1c if (bio && bio_data_dir(bio) == WRITE) { if (conf->next_resync + NEXT_NORMALIO_DISTANCE - <= bio->bi_sector) { + <= bio->bi_iter.bi_sector) { if (conf->start_next_window == MaxSector) conf->start_next_window = conf->next_resync + NEXT_NORMALIO_DISTANCE; if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) - <= bio->bi_sector) + <= bio->bi_iter.bi_sector) conf->next_window_requests++; else conf->current_window_requests++; - } - if (bio->bi_iter.bi_sector >= conf->start_next_window) sector = conf->start_next_window; + } } conf->nr_pending++; @@@ -1027,7 -1026,8 +1025,8 @@@ do_sync_io if (bvecs[i].bv_page) put_page(bvecs[i].bv_page); kfree(bvecs); - pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); + pr_debug("%dB behind alloc failed, doing sync I/O\n", + bio->bi_iter.bi_size); } struct raid1_plug_cb { @@@ -1107,7 -1107,7 +1106,7 @@@ static void make_request(struct mddev * if (bio_data_dir(bio) == WRITE && bio_end_sector(bio) > mddev->suspend_lo && - 
bio->bi_sector < mddev->suspend_hi) { + bio->bi_iter.bi_sector < mddev->suspend_hi) { /* As the suspend_* range is controlled by * userspace, we want an interruptible * wait. @@@ -1118,7 -1118,7 +1117,7 @@@ prepare_to_wait(&conf->wait_barrier, &w, TASK_INTERRUPTIBLE); if (bio_end_sector(bio) <= mddev->suspend_lo || - bio->bi_sector >= mddev->suspend_hi) + bio->bi_iter.bi_sector >= mddev->suspend_hi) break; schedule(); } @@@ -1140,7 -1140,7 +1139,7 @@@ r1_bio->sectors = bio_sectors(bio); r1_bio->state = 0; r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_sector; + r1_bio->sector = bio->bi_iter.bi_sector; /* We might need to issue multiple reads to different * devices if there are bad blocks around, so we keep @@@ -1180,12 -1180,13 +1179,13 @@@ read_again r1_bio->read_disk = rdisk; read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(read_bio, r1_bio->sector - bio->bi_sector, + bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors); r1_bio->bios[rdisk] = read_bio; - read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; + read_bio->bi_iter.bi_sector = r1_bio->sector + + mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_end_io = raid1_end_read_request; read_bio->bi_rw = READ | do_sync; @@@ -1197,7 -1198,7 +1197,7 @@@ */ sectors_handled = (r1_bio->sector + max_sectors - - bio->bi_sector); + - bio->bi_iter.bi_sector); r1_bio->sectors = max_sectors; spin_lock_irq(&conf->device_lock); if (bio->bi_phys_segments == 0) @@@ -1218,7 -1219,8 +1218,8 @@@ r1_bio->sectors = bio_sectors(bio) - sectors_handled; r1_bio->state = 0; r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_sector + sectors_handled; + r1_bio->sector = bio->bi_iter.bi_sector + + sectors_handled; goto read_again; } else generic_make_request(read_bio); @@@ -1321,7 -1323,7 +1322,7 @@@ if (r1_bio->bios[j]) rdev_dec_pending(conf->mirrors[j].rdev, mddev); r1_bio->state = 0; - allow_barrier(conf, start_next_window, bio->bi_sector); + allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); md_wait_for_blocked_rdev(blocked_rdev, mddev); start_next_window = wait_barrier(conf, bio); /* @@@ -1348,7 -1350,7 +1349,7 @@@ bio->bi_phys_segments++; spin_unlock_irq(&conf->device_lock); } - sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector; + sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector; atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->behind_remaining, 0); @@@ -1360,7 -1362,7 +1361,7 @@@ continue; mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(mbio, r1_bio->sector - bio->bi_sector, max_sectors); + bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors); if (first_clone) { /* do behind I/O ? 
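
As the raid10 make_request() rewrite later in this merge shows, requests that cross a chunk boundary are no longer handled with the one-page bio_pair hack: the bio is cut with bio_split() and each piece is tied back to the parent with bio_chain(). The core of that loop, reduced to a sketch; boundary_sectors stands in for the real chunk-boundary calculation and submit_one() for the per-piece work (__make_request() in the raid10 patch):

static void split_and_submit(struct mddev *mddev, struct bio *bio,
			     unsigned boundary_sectors)
{
	struct bio *split;

	do {
		if (bio_sectors(bio) > boundary_sectors) {
			split = bio_split(bio, boundary_sectors,
					  GFP_NOIO, fs_bio_set);
			/* parent completes only after the split child ends */
			bio_chain(split, bio);
		} else {
			split = bio;
		}
		submit_one(mddev, split);
	} while (split != bio);
}
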
@@@ -1394,7 -1396,7 +1395,7 @@@ r1_bio->bios[i] = mbio; - mbio->bi_sector = (r1_bio->sector + + mbio->bi_iter.bi_sector = (r1_bio->sector + conf->mirrors[i].rdev->data_offset); mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; @@@ -1434,7 -1436,7 +1435,7 @@@ r1_bio->sectors = bio_sectors(bio) - sectors_handled; r1_bio->state = 0; r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_sector + sectors_handled; + r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; goto retry_write; } @@@ -1958,14 -1960,14 +1959,14 @@@ static int process_checks(struct r1bio /* fixup the bio for reuse */ bio_reset(b); b->bi_vcnt = vcnt; - b->bi_size = r1_bio->sectors << 9; - b->bi_sector = r1_bio->sector + + b->bi_iter.bi_size = r1_bio->sectors << 9; + b->bi_iter.bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; b->bi_bdev = conf->mirrors[i].rdev->bdev; b->bi_end_io = end_sync_read; b->bi_private = r1_bio; - size = b->bi_size; + size = b->bi_iter.bi_size; for (j = 0; j < vcnt ; j++) { struct bio_vec *bi; bi = &b->bi_io_vec[j]; @@@ -2220,11 -2222,11 +2221,11 @@@ static int narrow_write_error(struct r1 } wbio->bi_rw = WRITE; - wbio->bi_sector = r1_bio->sector; - wbio->bi_size = r1_bio->sectors << 9; + wbio->bi_iter.bi_sector = r1_bio->sector; + wbio->bi_iter.bi_size = r1_bio->sectors << 9; bio_trim(wbio, sector - r1_bio->sector, sectors); - wbio->bi_sector += rdev->data_offset; + wbio->bi_iter.bi_sector += rdev->data_offset; wbio->bi_bdev = rdev->bdev; if (submit_bio_wait(WRITE, wbio) == 0) /* failure! */ @@@ -2338,7 -2340,8 +2339,8 @@@ read_more } r1_bio->read_disk = disk; bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); - bio_trim(bio, r1_bio->sector - bio->bi_sector, max_sectors); + bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector, + max_sectors); r1_bio->bios[r1_bio->read_disk] = bio; rdev = conf->mirrors[disk].rdev; printk_ratelimited(KERN_ERR @@@ -2347,7 -2350,7 +2349,7 @@@ mdname(mddev), (unsigned long long)r1_bio->sector, bdevname(rdev->bdev, b)); - bio->bi_sector = r1_bio->sector + rdev->data_offset; + bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset; bio->bi_bdev = rdev->bdev; bio->bi_end_io = raid1_end_read_request; bio->bi_rw = READ | do_sync; @@@ -2356,7 -2359,7 +2358,7 @@@ /* Drat - have to split this up more */ struct bio *mbio = r1_bio->master_bio; int sectors_handled = (r1_bio->sector + max_sectors - - mbio->bi_sector); + - mbio->bi_iter.bi_sector); r1_bio->sectors = max_sectors; spin_lock_irq(&conf->device_lock); if (mbio->bi_phys_segments == 0) @@@ -2374,7 -2377,8 +2376,8 @@@ r1_bio->state = 0; set_bit(R1BIO_ReadError, &r1_bio->state); r1_bio->mddev = mddev; - r1_bio->sector = mbio->bi_sector + sectors_handled; + r1_bio->sector = mbio->bi_iter.bi_sector + + sectors_handled; goto read_more; } else @@@ -2598,7 -2602,7 +2601,7 @@@ static sector_t sync_request(struct mdd } if (bio->bi_end_io) { atomic_inc(&rdev->nr_pending); - bio->bi_sector = sector_nr + rdev->data_offset; + bio->bi_iter.bi_sector = sector_nr + rdev->data_offset; bio->bi_bdev = rdev->bdev; bio->bi_private = r1_bio; } @@@ -2698,7 -2702,7 +2701,7 @@@ continue; /* remove last page from this bio */ bio->bi_vcnt--; - bio->bi_size -= len; + bio->bi_iter.bi_size -= len; bio->bi_flags &= ~(1<< BIO_SEG_VALID); } goto bio_full; diff --combined drivers/md/raid10.c index 8d39d63281b,6d43d88657a..33fc408e5ea --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@@ -1152,14 -1152,12 +1152,12 @@@ static void raid10_unplug(struct blk_pl kfree(plug); } - static void 
make_request(struct mddev *mddev, struct bio * bio) + static void __make_request(struct mddev *mddev, struct bio *bio) { struct r10conf *conf = mddev->private; struct r10bio *r10_bio; struct bio *read_bio; int i; - sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); - int chunk_sects = chunk_mask + 1; const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_fua = (bio->bi_rw & REQ_FUA); @@@ -1174,88 -1172,27 +1172,27 @@@ int max_sectors; int sectors; - if (unlikely(bio->bi_rw & REQ_FLUSH)) { - md_flush_request(mddev, bio); - return; - } - - /* If this request crosses a chunk boundary, we need to - * split it. This will only happen for 1 PAGE (or less) requests. - */ - if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio) - > chunk_sects - && (conf->geo.near_copies < conf->geo.raid_disks - || conf->prev.near_copies < conf->prev.raid_disks))) { - struct bio_pair *bp; - /* Sanity check -- queue functions should prevent this happening */ - if (bio_segments(bio) > 1) - goto bad_map; - /* This is a one page bio that upper layers - * refuse to split for us, so we need to split it. - */ - bp = bio_split(bio, - chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); - - /* Each of these 'make_request' calls will call 'wait_barrier'. - * If the first succeeds but the second blocks due to the resync - * thread raising the barrier, we will deadlock because the - * IO to the underlying device will be queued in generic_make_request - * and will never complete, so will never reduce nr_pending. - * So increment nr_waiting here so no new raise_barriers will - * succeed, and so the second wait_barrier cannot block. - */ - spin_lock_irq(&conf->resync_lock); - conf->nr_waiting++; - spin_unlock_irq(&conf->resync_lock); - - make_request(mddev, &bp->bio1); - make_request(mddev, &bp->bio2); - - spin_lock_irq(&conf->resync_lock); - conf->nr_waiting--; - wake_up(&conf->wait_barrier); - spin_unlock_irq(&conf->resync_lock); - - bio_pair_release(bp); - return; - bad_map: - printk("md/raid10:%s: make_request bug: can't convert block across chunks" - " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2, - (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2); - - bio_io_error(bio); - return; - } - - md_write_start(mddev, bio); - - /* - * Register the new request and wait if the reconstruction - * thread has put up a bar for new requests. - * Continue immediately if no resync is active currently. - */ - wait_barrier(conf); - sectors = bio_sectors(bio); while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - bio->bi_sector < conf->reshape_progress && - bio->bi_sector + sectors > conf->reshape_progress) { + bio->bi_iter.bi_sector < conf->reshape_progress && + bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { /* IO spans the reshape position. Need to wait for * reshape to pass */ allow_barrier(conf); wait_event(conf->wait_barrier, - conf->reshape_progress <= bio->bi_sector || - conf->reshape_progress >= bio->bi_sector + sectors); + conf->reshape_progress <= bio->bi_iter.bi_sector || + conf->reshape_progress >= bio->bi_iter.bi_sector + + sectors); wait_barrier(conf); } if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && bio_data_dir(bio) == WRITE && (mddev->reshape_backwards - ? (bio->bi_sector < conf->reshape_safe && - bio->bi_sector + sectors > conf->reshape_progress) - : (bio->bi_sector + sectors > conf->reshape_safe && - bio->bi_sector < conf->reshape_progress))) { + ? 
(bio->bi_iter.bi_sector < conf->reshape_safe && + bio->bi_iter.bi_sector + sectors > conf->reshape_progress) + : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe && + bio->bi_iter.bi_sector < conf->reshape_progress))) { /* Need to update reshape_position in metadata */ mddev->reshape_position = conf->reshape_progress; set_bit(MD_CHANGE_DEVS, &mddev->flags); @@@ -1273,7 -1210,7 +1210,7 @@@ r10_bio->sectors = sectors; r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_sector; + r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; /* We might need to issue multiple reads to different @@@ -1302,13 -1239,13 +1239,13 @@@ read_again slot = r10_bio->read_slot; read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(read_bio, r10_bio->sector - bio->bi_sector, + bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[slot].bio = read_bio; r10_bio->devs[slot].rdev = rdev; - read_bio->bi_sector = r10_bio->devs[slot].addr + + read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + choose_data_offset(r10_bio, rdev); read_bio->bi_bdev = rdev->bdev; read_bio->bi_end_io = raid10_end_read_request; @@@ -1319,15 -1256,15 +1256,15 @@@ /* Could not read all from this device, so we will * need another r10_bio. */ - sectors_handled = (r10_bio->sectors + max_sectors + sectors_handled = (r10_bio->sector + max_sectors - - bio->bi_sector); + - bio->bi_iter.bi_sector); r10_bio->sectors = max_sectors; spin_lock_irq(&conf->device_lock); if (bio->bi_phys_segments == 0) bio->bi_phys_segments = 2; else bio->bi_phys_segments++; - spin_unlock(&conf->device_lock); + spin_unlock_irq(&conf->device_lock); /* Cannot call generic_make_request directly * as that will be queued in __generic_make_request * and subsequent mempool_alloc might block @@@ -1341,7 -1278,8 +1278,8 @@@ r10_bio->sectors = bio_sectors(bio) - sectors_handled; r10_bio->state = 0; r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_sector + sectors_handled; + r10_bio->sector = bio->bi_iter.bi_sector + + sectors_handled; goto read_again; } else generic_make_request(read_bio); @@@ -1499,7 -1437,8 +1437,8 @@@ retry_write bio->bi_phys_segments++; spin_unlock_irq(&conf->device_lock); } - sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector; + sectors_handled = r10_bio->sector + max_sectors - + bio->bi_iter.bi_sector; atomic_set(&r10_bio->remaining, 1); bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); @@@ -1510,11 -1449,11 +1449,11 @@@ if (r10_bio->devs[i].bio) { struct md_rdev *rdev = conf->mirrors[d].rdev; mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(mbio, r10_bio->sector - bio->bi_sector, + bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[i].bio = mbio; - mbio->bi_sector = (r10_bio->devs[i].addr+ + mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+ choose_data_offset(r10_bio, rdev)); mbio->bi_bdev = rdev->bdev; @@@ -1553,11 -1492,11 +1492,11 @@@ rdev = conf->mirrors[d].rdev; } mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(mbio, r10_bio->sector - bio->bi_sector, + bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[i].repl_bio = mbio; - mbio->bi_sector = (r10_bio->devs[i].addr + + mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr + choose_data_offset( r10_bio, rdev)); mbio->bi_bdev = rdev->bdev; @@@ -1591,11 -1530,57 +1530,57 @@@ r10_bio->sectors = bio_sectors(bio) - sectors_handled; r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_sector + sectors_handled; + r10_bio->sector = 
bio->bi_iter.bi_sector + sectors_handled; r10_bio->state = 0; goto retry_write; } one_write_done(r10_bio); + } + + static void make_request(struct mddev *mddev, struct bio *bio) + { + struct r10conf *conf = mddev->private; + sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); + int chunk_sects = chunk_mask + 1; + + struct bio *split; + + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); + return; + } + + md_write_start(mddev, bio); + + /* + * Register the new request and wait if the reconstruction + * thread has put up a bar for new requests. + * Continue immediately if no resync is active currently. + */ + wait_barrier(conf); + + do { + + /* + * If this request crosses a chunk boundary, we need to split + * it. + */ + if (unlikely((bio->bi_iter.bi_sector & chunk_mask) + + bio_sectors(bio) > chunk_sects + && (conf->geo.near_copies < conf->geo.raid_disks + || conf->prev.near_copies < + conf->prev.raid_disks))) { + split = bio_split(bio, chunk_sects - + (bio->bi_iter.bi_sector & + (chunk_sects - 1)), + GFP_NOIO, fs_bio_set); + bio_chain(split, bio); + } else { + split = bio; + } + + __make_request(mddev, split); + } while (split != bio); /* In case raid10d snuck in to freeze_array */ wake_up(&conf->wait_barrier); @@@ -2124,10 -2109,10 +2109,10 @@@ static void sync_request_write(struct m bio_reset(tbio); tbio->bi_vcnt = vcnt; - tbio->bi_size = r10_bio->sectors << 9; + tbio->bi_iter.bi_size = r10_bio->sectors << 9; tbio->bi_rw = WRITE; tbio->bi_private = r10_bio; - tbio->bi_sector = r10_bio->devs[i].addr; + tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; for (j=0; j < vcnt ; j++) { tbio->bi_io_vec[j].bv_offset = 0; @@@ -2144,7 -2129,7 +2129,7 @@@ atomic_inc(&r10_bio->remaining); md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); - tbio->bi_sector += conf->mirrors[d].rdev->data_offset; + tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; tbio->bi_bdev = conf->mirrors[d].rdev->bdev; generic_make_request(tbio); } @@@ -2614,8 -2599,8 +2599,8 @@@ static int narrow_write_error(struct r1 sectors = sect_to_write; /* Write at 'sector' for 'sectors' */ wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(wbio, sector - bio->bi_sector, sectors); - wbio->bi_sector = (r10_bio->devs[i].addr+ + bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); + wbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+ choose_data_offset(r10_bio, rdev) + (sector - r10_bio->sector)); wbio->bi_bdev = rdev->bdev; @@@ -2687,10 -2672,10 +2672,10 @@@ read_more (unsigned long long)r10_bio->sector); bio = bio_clone_mddev(r10_bio->master_bio, GFP_NOIO, mddev); - bio_trim(bio, r10_bio->sector - bio->bi_sector, max_sectors); + bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[slot].bio = bio; r10_bio->devs[slot].rdev = rdev; - bio->bi_sector = r10_bio->devs[slot].addr + bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + choose_data_offset(r10_bio, rdev); bio->bi_bdev = rdev->bdev; bio->bi_rw = READ | do_sync; @@@ -2701,7 -2686,7 +2686,7 @@@ struct bio *mbio = r10_bio->master_bio; int sectors_handled = r10_bio->sector + max_sectors - - mbio->bi_sector; + - mbio->bi_iter.bi_sector; r10_bio->sectors = max_sectors; spin_lock_irq(&conf->device_lock); if (mbio->bi_phys_segments == 0) @@@ -2719,7 -2704,7 +2704,7 @@@ set_bit(R10BIO_ReadError, &r10_bio->state); r10_bio->mddev = mddev; - r10_bio->sector = mbio->bi_sector + r10_bio->sector = mbio->bi_iter.bi_sector + sectors_handled; goto read_more; @@@ -3157,7 -3142,8 +3142,8 @@@ static 
sector_t sync_request(struct mdd bio->bi_end_io = end_sync_read; bio->bi_rw = READ; from_addr = r10_bio->devs[j].addr; - bio->bi_sector = from_addr + rdev->data_offset; + bio->bi_iter.bi_sector = from_addr + + rdev->data_offset; bio->bi_bdev = rdev->bdev; atomic_inc(&rdev->nr_pending); /* and we write to 'i' (if not in_sync) */ @@@ -3181,7 -3167,7 +3167,7 @@@ bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio->bi_rw = WRITE; - bio->bi_sector = to_addr + bio->bi_iter.bi_sector = to_addr + rdev->data_offset; bio->bi_bdev = rdev->bdev; atomic_inc(&r10_bio->remaining); @@@ -3210,7 -3196,8 +3196,8 @@@ bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio->bi_rw = WRITE; - bio->bi_sector = to_addr + rdev->data_offset; + bio->bi_iter.bi_sector = to_addr + + rdev->data_offset; bio->bi_bdev = rdev->bdev; atomic_inc(&r10_bio->remaining); break; @@@ -3218,6 -3205,10 +3205,6 @@@ if (j == conf->copies) { /* Cannot recover, so abort the recovery or * record a bad block */ - put_buf(r10_bio); - if (rb2) - atomic_dec(&rb2->remaining); - r10_bio = rb2; if (any_working) { /* problem is that there are bad blocks * on other device(s) @@@ -3249,10 -3240,6 +3236,10 @@@ mirror->recovery_disabled = mddev->recovery_disabled; } + put_buf(r10_bio); + if (rb2) + atomic_dec(&rb2->remaining); + r10_bio = rb2; break; } } @@@ -3328,7 -3315,7 +3315,7 @@@ bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; bio->bi_rw = READ; - bio->bi_sector = sector + + bio->bi_iter.bi_sector = sector + conf->mirrors[d].rdev->data_offset; bio->bi_bdev = conf->mirrors[d].rdev->bdev; count++; @@@ -3350,7 -3337,7 +3337,7 @@@ bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio->bi_rw = WRITE; - bio->bi_sector = sector + + bio->bi_iter.bi_sector = sector + conf->mirrors[d].replacement->data_offset; bio->bi_bdev = conf->mirrors[d].replacement->bdev; count++; @@@ -3397,7 -3384,7 +3384,7 @@@ bio2 = bio2->bi_next) { /* remove last page from this bio */ bio2->bi_vcnt--; - bio2->bi_size -= len; + bio2->bi_iter.bi_size -= len; bio2->bi_flags &= ~(1<< BIO_SEG_VALID); } goto bio_full; @@@ -3747,8 -3734,7 +3734,8 @@@ static int run(struct mddev *mddev !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; mddev->degraded++; - if (disk->rdev) + if (disk->rdev && + disk->rdev->saved_raid_disk < 0) conf->fullsync = 1; } disk->recovery_disabled = mddev->recovery_disabled - 1; @@@ -4418,7 -4404,7 +4405,7 @@@ read_more read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); read_bio->bi_bdev = rdev->bdev; - read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr + read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr + rdev->data_offset); read_bio->bi_private = r10_bio; read_bio->bi_end_io = end_sync_read; @@@ -4426,7 -4412,7 +4413,7 @@@ read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); read_bio->bi_flags |= 1 << BIO_UPTODATE; read_bio->bi_vcnt = 0; - read_bio->bi_size = 0; + read_bio->bi_iter.bi_size = 0; r10_bio->master_bio = read_bio; r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; @@@ -4452,7 -4438,8 +4439,8 @@@ bio_reset(b); b->bi_bdev = rdev2->bdev; - b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; + b->bi_iter.bi_sector = r10_bio->devs[s/2].addr + + rdev2->new_data_offset; b->bi_private = r10_bio; b->bi_end_io = end_reshape_write; b->bi_rw = WRITE; @@@ -4479,7 -4466,7 +4467,7 @@@ bio2 = bio2->bi_next) { /* Remove last page from this bio */ bio2->bi_vcnt--; - bio2->bi_size -= len; + bio2->bi_iter.bi_size -= len; bio2->bi_flags &= ~(1<bi_sector + 
sectors < sector + STRIPE_SECTORS) + if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS) return bio->bi_next; else return NULL; @@@ -225,7 -225,7 +225,7 @@@ static void return_io(struct bio *retur return_bi = bi->bi_next; bi->bi_next = NULL; - bi->bi_size = 0; + bi->bi_iter.bi_size = 0; trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), bi, 0); bio_endio(bi, 0); @@@ -675,10 -675,8 +675,10 @@@ get_active_stripe(struct r5conf *conf, || !conf->inactive_blocked), *(conf->hash_locks + hash)); conf->inactive_blocked = 0; - } else + } else { init_stripe(sh, sector, previous); + atomic_inc(&sh->count); + } } else { spin_lock(&conf->device_lock); if (atomic_read(&sh->count)) { @@@ -689,19 -687,20 +689,19 @@@ } else { if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); - BUG_ON(list_empty(&sh->lru)); + BUG_ON(list_empty(&sh->lru) && + !test_bit(STRIPE_EXPANDING, &sh->state)); list_del_init(&sh->lru); if (sh->group) { sh->group->stripes_cnt--; sh->group = NULL; } } + atomic_inc(&sh->count); spin_unlock(&conf->device_lock); } } while (sh == NULL); - if (sh) - atomic_inc(&sh->count); - spin_unlock_irq(conf->hash_locks + hash); return sh; } @@@ -852,10 -851,10 +852,10 @@@ static void ops_run_io(struct stripe_he bi->bi_rw, i); atomic_inc(&sh->count); if (use_new_offset(conf, sh)) - bi->bi_sector = (sh->sector + bi->bi_iter.bi_sector = (sh->sector + rdev->new_data_offset); else - bi->bi_sector = (sh->sector + bi->bi_iter.bi_sector = (sh->sector + rdev->data_offset); if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) bi->bi_rw |= REQ_NOMERGE; @@@ -863,7 -862,7 +863,7 @@@ bi->bi_vcnt = 1; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; - bi->bi_size = STRIPE_SIZE; + bi->bi_iter.bi_size = STRIPE_SIZE; /* * If this is discard request, set bi_vcnt 0. We don't * want to confuse SCSI because SCSI will replace payload @@@ -899,15 -898,15 +899,15 @@@ rbi->bi_rw, i); atomic_inc(&sh->count); if (use_new_offset(conf, sh)) - rbi->bi_sector = (sh->sector + rbi->bi_iter.bi_sector = (sh->sector + rrdev->new_data_offset); else - rbi->bi_sector = (sh->sector + rbi->bi_iter.bi_sector = (sh->sector + rrdev->data_offset); rbi->bi_vcnt = 1; rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; rbi->bi_io_vec[0].bv_offset = 0; - rbi->bi_size = STRIPE_SIZE; + rbi->bi_iter.bi_size = STRIPE_SIZE; /* * If this is discard request, set bi_vcnt 0. 
We don't * want to confuse SCSI because SCSI will replace payload @@@ -935,24 -934,24 +935,24 @@@ static struct dma_async_tx_descriptor async_copy_data(int frombio, struct bio *bio, struct page *page, sector_t sector, struct dma_async_tx_descriptor *tx) { - struct bio_vec *bvl; + struct bio_vec bvl; + struct bvec_iter iter; struct page *bio_page; - int i; int page_offset; struct async_submit_ctl submit; enum async_tx_flags flags = 0; - if (bio->bi_sector >= sector) - page_offset = (signed)(bio->bi_sector - sector) * 512; + if (bio->bi_iter.bi_sector >= sector) + page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; else - page_offset = (signed)(sector - bio->bi_sector) * -512; + page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512; if (frombio) flags |= ASYNC_TX_FENCE; init_async_submit(&submit, flags, tx, NULL, NULL, NULL); - bio_for_each_segment(bvl, bio, i) { - int len = bvl->bv_len; + bio_for_each_segment(bvl, bio, iter) { + int len = bvl.bv_len; int clen; int b_offset = 0; @@@ -968,8 -967,8 +968,8 @@@ clen = len; if (clen > 0) { - b_offset += bvl->bv_offset; - bio_page = bvl->bv_page; + b_offset += bvl.bv_offset; + bio_page = bvl.bv_page; if (frombio) tx = async_memcpy(page, bio_page, page_offset, b_offset, clen, &submit); @@@ -1012,7 -1011,7 +1012,7 @@@ static void ops_complete_biofill(void * BUG_ON(!dev->read); rbi = dev->read; dev->read = NULL; - while (rbi && rbi->bi_sector < + while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); if (!raid5_dec_bi_active_stripes(rbi)) { @@@ -1048,7 -1047,7 +1048,7 @@@ static void ops_run_biofill(struct stri dev->read = rbi = dev->toread; dev->toread = NULL; spin_unlock_irq(&sh->stripe_lock); - while (rbi && rbi->bi_sector < + while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { tx = async_copy_data(0, rbi, dev->page, dev->sector, tx); @@@ -1390,7 -1389,7 +1390,7 @@@ ops_run_biodrain(struct stripe_head *sh wbi = dev->written = chosen; spin_unlock_irq(&sh->stripe_lock); - while (wbi && wbi->bi_sector < + while (wbi && wbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { if (wbi->bi_rw & REQ_FUA) set_bit(R5_WantFUA, &dev->flags); @@@ -2111,7 -2110,6 +2111,7 @@@ static void raid5_end_write_request(str set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); } else { if (!uptodate) { + set_bit(STRIPE_DEGRADED, &sh->state); set_bit(WriteErrorSeen, &rdev->flags); set_bit(R5_WriteError, &sh->dev[i].flags); if (!test_and_set_bit(WantReplacement, &rdev->flags)) @@@ -2615,7 -2613,7 +2615,7 @@@ static int add_stripe_bio(struct stripe int firstwrite=0; pr_debug("adding bi b#%llu to stripe s#%llu\n", - (unsigned long long)bi->bi_sector, + (unsigned long long)bi->bi_iter.bi_sector, (unsigned long long)sh->sector); /* @@@ -2633,12 -2631,12 +2633,12 @@@ firstwrite = 1; } else bip = &sh->dev[dd_idx].toread; - while (*bip && (*bip)->bi_sector < bi->bi_sector) { - if (bio_end_sector(*bip) > bi->bi_sector) + while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { + if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) goto overlap; bip = & (*bip)->bi_next; } - if (*bip && (*bip)->bi_sector < bio_end_sector(bi)) + if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) goto overlap; BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); @@@ -2652,7 -2650,7 +2652,7 @@@ sector_t sector = sh->dev[dd_idx].sector; for (bi=sh->dev[dd_idx].towrite; sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && - bi && bi->bi_sector <= sector; + bi && bi->bi_iter.bi_sector <= sector; bi = 
r5_next_bio(bi, sh->dev[dd_idx].sector)) { if (bio_end_sector(bi) >= sector) sector = bio_end_sector(bi); @@@ -2662,7 -2660,7 +2662,7 @@@ } pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", - (unsigned long long)(*bip)->bi_sector, + (unsigned long long)(*bip)->bi_iter.bi_sector, (unsigned long long)sh->sector, dd_idx); spin_unlock_irq(&sh->stripe_lock); @@@ -2737,7 -2735,7 +2737,7 @@@ handle_failed_stripe(struct r5conf *con if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); - while (bi && bi->bi_sector < + while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); @@@ -2756,7 -2754,7 +2756,7 @@@ bi = sh->dev[i].written; sh->dev[i].written = NULL; if (bi) bitmap_end = 1; - while (bi && bi->bi_sector < + while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); @@@ -2780,7 -2778,7 +2780,7 @@@ spin_unlock_irq(&sh->stripe_lock); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); - while (bi && bi->bi_sector < + while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); @@@ -3004,7 -3002,7 +3004,7 @@@ static void handle_stripe_clean_event(s clear_bit(R5_UPTODATE, &dev->flags); wbi = dev->written; dev->written = NULL; - while (wbi && wbi->bi_sector < + while (wbi && wbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); if (!raid5_dec_bi_active_stripes(wbi)) { @@@ -3610,7 -3608,7 +3610,7 @@@ static void analyse_stripe(struct strip */ set_bit(R5_Insync, &dev->flags); - if (rdev && test_bit(R5_WriteError, &dev->flags)) { + if (test_bit(R5_WriteError, &dev->flags)) { /* This flag does not apply to '.replacement' * only to .rdev, so make sure to check that*/ struct md_rdev *rdev2 = rcu_dereference( @@@ -3623,7 -3621,7 +3623,7 @@@ } else clear_bit(R5_WriteError, &dev->flags); } - if (rdev && test_bit(R5_MadeGood, &dev->flags)) { + if (test_bit(R5_MadeGood, &dev->flags)) { /* This flag does not apply to '.replacement' * only to .rdev, so make sure to check that*/ struct md_rdev *rdev2 = rcu_dereference( @@@ -4096,7 -4094,7 +4096,7 @@@ static int raid5_mergeable_bvec(struct static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) { - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bio_sectors(bio); @@@ -4233,9 -4231,9 +4233,9 @@@ static int chunk_aligned_read(struct md /* * compute position */ - align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, - 0, - &dd_idx, NULL); + align_bi->bi_iter.bi_sector = + raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, + 0, &dd_idx, NULL); end_sector = bio_end_sector(align_bi); rcu_read_lock(); @@@ -4260,7 -4258,8 +4260,8 @@@ align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); if (!bio_fits_rdev(align_bi) || - is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi), + is_badblock(rdev, align_bi->bi_iter.bi_sector, + bio_sectors(align_bi), &first_bad, &bad_sectors)) { /* too big in some way, or has a known bad block */ bio_put(align_bi); @@@ -4269,7 -4268,7 +4270,7 @@@ } /* No reshape active, so we can trust rdev->data_offset */ - align_bi->bi_sector += 
rdev->data_offset; + align_bi->bi_iter.bi_sector += rdev->data_offset; spin_lock_irq(&conf->device_lock); wait_event_lock_irq(conf->wait_for_stripe, @@@ -4281,7 -4280,7 +4282,7 @@@ if (mddev->gendisk) trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), align_bi, disk_devt(mddev->gendisk), - raid_bio->bi_sector); + raid_bio->bi_iter.bi_sector); generic_make_request(align_bi); return 1; } else { @@@ -4464,8 -4463,8 +4465,8 @@@ static void make_discard_request(struc /* Skip discard while reshape is happening */ return; - logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); - last_sector = bi->bi_sector + (bi->bi_size>>9); + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); + last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ @@@ -4569,7 -4568,7 +4570,7 @@@ static void make_request(struct mddev * return; } - logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bio_end_sector(bi); bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ @@@ -5053,7 -5052,8 +5054,8 @@@ static int retry_aligned_read(struct r int remaining; int handled = 0; - logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); + logical_sector = raid_bio->bi_iter.bi_sector & + ~((sector_t)STRIPE_SECTORS-1); sector = raid5_compute_sector(conf, logical_sector, 0, &dd_idx, NULL); last_sector = bio_end_sector(raid_bio); diff --combined drivers/s390/block/xpram.c index 58141f0651f,3e530f9da8c..6969d39f1e2 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@@ -184,25 -184,26 +184,26 @@@ static unsigned long xpram_highest_page static void xpram_make_request(struct request_queue *q, struct bio *bio) { xpram_device_t *xdev = bio->bi_bdev->bd_disk->private_data; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; unsigned int index; unsigned long page_addr; unsigned long bytes; - int i; - if ((bio->bi_sector & 7) != 0 || (bio->bi_size & 4095) != 0) + if ((bio->bi_iter.bi_sector & 7) != 0 || + (bio->bi_iter.bi_size & 4095) != 0) /* Request is not page-aligned. */ goto fail; - if ((bio->bi_size >> 12) > xdev->size) + if ((bio->bi_iter.bi_size >> 12) > xdev->size) /* Request size is no page-aligned. */ goto fail; - if ((bio->bi_sector >> 3) > 0xffffffffU - xdev->offset) + if ((bio->bi_iter.bi_sector >> 3) > 0xffffffffU - xdev->offset) goto fail; - index = (bio->bi_sector >> 3) + xdev->offset; - bio_for_each_segment(bvec, bio, i) { + index = (bio->bi_iter.bi_sector >> 3) + xdev->offset; + bio_for_each_segment(bvec, bio, iter) { page_addr = (unsigned long) - kmap(bvec->bv_page) + bvec->bv_offset; - bytes = bvec->bv_len; + kmap(bvec.bv_page) + bvec.bv_offset; + bytes = bvec.bv_len; if ((page_addr & 4095) != 0 || (bytes & 4095) != 0) /* More paranoia. 
*/ goto fail; @@@ -257,7 -258,6 +258,7 @@@ static int __init xpram_setup_sizes(uns unsigned long mem_needed; unsigned long mem_auto; unsigned long long size; + char *sizes_end; int mem_auto_no; int i; @@@ -276,8 -276,8 +277,8 @@@ mem_auto_no = 0; for (i = 0; i < xpram_devs; i++) { if (sizes[i]) { - size = simple_strtoull(sizes[i], &sizes[i], 0); - switch (sizes[i][0]) { + size = simple_strtoull(sizes[i], &sizes_end, 0); + switch (*sizes_end) { case 'g': case 'G': size <<= 20; diff --combined drivers/scsi/sd.c index 9846c6ab2aa,5c8a3b696a1..470954aba72 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@@ -110,7 -110,7 +110,7 @@@ static int sd_suspend_runtime(struct de static int sd_resume(struct device *); static void sd_rescan(struct device *); static int sd_done(struct scsi_cmnd *); -static int sd_eh_action(struct scsi_cmnd *, unsigned char *, int, int); +static int sd_eh_action(struct scsi_cmnd *, int); static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer); static void scsi_disk_release(struct device *cdev); static void sd_print_sense_hdr(struct scsi_disk *, struct scsi_sense_hdr *); @@@ -801,7 -801,7 +801,7 @@@ static int sd_setup_write_same_cmnd(str if (sdkp->device->no_write_same) return BLKPREP_KILL; - BUG_ON(bio_offset(bio) || bio_iovec(bio)->bv_len != sdp->sector_size); + BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size); sector >>= ilog2(sdp->sector_size) - 9; nr_sectors >>= ilog2(sdp->sector_size) - 9; @@@ -1551,23 -1551,23 +1551,23 @@@ static const struct block_device_operat /** * sd_eh_action - error handling callback * @scmd: sd-issued command that has failed - * @eh_cmnd: The command that was sent during error handling - * @eh_cmnd_len: Length of eh_cmnd in bytes * @eh_disp: The recovery disposition suggested by the midlayer * - * This function is called by the SCSI midlayer upon completion of - * an error handling command (TEST UNIT READY, START STOP UNIT, - * etc.) The command sent to the device by the error handler is - * stored in eh_cmnd. The result of sending the eh command is - * passed in eh_disp. + * This function is called by the SCSI midlayer upon completion of an + * error test command (currently TEST UNIT READY). The result of sending + * the eh command is passed in eh_disp. We're looking for devices that + * fail medium access commands but are OK with non access commands like + * test unit ready (so wrongly see the device as having a successful + * recovery) **/ -static int sd_eh_action(struct scsi_cmnd *scmd, unsigned char *eh_cmnd, - int eh_cmnd_len, int eh_disp) +static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp) { struct scsi_disk *sdkp = scsi_disk(scmd->request->rq_disk); if (!scsi_device_online(scmd->device) || - !scsi_medium_access_command(scmd)) + !scsi_medium_access_command(scmd) || + host_byte(scmd->result) != DID_TIME_OUT || + eh_disp != SUCCESS) return eh_disp; /* @@@ -1577,7 -1577,9 +1577,7 @@@ * process of recovering or has it suffered an internal failure * that prevents access to the storage medium. */ - if (host_byte(scmd->result) == DID_TIME_OUT && eh_disp == SUCCESS && - eh_cmnd_len && eh_cmnd[0] == TEST_UNIT_READY) - sdkp->medium_access_timed_out++; + sdkp->medium_access_timed_out++; /* * If the device keeps failing read/write commands but TEST UNIT @@@ -1626,7 -1628,7 +1626,7 @@@ static unsigned int sd_completed_bytes( end_lba <<= 1; } else { /* be careful ... 
don't want any overflows */ - u64 factor = scmd->device->sector_size / 512; + unsigned int factor = scmd->device->sector_size / 512; do_div(start_lba, factor); do_div(end_lba, factor); } diff --combined drivers/staging/lustre/lustre/llite/lloop.c index 5338e8d4c50,581ff78be1a..0718905adeb --- a/drivers/staging/lustre/lustre/llite/lloop.c +++ b/drivers/staging/lustre/lustre/llite/lloop.c @@@ -194,10 -194,10 +194,10 @@@ static int do_bio_lustrebacked(struct l struct cl_object *obj = ll_i2info(inode)->lli_clob; pgoff_t offset; int ret; - int i; int rw; obd_count page_count = 0; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; struct bio *bio; ssize_t bytes; @@@ -220,15 -220,15 +220,15 @@@ for (bio = head; bio != NULL; bio = bio->bi_next) { LASSERT(rw == bio->bi_rw); - offset = (pgoff_t)(bio->bi_sector << 9) + lo->lo_offset; - bio_for_each_segment(bvec, bio, i) { - BUG_ON(bvec->bv_offset != 0); - BUG_ON(bvec->bv_len != PAGE_CACHE_SIZE); + offset = (pgoff_t)(bio->bi_iter.bi_sector << 9) + lo->lo_offset; + bio_for_each_segment(bvec, bio, iter) { + BUG_ON(bvec.bv_offset != 0); + BUG_ON(bvec.bv_len != PAGE_CACHE_SIZE); - pages[page_count] = bvec->bv_page; + pages[page_count] = bvec.bv_page; offsets[page_count] = offset; page_count++; - offset += bvec->bv_len; + offset += bvec.bv_len; } LASSERT(page_count <= LLOOP_MAX_SEGMENTS); } @@@ -313,7 -313,8 +313,8 @@@ static unsigned int loop_get_bio(struc bio = &lo->lo_bio; while (*bio && (*bio)->bi_rw == rw) { CDEBUG(D_INFO, "bio sector %llu size %u count %u vcnt%u \n", - (unsigned long long)(*bio)->bi_sector, (*bio)->bi_size, + (unsigned long long)(*bio)->bi_iter.bi_sector, + (*bio)->bi_iter.bi_size, page_count, (*bio)->bi_vcnt); if (page_count + (*bio)->bi_vcnt > LLOOP_MAX_SEGMENTS) break; @@@ -347,7 -348,8 +348,8 @@@ static void loop_make_request(struct re goto err; CDEBUG(D_INFO, "submit bio sector %llu size %u\n", - (unsigned long long)old_bio->bi_sector, old_bio->bi_size); + (unsigned long long)old_bio->bi_iter.bi_sector, + old_bio->bi_iter.bi_size); spin_lock_irq(&lo->lo_lock); inactive = (lo->lo_state != LLOOP_BOUND); @@@ -367,7 -369,7 +369,7 @@@ loop_add_bio(lo, old_bio); return; err: - cfs_bio_io_error(old_bio, old_bio->bi_size); + cfs_bio_io_error(old_bio, old_bio->bi_iter.bi_size); } @@@ -378,7 -380,7 +380,7 @@@ static inline void loop_handle_bio(stru while (bio) { struct bio *tmp = bio->bi_next; bio->bi_next = NULL; - cfs_bio_endio(bio, bio->bi_size, ret); + cfs_bio_endio(bio, bio->bi_iter.bi_size, ret); bio = tmp; } } @@@ -856,8 -858,7 +858,8 @@@ static void lloop_exit(void module_init(lloop_init); module_exit(lloop_exit); -CFS_MODULE_PARM(max_loop, "i", int, 0444, "maximum of lloop_device"); +module_param(max_loop, int, 0444); +MODULE_PARM_DESC(max_loop, "maximum of lloop_device"); MODULE_AUTHOR("Sun Microsystems, Inc. 
"); MODULE_DESCRIPTION("Lustre virtual block device"); MODULE_LICENSE("GPL"); diff --combined fs/btrfs/inode.c index 514b291b135,7ab0e94ad49..d546d8c3038 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@@ -1577,7 -1577,7 +1577,7 @@@ int btrfs_merge_bio_hook(int rw, struc unsigned long bio_flags) { struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - u64 logical = (u64)bio->bi_sector << 9; + u64 logical = (u64)bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; int ret; @@@ -1585,7 -1585,7 +1585,7 @@@ if (bio_flags & EXTENT_BIO_COMPRESSED) return 0; - length = bio->bi_size; + length = bio->bi_iter.bi_size; map_length = length; ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, NULL, 0); @@@ -4354,12 -4354,8 +4354,12 @@@ static int btrfs_setsize(struct inode * * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ - if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) - inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); + if (newsize != oldsize) { + inode_inc_iversion(inode); + if (!(mask & (ATTR_CTIME | ATTR_MTIME))) + inode->i_ctime = inode->i_mtime = + current_fs_time(inode->i_sb); + } if (newsize > oldsize) { truncate_pagecache(inode, newsize); @@@ -4468,7 -4464,7 +4468,7 @@@ static int btrfs_setattr(struct dentry err = btrfs_dirty_inode(inode); if (!err && attr->ia_valid & ATTR_MODE) - err = btrfs_acl_chmod(inode); + err = posix_acl_chmod(inode, inode->i_mode); } return err; @@@ -6783,17 -6779,16 +6783,16 @@@ unlock_err static void btrfs_endio_direct_read(struct bio *bio, int err) { struct btrfs_dio_private *dip = bio->bi_private; - struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct bio *dio_bio; u32 *csums = (u32 *)dip->csum; - int index = 0; u64 start; + int i; start = dip->logical_offset; - do { + bio_for_each_segment_all(bvec, bio, i) { if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { struct page *page = bvec->bv_page; char *kaddr; @@@ -6809,18 -6804,16 +6808,16 @@@ local_irq_restore(flags); flush_dcache_page(bvec->bv_page); - if (csum != csums[index]) { + if (csum != csums[i]) { btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", btrfs_ino(inode), start, csum, - csums[index]); + csums[i]); err = -EIO; } } start += bvec->bv_len; - bvec++; - index++; - } while (bvec <= bvec_end); + } unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); @@@ -6901,7 -6894,8 +6898,8 @@@ static void btrfs_end_dio_bio(struct bi printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " "sector %#Lx len %u err no %d\n", btrfs_ino(dip->inode), bio->bi_rw, - (unsigned long long)bio->bi_sector, bio->bi_size, err); + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size, err); dip->errors = 1; /* @@@ -6992,7 -6986,7 +6990,7 @@@ static int btrfs_submit_direct_hook(in struct bio *bio; struct bio *orig_bio = dip->orig_bio; struct bio_vec *bvec = orig_bio->bi_io_vec; - u64 start_sector = orig_bio->bi_sector; + u64 start_sector = orig_bio->bi_iter.bi_sector; u64 file_offset = dip->logical_offset; u64 submit_len = 0; u64 map_length; @@@ -7000,7 -6994,7 +6998,7 @@@ int ret = 0; int async_submit = 0; - map_length = orig_bio->bi_size; + map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, 
NULL, 0); if (ret) { @@@ -7008,7 -7002,7 +7006,7 @@@ return -EIO; } - if (map_length >= orig_bio->bi_size) { + if (map_length >= orig_bio->bi_iter.bi_size) { bio = orig_bio; goto submit; } @@@ -7060,7 -7054,7 +7058,7 @@@ bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; - map_length = orig_bio->bi_size; + map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); @@@ -7118,7 -7112,8 +7116,8 @@@ static void btrfs_submit_direct(int rw if (!skip_sum && !write) { csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits; + sum_len = dio_bio->bi_iter.bi_size >> + inode->i_sb->s_blocksize_bits; sum_len *= csum_size; } else { sum_len = 0; @@@ -7133,8 -7128,8 +7132,8 @@@ dip->private = dio_bio->bi_private; dip->inode = inode; dip->logical_offset = file_offset; - dip->bytes = dio_bio->bi_size; - dip->disk_bytenr = (u64)dio_bio->bi_sector << 9; + dip->bytes = dio_bio->bi_iter.bi_size; + dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; io_bio->bi_private = dip; dip->errors = 0; dip->orig_bio = io_bio; @@@ -8653,14 -8648,12 +8652,14 @@@ static const struct inode_operations bt .removexattr = btrfs_removexattr, .permission = btrfs_permission, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, .permission = btrfs_permission, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; @@@ -8730,7 -8723,6 +8729,7 @@@ static const struct inode_operations bt .permission = btrfs_permission, .fiemap = btrfs_fiemap, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_special_inode_operations = { @@@ -8742,7 -8734,6 +8741,7 @@@ .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_symlink_inode_operations = { @@@ -8756,6 -8747,7 +8755,6 @@@ .getxattr = btrfs_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, - .get_acl = btrfs_get_acl, .update_time = btrfs_update_time, }; diff --combined fs/f2fs/data.c index 0ae55872350,a2c8de8ba6c..2261ccdd0b5 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@@ -24,195 -24,6 +24,188 @@@ #include "segment.h" #include +static void f2fs_read_end_io(struct bio *bio, int err) +{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; ++ struct bio_vec *bvec; ++ int i; + - do { ++ bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (unlikely(!uptodate)) { ++ if (!err) { ++ SetPageUptodate(page); ++ } else { + ClearPageUptodate(page); + SetPageError(page); - } else { - SetPageUptodate(page); + } + unlock_page(page); - } while (bvec >= bio->bi_io_vec); - ++ } + bio_put(bio); +} + +static void f2fs_write_end_io(struct bio *bio, int err) +{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct f2fs_sb_info *sbi = F2FS_SB(bvec->bv_page->mapping->host->i_sb); ++ struct f2fs_sb_info *sbi = F2FS_SB(bio->bi_io_vec->bv_page->mapping->host->i_sb); ++ struct bio_vec *bvec; ++ int i; + - do { ++ 
bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (unlikely(!uptodate)) { ++ if (unlikely(err)) { + SetPageError(page); + set_bit(AS_EIO, &page->mapping->flags); + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; + } + end_page_writeback(page); + dec_page_count(sbi, F2FS_WRITEBACK); - } while (bvec >= bio->bi_io_vec); ++ } + + if (bio->bi_private) + complete(bio->bi_private); + + if (!get_pages(sbi, F2FS_WRITEBACK) && + !list_empty(&sbi->cp_wait.task_list)) + wake_up(&sbi->cp_wait); + + bio_put(bio); +} + +/* + * Low-level block read/write IO operations. + */ +static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, + int npages, bool is_read) +{ + struct bio *bio; + + /* No failure on bio allocation */ + bio = bio_alloc(GFP_NOIO, npages); + + bio->bi_bdev = sbi->sb->s_bdev; - bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); ++ bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); + bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; + + return bio; +} + +static void __submit_merged_bio(struct f2fs_bio_info *io) +{ + struct f2fs_io_info *fio = &io->fio; + int rw; + + if (!io->bio) + return; + + rw = fio->rw; + + if (is_read_io(rw)) { + trace_f2fs_submit_read_bio(io->sbi->sb, rw, + fio->type, io->bio); + submit_bio(rw, io->bio); + } else { + trace_f2fs_submit_write_bio(io->sbi->sb, rw, + fio->type, io->bio); + /* + * META_FLUSH is only from the checkpoint procedure, and we + * should wait this metadata bio for FS consistency. + */ + if (fio->type == META_FLUSH) { + DECLARE_COMPLETION_ONSTACK(wait); + io->bio->bi_private = &wait; + submit_bio(rw, io->bio); + wait_for_completion(&wait); + } else { + submit_bio(rw, io->bio); + } + } + + io->bio = NULL; +} + +void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, + enum page_type type, int rw) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io; + + io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; + + mutex_lock(&io->io_mutex); + + /* change META to META_FLUSH in the checkpoint procedure */ + if (type >= META_FLUSH) { + io->fio.type = META_FLUSH; + io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; + } + __submit_merged_bio(io); + mutex_unlock(&io->io_mutex); +} + +/* + * Fill the locked page with data located in the block address. + * Return unlocked page. + */ +int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page, + block_t blk_addr, int rw) +{ + struct bio *bio; + + trace_f2fs_submit_page_bio(page, blk_addr, rw); + + /* Allocate a new bio */ + bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw)); + + if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { + bio_put(bio); + f2fs_put_page(page, 1); + return -EFAULT; + } + + submit_bio(rw, bio); + return 0; +} + +void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, + block_t blk_addr, struct f2fs_io_info *fio) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); + struct f2fs_bio_info *io; + bool is_read = is_read_io(fio->rw); + + io = is_read ? 
&sbi->read_io : &sbi->write_io[btype]; + + verify_block_addr(sbi, blk_addr); + + mutex_lock(&io->io_mutex); + + if (!is_read) + inc_page_count(sbi, F2FS_WRITEBACK); + + if (io->bio && (io->last_block_in_bio != blk_addr - 1 || + io->fio.rw != fio->rw)) + __submit_merged_bio(io); +alloc_new: + if (io->bio == NULL) { + int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + + io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read); + io->fio = *fio; + } + + if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + __submit_merged_bio(io); + goto alloc_new; + } + + io->last_block_in_bio = blk_addr; + + mutex_unlock(&io->io_mutex); + trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr); +} + /* * Lock ordering for the change of data block address: * ->data_page @@@ -226,7 -37,7 +219,7 @@@ static void __set_data_blkaddr(struct d struct page *node_page = dn->node_page; unsigned int ofs_in_node = dn->ofs_in_node; - f2fs_wait_on_page_writeback(node_page, NODE, false); + f2fs_wait_on_page_writeback(node_page, NODE); rn = F2FS_NODE(node_page); @@@ -240,39 -51,19 +233,39 @@@ int reserve_new_block(struct dnode_of_d { struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); - if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return -EPERM; - if (!inc_valid_block_count(sbi, dn->inode, 1)) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) return -ENOSPC; trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); __set_data_blkaddr(dn, NEW_ADDR); dn->data_blkaddr = NEW_ADDR; + mark_inode_dirty(dn->inode); sync_inode_page(dn); return 0; } +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) +{ + bool need_put = dn->inode_page ? false : true; + int err; + + /* if inode_page exists, index should be zero */ + f2fs_bug_on(!need_put && index); + + err = get_dnode_of_data(dn, index, ALLOC_NODE); + if (err) + return err; + + if (dn->data_blkaddr == NULL_ADDR) + err = reserve_new_block(dn); + if (err || need_put) + f2fs_put_dnode(dn); + return err; +} + static int check_extent_cache(struct inode *inode, pgoff_t pgofs, struct buffer_head *bh_result) { @@@ -280,9 -71,6 +273,9 @@@ pgoff_t start_fofs, end_fofs; block_t start_blkaddr; + if (is_inode_flag_set(fi, FI_NO_EXTENT)) + return 0; + read_lock(&fi->ext.ext_lock); if (fi->ext.len == 0) { read_unlock(&fi->ext.ext_lock); @@@ -321,7 -109,6 +314,7 @@@ void update_extent_cache(block_t blk_ad struct f2fs_inode_info *fi = F2FS_I(dn->inode); pgoff_t fofs, start_fofs, end_fofs; block_t start_blkaddr, end_blkaddr; + int need_update = true; f2fs_bug_on(blk_addr == NEW_ADDR); fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + @@@ -330,9 -117,6 +323,9 @@@ /* Update the page address in the parent node */ __set_data_blkaddr(dn, blk_addr); + if (is_inode_flag_set(fi, FI_NO_EXTENT)) + return; + write_lock(&fi->ext.ext_lock); start_fofs = fi->ext.fofs; @@@ -379,21 -163,14 +372,21 @@@ fofs - start_fofs + 1; fi->ext.len -= fofs - start_fofs + 1; } - goto end_update; + } else { + need_update = false; } - write_unlock(&fi->ext.ext_lock); - return; + /* Finally, if the extent is very fragmented, let's drop the cache. 
*/ + if (fi->ext.len < F2FS_MIN_EXTENT_LEN) { + fi->ext.len = 0; + set_inode_flag(fi, FI_NO_EXTENT); + need_update = true; + } end_update: write_unlock(&fi->ext.ext_lock); - sync_inode_page(dn); + if (need_update) + sync_inode_page(dn); + return; } struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) @@@ -419,7 -196,7 +412,7 @@@ return ERR_PTR(-ENOENT); /* By fallocate(), there is no cached page, but with NEW_ADDR */ - if (dn.data_blkaddr == NEW_ADDR) + if (unlikely(dn.data_blkaddr == NEW_ADDR)) return ERR_PTR(-EINVAL); page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); @@@ -431,14 -208,11 +424,14 @@@ return page; } - err = f2fs_readpage(sbi, page, dn.data_blkaddr, + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, sync ? READ_SYNC : READA); + if (err) + return ERR_PTR(err); + if (sync) { wait_on_page_locked(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 0); return ERR_PTR(-EIO); } @@@ -472,7 -246,7 +465,7 @@@ repeat } f2fs_put_dnode(&dn); - if (dn.data_blkaddr == NULL_ADDR) { + if (unlikely(dn.data_blkaddr == NULL_ADDR)) { f2fs_put_page(page, 1); return ERR_PTR(-ENOENT); } @@@ -492,16 -266,16 +485,16 @@@ return page; } - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); if (err) return ERR_PTR(err); lock_page(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } @@@ -512,12 -286,12 +505,12 @@@ * Caller ensures that this data page is never allocated. * A new zero-filled data page is allocated in the page cache. * - * Also, caller should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op(). - * Note that, npage is set only by make_empty_dir. + * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + * Note that, ipage is set only by make_empty_dir. 
*/ struct page *get_new_data_page(struct inode *inode, - struct page *npage, pgoff_t index, bool new_i_size) + struct page *ipage, pgoff_t index, bool new_i_size) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; @@@ -525,16 -299,24 +518,16 @@@ struct dnode_of_data dn; int err; - set_new_dnode(&dn, inode, npage, npage, 0); - err = get_dnode_of_data(&dn, index, ALLOC_NODE); + set_new_dnode(&dn, inode, ipage, NULL, 0); + err = f2fs_reserve_block(&dn, index); if (err) return ERR_PTR(err); - - if (dn.data_blkaddr == NULL_ADDR) { - if (reserve_new_block(&dn)) { - if (!npage) - f2fs_put_dnode(&dn); - return ERR_PTR(-ENOSPC); - } - } - if (!npage) - f2fs_put_dnode(&dn); repeat: page = grab_cache_page(mapping, index); - if (!page) - return ERR_PTR(-ENOMEM); + if (!page) { + err = -ENOMEM; + goto put_err; + } if (PageUptodate(page)) return page; @@@ -543,18 -325,15 +536,18 @@@ zero_user_segment(page, 0, PAGE_CACHE_SIZE); SetPageUptodate(page); } else { - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, + READ_SYNC); if (err) - return ERR_PTR(err); + goto put_err; + lock_page(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); - return ERR_PTR(-EIO); + err = -EIO; + goto put_err; } - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } @@@ -565,187 -344,137 +558,187 @@@ i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); /* Only the directory inode sets new_i_size */ set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - mark_inode_dirty_sync(inode); } return page; -} - -static void read_end_io(struct bio *bio, int err) -{ - struct bio_vec *bvec; - int i; - - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - if (!err) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } - bio_put(bio); +put_err: + f2fs_put_dnode(&dn); + return ERR_PTR(err); } -/* - * Fill the locked page with data located in the block address. - * Return unlocked page. 
- */ -int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, - block_t blk_addr, int type) +static int __allocate_data_block(struct dnode_of_data *dn) { - struct block_device *bdev = sbi->sb->s_bdev; - struct bio *bio; + struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_summary sum; + block_t new_blkaddr; + struct node_info ni; + int type; - trace_f2fs_readpage(page, blk_addr, type); + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + return -EPERM; + if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + return -ENOSPC; - down_read(&sbi->bio_sem); + __set_data_blkaddr(dn, NEW_ADDR); + dn->data_blkaddr = NEW_ADDR; - /* Allocate a new bio */ - bio = f2fs_bio_alloc(bdev, 1); + get_node_info(sbi, dn->nid, &ni); + set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - /* Initialize the bio */ - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); - bio->bi_end_io = read_end_io; + type = CURSEG_WARM_DATA; - if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { - bio_put(bio); - up_read(&sbi->bio_sem); - f2fs_put_page(page, 1); - return -EFAULT; - } + allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type); - submit_bio(type, bio); - up_read(&sbi->bio_sem); + /* direct IO doesn't use extent cache to maximize the performance */ + set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT); + update_extent_cache(new_blkaddr, dn); + clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT); + + dn->data_blkaddr = new_blkaddr; return 0; } /* - * This function should be used by the data read flow only where it - * does not check the "create" flag that indicates block allocation. - * The reason for this special functionality is to exploit VFS readahead - * mechanism. + * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh. + * If original data blocks are allocated, then give them to blockdev. + * Otherwise, + * a. preallocate requested block addresses + * b. do not use extent cache for better performance + * c. give the block addresses to blockdev */ -static int get_data_block_ro(struct inode *inode, sector_t iblock, +static int get_data_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); unsigned int blkbits = inode->i_sb->s_blocksize_bits; unsigned maxblocks = bh_result->b_size >> blkbits; struct dnode_of_data dn; - pgoff_t pgofs; - int err; + int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; + pgoff_t pgofs, end_offset; + int err = 0, ofs = 1; + bool allocated = false; /* Get the page offset from the block offset(iblock) */ pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits)); - if (check_extent_cache(inode, pgofs, bh_result)) { - trace_f2fs_get_data_block(inode, iblock, bh_result, 0); - return 0; - } + if (check_extent_cache(inode, pgofs, bh_result)) + goto out; + + if (create) + f2fs_lock_op(sbi); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); + err = get_dnode_of_data(&dn, pgofs, mode); if (err) { - trace_f2fs_get_data_block(inode, iblock, bh_result, err); - return (err == -ENOENT) ? 
0 : err; + if (err == -ENOENT) + err = 0; + goto unlock_out; } + if (dn.data_blkaddr == NEW_ADDR) + goto put_out; - /* It does not support data allocation */ - f2fs_bug_on(create); + if (dn.data_blkaddr != NULL_ADDR) { + map_bh(bh_result, inode->i_sb, dn.data_blkaddr); + } else if (create) { + err = __allocate_data_block(&dn); + if (err) + goto put_out; + allocated = true; + map_bh(bh_result, inode->i_sb, dn.data_blkaddr); + } else { + goto put_out; + } - if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) { - int i; - unsigned int end_offset; + end_offset = IS_INODE(dn.node_page) ? + ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; + bh_result->b_size = (((size_t)1) << blkbits); + dn.ofs_in_node++; + pgofs++; + +get_next: + if (dn.ofs_in_node >= end_offset) { + if (allocated) + sync_inode_page(&dn); + allocated = false; + f2fs_put_dnode(&dn); - end_offset = IS_INODE(dn.node_page) ? - ADDRS_PER_INODE(F2FS_I(inode)) : - ADDRS_PER_BLOCK; + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, pgofs, mode); + if (err) { + if (err == -ENOENT) + err = 0; + goto unlock_out; + } + if (dn.data_blkaddr == NEW_ADDR) + goto put_out; - clear_buffer_new(bh_result); + end_offset = IS_INODE(dn.node_page) ? + ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; + } + if (maxblocks > (bh_result->b_size >> blkbits)) { + block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (blkaddr == NULL_ADDR && create) { + err = __allocate_data_block(&dn); + if (err) + goto sync_out; + allocated = true; + blkaddr = dn.data_blkaddr; + } /* Give more consecutive addresses for the read ahead */ - for (i = 0; i < end_offset - dn.ofs_in_node; i++) - if (((datablock_addr(dn.node_page, - dn.ofs_in_node + i)) - != (dn.data_blkaddr + i)) || maxblocks == i) - break; - map_bh(bh_result, inode->i_sb, dn.data_blkaddr); - bh_result->b_size = (i << blkbits); + if (blkaddr == (bh_result->b_blocknr + ofs)) { + ofs++; + dn.ofs_in_node++; + pgofs++; + bh_result->b_size += (((size_t)1) << blkbits); + goto get_next; + } } +sync_out: + if (allocated) + sync_inode_page(&dn); +put_out: f2fs_put_dnode(&dn); - trace_f2fs_get_data_block(inode, iblock, bh_result, 0); - return 0; +unlock_out: + if (create) + f2fs_unlock_op(sbi); +out: + trace_f2fs_get_data_block(inode, iblock, bh_result, err); + return err; } static int f2fs_read_data_page(struct file *file, struct page *page) { - return mpage_readpage(page, get_data_block_ro); + struct inode *inode = page->mapping->host; + int ret; + + /* If the file has inline data, try to read it directlly */ + if (f2fs_has_inline_data(inode)) + ret = f2fs_read_inline_data(inode, page); + else + ret = mpage_readpage(page, get_data_block); + + return ret; } static int f2fs_read_data_pages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro); + struct inode *inode = file->f_mapping->host; + + /* If the file has inline data, skip readpages */ + if (f2fs_has_inline_data(inode)) + return 0; + + return mpage_readpages(mapping, pages, nr_pages, get_data_block); } -int do_write_data_page(struct page *page) +int do_write_data_page(struct page *page, struct f2fs_io_info *fio) { struct inode *inode = page->mapping->host; - block_t old_blk_addr, new_blk_addr; + block_t old_blkaddr, new_blkaddr; struct dnode_of_data dn; int err = 0; @@@ -754,10 -483,10 +747,10 @@@ if (err) return err; - old_blk_addr = dn.data_blkaddr; + old_blkaddr = dn.data_blkaddr; /* This page is 
already truncated */ - if (old_blk_addr == NULL_ADDR) + if (old_blkaddr == NULL_ADDR) goto out_writepage; set_page_writeback(page); @@@ -766,13 -495,15 +759,13 @@@ * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (unlikely(old_blk_addr != NEW_ADDR && + if (unlikely(old_blkaddr != NEW_ADDR && !is_cold_data(page) && need_inplace_update(inode))) { - rewrite_data_page(F2FS_SB(inode->i_sb), page, - old_blk_addr); + rewrite_data_page(page, old_blkaddr, fio); } else { - write_data_page(inode, page, &dn, - old_blk_addr, &new_blk_addr); - update_extent_cache(new_blk_addr, &dn); + write_data_page(page, &dn, &new_blkaddr, fio); + update_extent_cache(new_blkaddr, &dn); } out_writepage: f2fs_put_dnode(&dn); @@@ -787,13 -518,9 +780,13 @@@ static int f2fs_write_data_page(struct loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long) i_size) >> PAGE_CACHE_SHIFT; - unsigned offset; + unsigned offset = 0; bool need_balance_fs = false; int err = 0; + struct f2fs_io_info fio = { + .type = DATA, + .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + }; if (page->index < end_index) goto write; @@@ -813,7 -540,7 +806,7 @@@ zero_user_segment(page, offset, PAGE_CACHE_SIZE); write: - if (sbi->por_doing) { + if (unlikely(sbi->por_doing)) { err = AOP_WRITEPAGE_ACTIVATE; goto redirty_out; } @@@ -822,18 -549,10 +815,18 @@@ if (S_ISDIR(inode->i_mode)) { dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(inode); - err = do_write_data_page(page); + err = do_write_data_page(page, &fio); } else { f2fs_lock_op(sbi); - err = do_write_data_page(page); + + if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) { + err = f2fs_write_inline_data(inode, page, offset); + f2fs_unlock_op(sbi); + goto out; + } else { + err = do_write_data_page(page, &fio); + } + f2fs_unlock_op(sbi); need_balance_fs = true; } @@@ -842,10 -561,8 +835,10 @@@ else if (err) goto redirty_out; - if (wbc->for_reclaim) - f2fs_submit_bio(sbi, DATA, true); + if (wbc->for_reclaim) { + f2fs_submit_merged_bio(sbi, DATA, WRITE); + need_balance_fs = false; + } clear_cold_data(page); out: @@@ -897,8 -614,7 +890,8 @@@ static int f2fs_write_data_pages(struc ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); if (locked) mutex_unlock(&sbi->writepages); - f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); + + f2fs_submit_merged_bio(sbi, DATA, WRITE); remove_dirty_dir_inode(inode); @@@ -919,28 -635,27 +912,28 @@@ static int f2fs_write_begin(struct fil f2fs_balance_fs(sbi); repeat: + err = f2fs_convert_inline_data(inode, pos + len); + if (err) + return err; + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; - f2fs_lock_op(sbi); + if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA) + goto inline_data; + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, ALLOC_NODE); - if (err) - goto err; - - if (dn.data_blkaddr == NULL_ADDR) - err = reserve_new_block(&dn); - - f2fs_put_dnode(&dn); - if (err) - goto err; - + err = f2fs_reserve_block(&dn, index); f2fs_unlock_op(sbi); + if (err) { + f2fs_put_page(page, 1); + return err; + } +inline_data: if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) return 0; @@@ -956,19 -671,15 +949,19 @@@ if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); + if (f2fs_has_inline_data(inode)) + err = f2fs_read_inline_data(inode, 
page); + else + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, + READ_SYNC); if (err) return err; lock_page(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); return -EIO; } - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } @@@ -977,6 -688,11 +970,6 @@@ out SetPageUptodate(page); clear_cold_data(page); return 0; - -err: - f2fs_unlock_op(sbi); - f2fs_put_page(page, 1); - return err; } static int f2fs_write_end(struct file *file, @@@ -995,43 -711,23 +988,43 @@@ update_inode_page(inode); } - unlock_page(page); - page_cache_release(page); + f2fs_put_page(page, 1); return copied; } +static int check_direct_IO(struct inode *inode, int rw, + const struct iovec *iov, loff_t offset, unsigned long nr_segs) +{ + unsigned blocksize_mask = inode->i_sb->s_blocksize - 1; + int i; + + if (rw == READ) + return 0; + + if (offset & blocksize_mask) + return -EINVAL; + + for (i = 0; i < nr_segs; i++) + if (iov[i].iov_len & blocksize_mask) + return -EINVAL; + return 0; +} + static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - if (rw == WRITE) + /* Let buffer I/O handle the inline data case. */ + if (f2fs_has_inline_data(inode)) + return 0; + + if (check_direct_IO(inode, rw, iov, offset, nr_segs)) return 0; - /* Needs synchronization with the cleaner */ return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - get_data_block_ro); + get_data_block); } static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, @@@ -1060,8 -756,6 +1053,8 @@@ static int f2fs_set_data_page_dirty(str trace_f2fs_set_page_dirty(page, DATA); SetPageUptodate(page); + mark_inode_dirty(inode); + if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); set_dirty_dir_page(inode, page); @@@ -1072,7 -766,7 +1065,7 @@@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { - return generic_block_bmap(mapping, block, get_data_block_ro); + return generic_block_bmap(mapping, block, get_data_block); } const struct address_space_operations f2fs_dblock_aops = { diff --combined fs/gfs2/lops.c index 58f06400b7b,985da945f0b..76693793ced --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@@ -83,7 -83,6 +83,7 @@@ static void maybe_release_space(struct bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); clear_bit(GBF_FULL, &bi->bi_flags); rgd->rd_free_clone = rgd->rd_free; + rgd->rd_extfail_pt = rgd->rd_free; } /** @@@ -273,7 -272,7 +273,7 @@@ static struct bio *gfs2_log_alloc_bio(s nrvecs = max(nrvecs/2, 1U); } - bio->bi_sector = blkno * (sb->s_blocksize >> 9); + bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9); bio->bi_bdev = sb->s_bdev; bio->bi_end_io = gfs2_end_log_write; bio->bi_private = sdp; @@@ -589,12 -588,8 +589,12 @@@ static int buf_lo_scan_elements(struct static void gfs2_meta_sync(struct gfs2_glock *gl) { struct address_space *mapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; int error; + if (mapping == NULL) + mapping = &sdp->sd_aspace; + filemap_fdatawrite(mapping); error = filemap_fdatawait(mapping); diff --combined fs/gfs2/ops_fstype.c index 1e712b566d7,16194da9165..c6872d09561 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@@ -36,7 -36,6 +36,7 @@@ #include "log.h" #include "quota.h" #include "dir.h" +#include "meta_io.h" #include "trace_gfs2.h" #define DO 0 @@@ -63,7 -62,6 +63,7 @@@ static void 
gfs2_tune_init(struct gfs2_ static struct gfs2_sbd *init_sbd(struct super_block *sb) { struct gfs2_sbd *sdp; + struct address_space *mapping; sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL); if (!sdp) @@@ -99,18 -97,6 +99,18 @@@ init_waitqueue_head(&sdp->sd_quota_wait); INIT_LIST_HEAD(&sdp->sd_trunc_list); spin_lock_init(&sdp->sd_trunc_lock); + spin_lock_init(&sdp->sd_bitmap_lock); + + mapping = &sdp->sd_aspace; + + address_space_init_once(mapping); + mapping->a_ops = &gfs2_meta_aops; + mapping->host = sb->s_bdev->bd_inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->private_data = NULL; + mapping->backing_dev_info = sb->s_bdi; + mapping->writeback_index = 0; spin_lock_init(&sdp->sd_log_lock); atomic_set(&sdp->sd_log_pinned, 0); @@@ -231,14 -217,14 +231,14 @@@ static int gfs2_read_super(struct gfs2_ page = alloc_page(GFP_NOFS); if (unlikely(!page)) - return -ENOBUFS; + return -ENOMEM; ClearPageUptodate(page); ClearPageDirty(page); lock_page(page); bio = bio_alloc(GFP_NOFS, 1); - bio->bi_sector = sector * (sb->s_blocksize >> 9); + bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9); bio->bi_bdev = sb->s_bdev; bio_add_page(bio, page, PAGE_SIZE, 0); @@@ -970,6 -956,40 +970,6 @@@ fail return error; } -static int init_threads(struct gfs2_sbd *sdp, int undo) -{ - struct task_struct *p; - int error = 0; - - if (undo) - goto fail_quotad; - - p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); - if (IS_ERR(p)) { - error = PTR_ERR(p); - fs_err(sdp, "can't start logd thread: %d\n", error); - return error; - } - sdp->sd_logd_process = p; - - p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); - if (IS_ERR(p)) { - error = PTR_ERR(p); - fs_err(sdp, "can't start quotad thread: %d\n", error); - goto fail; - } - sdp->sd_quotad_process = p; - - return 0; - - -fail_quotad: - kthread_stop(sdp->sd_quotad_process); -fail: - kthread_stop(sdp->sd_logd_process); - return error; -} - static const match_table_t nolock_tokens = { { Opt_jid, "jid=%d\n", }, { Opt_err, NULL }, @@@ -1234,11 -1254,15 +1234,11 @@@ static int fill_super(struct super_bloc goto fail_per_node; } - error = init_threads(sdp, DO); - if (error) - goto fail_per_node; - if (!(sb->s_flags & MS_RDONLY)) { error = gfs2_make_fs_rw(sdp); if (error) { fs_err(sdp, "can't make FS RW: %d\n", error); - goto fail_threads; + goto fail_per_node; } } @@@ -1246,6 -1270,8 +1246,6 @@@ gfs2_online_uevent(sdp); return 0; -fail_threads: - init_threads(sdp, UNDO); fail_per_node: init_per_node(sdp, UNDO); fail_inodes: @@@ -1340,18 -1366,8 +1340,18 @@@ static struct dentry *gfs2_mount(struc if (IS_ERR(s)) goto error_bdev; - if (s->s_root) + if (s->s_root) { + /* + * s_umount nests inside bd_mutex during + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. 
+ */ + up_write(&s->s_umount); blkdev_put(bdev, mode); + down_write(&s->s_umount); + } memset(&args, 0, sizeof(args)); args.ar_quota = GFS2_QUOTA_DEFAULT; diff --combined fs/xfs/xfs_aops.c index a26739451b5,1b19b9cd692..db2cfb067d0 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@@ -407,7 -407,7 +407,7 @@@ xfs_alloc_ioend_bio struct bio *bio = bio_alloc(GFP_NOIO, nvecs); ASSERT(bio->bi_private == NULL); - bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; return bio; } @@@ -1217,7 -1217,7 +1217,7 @@@ __xfs_get_blocks lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, lockmode); } else { - lockmode = xfs_ilock_map_shared(ip); + lockmode = xfs_ilock_data_map_shared(ip); } ASSERT(offset <= mp->m_super->s_maxbytes); diff --combined fs/xfs/xfs_buf.c index 51757113a82,2a941ab623c..9c061ef2b0d --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@@ -445,8 -445,8 +445,8 @@@ _xfs_buf_find numbytes = BBTOB(numblks); /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(numbytes < (1 << btp->bt_sshift))); - ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); + ASSERT(!(numbytes < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); /* * Corrupted block numbers can get through to here, unfortunately, so we @@@ -1240,7 -1240,7 +1240,7 @@@ next_chunk bio = bio_alloc(GFP_NOIO, nr_pages); bio->bi_bdev = bp->b_target->bt_bdev; - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; @@@ -1262,7 -1262,7 +1262,7 @@@ total_nr_pages--; } - if (likely(bio->bi_size)) { + if (likely(bio->bi_iter.bi_size)) { if (xfs_buf_is_vmapped(bp)) { flush_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); @@@ -1593,15 -1593,16 +1593,15 @@@ xfs_free_buftarg kmem_free(btp); } -STATIC int -xfs_setsize_buftarg_flags( +int +xfs_setsize_buftarg( xfs_buftarg_t *btp, unsigned int blocksize, - unsigned int sectorsize, - int verbose) + unsigned int sectorsize) { - btp->bt_bsize = blocksize; - btp->bt_sshift = ffs(sectorsize) - 1; - btp->bt_smask = sectorsize - 1; + /* Set up metadata sector size info */ + btp->bt_meta_sectorsize = sectorsize; + btp->bt_meta_sectormask = sectorsize - 1; if (set_blocksize(btp->bt_bdev, sectorsize)) { char name[BDEVNAME_SIZE]; @@@ -1614,25 -1615,30 +1614,25 @@@ return EINVAL; } + /* Set up device logical sector size mask */ + btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); + btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; + return 0; } /* - * When allocating the initial buffer target we have not yet - * read in the superblock, so don't know what sized sectors - * are being used at this early stage. Play safe. + * When allocating the initial buffer target we have not yet + * read in the superblock, so don't know what sized sectors + * are being used at this early stage. Play safe. 
*/ STATIC int xfs_setsize_buftarg_early( xfs_buftarg_t *btp, struct block_device *bdev) { - return xfs_setsize_buftarg_flags(btp, - PAGE_SIZE, bdev_logical_block_size(bdev), 0); -} - -int -xfs_setsize_buftarg( - xfs_buftarg_t *btp, - unsigned int blocksize, - unsigned int sectorsize) -{ - return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); + return xfs_setsize_buftarg(btp, PAGE_SIZE, + bdev_logical_block_size(bdev)); } xfs_buftarg_t * diff --combined include/linux/ceph/messenger.h index 20ee8b63a96,091fdb600d5..d21f2dba073 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@@ -1,6 -1,7 +1,7 @@@ #ifndef __FS_CEPH_MESSENGER_H #define __FS_CEPH_MESSENGER_H + #include #include #include #include @@@ -60,8 -61,8 +61,8 @@@ struct ceph_messenger u32 global_seq; spinlock_t global_seq_lock; - u32 supported_features; - u32 required_features; + u64 supported_features; + u64 required_features; }; enum ceph_msg_data_type { @@@ -119,8 -120,7 +120,7 @@@ struct ceph_msg_data_cursor #ifdef CONFIG_BLOCK struct { /* bio */ struct bio *bio; /* bio from list */ - unsigned int vector_index; /* vector from bio */ - unsigned int vector_offset; /* bytes from vector */ + struct bvec_iter bvec_iter; }; #endif /* CONFIG_BLOCK */ struct { /* pages */ @@@ -154,9 -154,10 +154,9 @@@ struct ceph_msg struct list_head list_head; /* links for connection lists */ struct kref kref; - bool front_is_vmalloc; bool more_to_follow; bool needs_out_seq; - int front_max; + int front_alloc_len; unsigned long ack_stamp; /* tx: when we were acked */ struct ceph_msgpool *pool; @@@ -191,7 -192,7 +191,7 @@@ struct ceph_connection struct ceph_entity_name peer_name; /* peer name */ - unsigned peer_features; + u64 peer_features; u32 connect_seq; /* identify the most recent connection attempt for this connection, client */ u32 peer_global_seq; /* peer's global seq for this connection */ @@@ -255,8 -256,8 +255,8 @@@ extern void ceph_msgr_flush(void) extern void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr, - u32 supported_features, - u32 required_features, + u64 supported_features, + u64 required_features, bool nocrc); extern void ceph_con_init(struct ceph_connection *con, void *private, diff --combined include/trace/events/f2fs.h index 3b9f28dfc84,bd3ee4fbe7a..67f38faac58 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@@ -16,28 -16,15 +16,28 @@@ { META, "META" }, \ { META_FLUSH, "META_FLUSH" }) -#define show_bio_type(type) \ - __print_symbolic(type, \ - { READ, "READ" }, \ - { READA, "READAHEAD" }, \ - { READ_SYNC, "READ_SYNC" }, \ - { WRITE, "WRITE" }, \ - { WRITE_SYNC, "WRITE_SYNC" }, \ - { WRITE_FLUSH, "WRITE_FLUSH" }, \ - { WRITE_FUA, "WRITE_FUA" }) +#define F2FS_BIO_MASK(t) (t & (READA | WRITE_FLUSH_FUA)) +#define F2FS_BIO_EXTRA_MASK(t) (t & (REQ_META | REQ_PRIO)) + +#define show_bio_type(type) show_bio_base(type), show_bio_extra(type) + +#define show_bio_base(type) \ + __print_symbolic(F2FS_BIO_MASK(type), \ + { READ, "READ" }, \ + { READA, "READAHEAD" }, \ + { READ_SYNC, "READ_SYNC" }, \ + { WRITE, "WRITE" }, \ + { WRITE_SYNC, "WRITE_SYNC" }, \ + { WRITE_FLUSH, "WRITE_FLUSH" }, \ + { WRITE_FUA, "WRITE_FUA" }, \ + { WRITE_FLUSH_FUA, "WRITE_FLUSH_FUA" }) + +#define show_bio_extra(type) \ + __print_symbolic(F2FS_BIO_EXTRA_MASK(type), \ + { REQ_META, "(M)" }, \ + { REQ_PRIO, "(P)" }, \ + { REQ_META | REQ_PRIO, "(MP)" }, \ + { 0, " \b" }) #define show_data_type(type) \ __print_symbolic(type, \ @@@ -434,7 -421,7 +434,7 @@@ 
TRACE_EVENT(f2fs_truncate_partial_nodes __entry->err) ); -TRACE_EVENT_CONDITION(f2fs_readpage, +TRACE_EVENT_CONDITION(f2fs_submit_page_bio, TP_PROTO(struct page *page, sector_t blkaddr, int type), @@@ -459,7 -446,7 +459,7 @@@ ), TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, " - "blkaddr = 0x%llx, bio_type = %s", + "blkaddr = 0x%llx, bio_type = %s%s", show_dev_ino(__entry), (unsigned long)__entry->index, (unsigned long long)__entry->blkaddr, @@@ -611,54 -598,36 +611,54 @@@ TRACE_EVENT(f2fs_reserve_new_block __entry->ofs_in_node) ); -TRACE_EVENT(f2fs_do_submit_bio, +DECLARE_EVENT_CLASS(f2fs__submit_bio, - TP_PROTO(struct super_block *sb, int btype, bool sync, struct bio *bio), + TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio), - TP_ARGS(sb, btype, sync, bio), + TP_ARGS(sb, rw, type, bio), TP_STRUCT__entry( __field(dev_t, dev) - __field(int, btype) - __field(bool, sync) + __field(int, rw) + __field(int, type) __field(sector_t, sector) __field(unsigned int, size) ), TP_fast_assign( __entry->dev = sb->s_dev; - __entry->btype = btype; - __entry->sync = sync; + __entry->rw = rw; + __entry->type = type; - __entry->sector = bio->bi_sector; - __entry->size = bio->bi_size; + __entry->sector = bio->bi_iter.bi_sector; + __entry->size = bio->bi_iter.bi_size; ), - TP_printk("dev = (%d,%d), type = %s, io = %s, sector = %lld, size = %u", + TP_printk("dev = (%d,%d), %s%s, %s, sector = %lld, size = %u", show_dev(__entry), - show_block_type(__entry->btype), - __entry->sync ? "sync" : "no sync", + show_bio_type(__entry->rw), + show_block_type(__entry->type), (unsigned long long)__entry->sector, __entry->size) ); +DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_write_bio, + + TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio), + + TP_ARGS(sb, rw, type, bio), + + TP_CONDITION(bio) +); + +DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_read_bio, + + TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio), + + TP_ARGS(sb, rw, type, bio), + + TP_CONDITION(bio) +); + DECLARE_EVENT_CLASS(f2fs__page, TP_PROTO(struct page *page, int type), @@@ -705,16 -674,15 +705,16 @@@ DEFINE_EVENT(f2fs__page, f2fs_vm_page_m TP_ARGS(page, type) ); -TRACE_EVENT(f2fs_submit_write_page, +TRACE_EVENT(f2fs_submit_page_mbio, - TP_PROTO(struct page *page, block_t blk_addr, int type), + TP_PROTO(struct page *page, int rw, int type, block_t blk_addr), - TP_ARGS(page, blk_addr, type), + TP_ARGS(page, rw, type, blk_addr), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) + __field(int, rw) __field(int, type) __field(pgoff_t, index) __field(block_t, block) @@@ -723,15 -691,13 +723,15 @@@ TP_fast_assign( __entry->dev = page->mapping->host->i_sb->s_dev; __entry->ino = page->mapping->host->i_ino; + __entry->rw = rw; __entry->type = type; __entry->index = page->index; __entry->block = blk_addr; ), - TP_printk("dev = (%d,%d), ino = %lu, %s, index = %lu, blkaddr = 0x%llx", + TP_printk("dev = (%d,%d), ino = %lu, %s%s, %s, index = %lu, blkaddr = 0x%llx", show_dev_ino(__entry), + show_bio_type(__entry->rw), show_block_type(__entry->type), (unsigned long)__entry->index, (unsigned long long)__entry->block) @@@ -761,29 -727,6 +761,29 @@@ TRACE_EVENT(f2fs_write_checkpoint __entry->msg) ); +TRACE_EVENT(f2fs_issue_discard, + + TP_PROTO(struct super_block *sb, block_t blkstart, block_t blklen), + + TP_ARGS(sb, blkstart, blklen), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(block_t, blkstart) + __field(block_t, blklen) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + 
__entry->blkstart = blkstart; + __entry->blklen = blklen; + ), + + TP_printk("dev = (%d,%d), blkstart = 0x%llx, blklen = 0x%llx", + show_dev(__entry), + (unsigned long long)__entry->blkstart, + (unsigned long long)__entry->blklen) +); #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ diff --combined mm/page_io.c index 7247be6114a,f14eded987f..7c59ef68138 --- a/mm/page_io.c +++ b/mm/page_io.c @@@ -31,13 -31,13 +31,13 @@@ static struct bio *get_swap_bio(gfp_t g bio = bio_alloc(gfp_flags, 1); if (bio) { - bio->bi_sector = map_swap_page(page, &bio->bi_bdev); - bio->bi_sector <<= PAGE_SHIFT - 9; + bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); + bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; bio->bi_io_vec[0].bv_page = page; bio->bi_io_vec[0].bv_len = PAGE_SIZE; bio->bi_io_vec[0].bv_offset = 0; bio->bi_vcnt = 1; - bio->bi_size = PAGE_SIZE; + bio->bi_iter.bi_size = PAGE_SIZE; bio->bi_end_io = end_io; } return bio; @@@ -62,7 -62,7 +62,7 @@@ void end_swap_bio_write(struct bio *bio printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", imajor(bio->bi_bdev->bd_inode), iminor(bio->bi_bdev->bd_inode), - (unsigned long long)bio->bi_sector); + (unsigned long long)bio->bi_iter.bi_sector); ClearPageReclaim(page); } end_page_writeback(page); @@@ -80,7 -80,7 +80,7 @@@ void end_swap_bio_read(struct bio *bio printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", imajor(bio->bi_bdev->bd_inode), iminor(bio->bi_bdev->bd_inode), - (unsigned long long)bio->bi_sector); + (unsigned long long)bio->bi_iter.bi_sector); goto out; } @@@ -320,8 -320,8 +320,8 @@@ int swap_readpage(struct page *page int ret = 0; struct swap_info_struct *sis = page_swap_info(page); - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(PageUptodate(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageUptodate(page), page); if (frontswap_load(page) == 0) { SetPageUptodate(page); unlock_page(page); diff --combined net/ceph/messenger.c index 2ed1304d22a,18c039b95c2..0e478a0f420 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@@ -15,7 -15,6 +15,7 @@@ #include #include +#include #include #include #include @@@ -778,13 -777,12 +778,12 @@@ static void ceph_msg_data_bio_cursor_in bio = data->bio; BUG_ON(!bio); - BUG_ON(!bio->bi_vcnt); cursor->resid = min(length, data->bio_length); cursor->bio = bio; - cursor->vector_index = 0; - cursor->vector_offset = 0; - cursor->last_piece = length <= bio->bi_io_vec[0].bv_len; + cursor->bvec_iter = bio->bi_iter; + cursor->last_piece = + cursor->resid <= bio_iter_len(bio, cursor->bvec_iter); } static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, @@@ -793,71 -791,63 +792,63 @@@ { struct ceph_msg_data *data = cursor->data; struct bio *bio; - struct bio_vec *bio_vec; - unsigned int index; + struct bio_vec bio_vec; BUG_ON(data->type != CEPH_MSG_DATA_BIO); bio = cursor->bio; BUG_ON(!bio); - index = cursor->vector_index; - BUG_ON(index >= (unsigned int) bio->bi_vcnt); + bio_vec = bio_iter_iovec(bio, cursor->bvec_iter); - bio_vec = &bio->bi_io_vec[index]; - BUG_ON(cursor->vector_offset >= bio_vec->bv_len); - *page_offset = (size_t) (bio_vec->bv_offset + cursor->vector_offset); + *page_offset = (size_t) bio_vec.bv_offset; BUG_ON(*page_offset >= PAGE_SIZE); if (cursor->last_piece) /* pagelist offset is always 0 */ *length = cursor->resid; else - *length = (size_t) (bio_vec->bv_len - cursor->vector_offset); + *length = (size_t) bio_vec.bv_len; BUG_ON(*length > cursor->resid); BUG_ON(*page_offset + *length > PAGE_SIZE); - return 
bio_vec->bv_page; + return bio_vec.bv_page; } static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { struct bio *bio; - struct bio_vec *bio_vec; - unsigned int index; + struct bio_vec bio_vec; BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO); bio = cursor->bio; BUG_ON(!bio); - index = cursor->vector_index; - BUG_ON(index >= (unsigned int) bio->bi_vcnt); - bio_vec = &bio->bi_io_vec[index]; + bio_vec = bio_iter_iovec(bio, cursor->bvec_iter); /* Advance the cursor offset */ BUG_ON(cursor->resid < bytes); cursor->resid -= bytes; - cursor->vector_offset += bytes; - if (cursor->vector_offset < bio_vec->bv_len) + + bio_advance_iter(bio, &cursor->bvec_iter, bytes); + + if (bytes < bio_vec.bv_len) return false; /* more bytes to process in this segment */ - BUG_ON(cursor->vector_offset != bio_vec->bv_len); /* Move on to the next segment, and possibly the next bio */ - if (++index == (unsigned int) bio->bi_vcnt) { + if (!cursor->bvec_iter.bi_size) { bio = bio->bi_next; - index = 0; + cursor->bvec_iter = bio->bi_iter; } cursor->bio = bio; - cursor->vector_index = index; - cursor->vector_offset = 0; if (!cursor->last_piece) { BUG_ON(!cursor->resid); BUG_ON(!bio); /* A short read is OK, so use <= rather than == */ - if (cursor->resid <= bio->bi_io_vec[index].bv_len) + if (cursor->resid <= bio_iter_len(bio, cursor->bvec_iter)) cursor->last_piece = true; } @@@ -1866,9 -1856,7 +1857,9 @@@ int ceph_parse_ips(const char *c, cons port = (port * 10) + (*p - '0'); p++; } - if (port > 65535 || port == 0) + if (port == 0) + port = CEPH_MON_PORT; + else if (port > 65535) goto bad; } else { port = CEPH_MON_PORT; @@@ -1948,8 -1936,7 +1939,8 @@@ static int process_connect(struct ceph_ { u64 sup_feat = con->msgr->supported_features; u64 req_feat = con->msgr->required_features; - u64 server_feat = le64_to_cpu(con->in_reply.features); + u64 server_feat = ceph_sanitize_features( + le64_to_cpu(con->in_reply.features)); int ret; dout("process_connect on %p tag %d\n", con, (int)con->in_tag); @@@ -2857,8 -2844,8 +2848,8 @@@ static void con_fault(struct ceph_conne */ void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr, - u32 supported_features, - u32 required_features, + u64 supported_features, + u64 required_features, bool nocrc) { msgr->supported_features = supported_features; @@@ -3130,8 -3117,15 +3121,8 @@@ struct ceph_msg *ceph_msg_new(int type INIT_LIST_HEAD(&m->data); /* front */ - m->front_max = front_len; if (front_len) { - if (front_len > PAGE_CACHE_SIZE) { - m->front.iov_base = __vmalloc(front_len, flags, - PAGE_KERNEL); - m->front_is_vmalloc = true; - } else { - m->front.iov_base = kmalloc(front_len, flags); - } + m->front.iov_base = ceph_kvmalloc(front_len, flags); if (m->front.iov_base == NULL) { dout("ceph_msg_new can't allocate %d bytes\n", front_len); @@@ -3140,7 -3134,7 +3131,7 @@@ } else { m->front.iov_base = NULL; } - m->front.iov_len = front_len; + m->front_alloc_len = m->front.iov_len = front_len; dout("ceph_msg_new %p front %d\n", m, front_len); return m; @@@ -3253,7 -3247,10 +3244,7 @@@ static int ceph_con_in_msg_alloc(struc void ceph_msg_kfree(struct ceph_msg *m) { dout("msg_kfree %p\n", m); - if (m->front_is_vmalloc) - vfree(m->front.iov_base); - else - kfree(m->front.iov_base); + ceph_kvfree(m->front.iov_base); kmem_cache_free(ceph_msg_cache, m); } @@@ -3295,8 -3292,8 +3286,8 @@@ EXPORT_SYMBOL(ceph_msg_last_put) void ceph_msg_dump(struct ceph_msg *msg) { - pr_debug("msg_dump %p (front_max %d length %zd)\n", msg, - msg->front_max, 
msg->data_length);
+	pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg,
+		 msg->front_alloc_len, msg->data_length);
 	print_hex_dump(KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 16, 1,
 		       &msg->hdr, sizeof(msg->hdr), true);
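
Most of the churn above is mechanical fallout of the immutable biovec conversion: the per-bio fields bi_sector and bi_size now live in the embedded iterator at bio->bi_iter, so every filesystem and driver that fills in or inspects a bio has to spell the new path. A minimal sketch of what a post-conversion caller looks like, modelled on the gfs2_read_super() and get_swap_bio() hunks; demo_read_page() and its parameters are invented for illustration and are not part of any file in this series.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/gfp.h>

/*
 * Illustrative only: read one page starting at "sector" from "bdev".
 * The starting sector and running size now live in bio->bi_iter.
 */
static int demo_read_page(struct block_device *bdev, struct page *page,
			  sector_t sector, bio_end_io_t end_io)
{
	struct bio *bio;

	bio = bio_alloc(GFP_NOFS, 1);		/* one biovec is enough here */
	if (!bio)
		return -ENOMEM;

	bio->bi_iter.bi_sector = sector;	/* was: bio->bi_sector */
	bio->bi_bdev = bdev;
	bio->bi_end_io = end_io;

	bio_add_page(bio, page, PAGE_SIZE, 0);	/* bumps bi_iter.bi_size */

	submit_bio(READ_SYNC, bio);
	return 0;
}

Nothing else about bio construction changes: bio_alloc(), bio_add_page() and submit_bio() keep their old signatures, which is why most hunks in this series are one-line field renames.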
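
The xfs_buf.c hunks replace the old shift/mask pair (bt_sshift/bt_smask) with explicit metadata and logical sector sizes, each carried alongside a mask computed as sectorsize - 1, which is what the reworked ASSERTs test against. The mask trick only works for power-of-two sector sizes; here is a small standalone sketch of the arithmetic (plain C, invented names, not XFS code):

#include <stdint.h>
#include <stdio.h>

/* For a power-of-two sector size, (size - 1) masks the low bits, so
 * "offset & mask" is non-zero exactly when the offset is misaligned. */
static int is_sector_aligned(uint64_t nbytes, uint64_t offset,
			     unsigned int sectorsize)
{
	unsigned int sectormask = sectorsize - 1;

	return nbytes >= sectorsize && !(offset & sectormask);
}

int main(void)
{
	/* 4096-byte metadata sectors, as a filesystem might configure. */
	printf("%d\n", is_sector_aligned(4096, 8192, 4096)); /* 1: aligned      */
	printf("%d\n", is_sector_aligned(4096, 4608, 4096)); /* 0: bad offset   */
	printf("%d\n", is_sector_aligned(512, 0, 4096));     /* 0: I/O too small */
	return 0;
}

In the patched code, bdev_logical_block_size() supplies the device's logical sector size when the buffer target is set up, while the metadata sector size is filled in once the superblock has been read.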
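
The f2fs tracepoints stop printing a single bio "type" and instead decode the request flags twice: F2FS_BIO_MASK() picks out the base READ/WRITE variant and F2FS_BIO_EXTRA_MASK() the REQ_META/REQ_PRIO annotations, hence the extra %s in each format string. A hedged userspace sketch of the same two-mask decoding idea; the DEMO_* constants are stand-ins, not the kernel's REQ_* values:

#include <stdio.h>

/* Stand-in flag bits; the real REQ_* values live in <linux/blk_types.h>. */
#define DEMO_WRITE  (1u << 0)
#define DEMO_SYNC   (1u << 1)
#define DEMO_META   (1u << 2)
#define DEMO_PRIO   (1u << 3)

#define DEMO_BASE_MASK(t)   ((t) & (DEMO_WRITE | DEMO_SYNC))
#define DEMO_EXTRA_MASK(t)  ((t) & (DEMO_META | DEMO_PRIO))

static const char *base_name(unsigned int t)
{
	switch (DEMO_BASE_MASK(t)) {
	case 0:				return "READ";
	case DEMO_SYNC:			return "READ_SYNC";
	case DEMO_WRITE:		return "WRITE";
	case DEMO_WRITE | DEMO_SYNC:	return "WRITE_SYNC";
	default:			return "UNKNOWN";
	}
}

static const char *extra_name(unsigned int t)
{
	switch (DEMO_EXTRA_MASK(t)) {
	case DEMO_META:			return "(M)";
	case DEMO_PRIO:			return "(P)";
	case DEMO_META | DEMO_PRIO:	return "(MP)";
	default:			return "";
	}
}

int main(void)
{
	unsigned int rw = DEMO_WRITE | DEMO_SYNC | DEMO_META;

	/* Mirrors the "%s%s" format: base type first, then extra flags. */
	printf("bio_type = %s%s\n", base_name(rw), extra_name(rw));
	return 0;
}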
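
The net/ceph/messenger.c rewrite shows the intended idiom for the new API: rather than indexing bio->bi_io_vec[] with a hand-rolled (vector_index, vector_offset) pair, the data cursor keeps a private struct bvec_iter copied from bio->bi_iter and walks it with bio_iter_iovec() and bio_advance_iter(), leaving the bio itself untouched for completion. A minimal sketch of that walk over a possibly chained bio; demo_for_each_segment() and its callback are invented, the helpers are the ones used in the hunks above:

#include <linux/bio.h>

/* Invented callback type, just to give the walk something to do. */
typedef void (*consume_segment_fn)(struct page *page, unsigned int off,
				   unsigned int len);

/*
 * Visit every segment of a chain of bios without touching bi_io_vec[]
 * directly.  Only a private copy of bi_iter is advanced, so the bio
 * stays "immutable" for whoever completes it later.
 */
static void demo_for_each_segment(struct bio *bio, consume_segment_fn fn)
{
	while (bio) {
		struct bvec_iter iter = bio->bi_iter;

		while (iter.bi_size) {
			struct bio_vec bv = bio_iter_iovec(bio, iter);

			fn(bv.bv_page, bv.bv_offset, bv.bv_len);
			bio_advance_iter(bio, &iter, bv.bv_len);
		}
		bio = bio->bi_next;	/* follow the chain, as the cursor does */
	}
}

Code that does not need to pause mid-walk would normally reach for the bio_for_each_segment() macro instead; the ceph cursor open-codes the loop so it can stop and resume between partial network sends.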
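
ceph_msg_new() used to open-code its allocation policy, kmalloc() for fronts up to a page and __vmalloc() above that, tracked by the now-deleted front_is_vmalloc flag; the replacement ceph_kvmalloc()/ceph_kvfree() helpers are defined elsewhere in libceph and are not shown in this diff. A hedged sketch of the generic fallback pattern the removed code implemented, not the actual ceph_kvmalloc() implementation; demo_kvmalloc() and demo_kvfree() are invented names:

#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>

/* Try a physically contiguous allocation first; fall back to vmalloc
 * for sizes the page allocator is unlikely to satisfy. */
static void *demo_kvmalloc(size_t size, gfp_t flags)
{
	if (size <= PAGE_SIZE)
		return kmalloc(size, flags);
	return __vmalloc(size, flags, PAGE_KERNEL);
}

static void demo_kvfree(void *ptr)
{
	if (is_vmalloc_addr(ptr))	/* no per-buffer flag needed */
		vfree(ptr);
	else
		kfree(ptr);
}

The sketch frees by address check; whether ceph_kvfree() does exactly that is not visible in this diff, but an is_vmalloc_addr()-style test is what lets the per-message front_is_vmalloc flag go away.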