Merge branch 'for-3.14/core' of git://git.kernel.dk/linux-block
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 30 Jan 2014 19:19:05 +0000 (11:19 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 30 Jan 2014 19:19:05 +0000 (11:19 -0800)
Pull core block IO changes from Jens Axboe:
 "The major piece in here is the immutable bio_ve series from Kent, the
  rest is fairly minor.  It was supposed to go in last round, but
  various issues pushed it to this release instead.  The pull request
  contains:

   - Various smaller blk-mq fixes from different folks.  Nothing major
     here, just minor fixes and cleanups.

   - Fix for a memory leak in the error path in the block ioctl code
     from Christian Engelmayer.

   - Header export fix from CaiZhiyong.

   - Finally the immutable biovec changes from Kent Overstreet.  This
     enables some nice future work on making arbitrarily sized bios
     possible, and splitting more efficient.  Related fixes to immutable
     bio_vecs:

        - dm-cache immutable fixup from Mike Snitzer.
        - btrfs immutable fixup from Muthu Kumar.

  - bio-integrity fix from Nic Bellinger, which is also going to stable"

* 'for-3.14/core' of git://git.kernel.dk/linux-block: (44 commits)
  xtensa: fixup simdisk driver to work with immutable bio_vecs
  block/blk-mq-cpu.c: use hotcpu_notifier()
  blk-mq: for_each_* macro correctness
  block: Fix memory leak in rw_copy_check_uvector() handling
  bio-integrity: Fix bio_integrity_verify segment start bug
  block: remove unrelated header files and export symbol
  blk-mq: uses page->list incorrectly
  blk-mq: use __smp_call_function_single directly
  btrfs: fix missing increment of bi_remaining
  Revert "block: Warn and free bio if bi_end_io is not set"
  block: Warn and free bio if bi_end_io is not set
  blk-mq: fix initializing request's start time
  block: blk-mq: don't export blk_mq_free_queue()
  block: blk-mq: make blk_sync_queue support mq
  block: blk-mq: support draining mq queue
  dm cache: increment bi_remaining when bi_end_io is restored
  block: fixup for generic bio chaining
  block: Really silence spurious compiler warnings
  block: Silence spurious compiler warnings
  block: Kill bio_pair_split()
  ...
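
For readers unfamiliar with the immutable biovec work described above, the recurring pattern in the hunks below is a switch from index-based segment iteration (a struct bio_vec pointer plus an int index, with bio->bi_sector and bio->bi_size mutated in place) to iteration over an immutable vector through a struct bvec_iter, with position and size read from bio->bi_iter. The following sketch only illustrates that new idiom; the helper name example_bio_bytes is hypothetical and is not part of this series.

#include <linux/bio.h>

/*
 * Illustrative sketch only (hypothetical helper, not from this series):
 * walk a bio's segments with the immutable-biovec API.  The bio_vec is
 * taken by value and the struct bvec_iter carries the position, so the
 * bio itself is never modified while iterating.
 */
static unsigned int example_bio_bytes(struct bio *bio)
{
        struct bio_vec bv;
        struct bvec_iter iter;
        unsigned int bytes = 0;

        bio_for_each_segment(bv, bio, iter)
                bytes += bv.bv_len;

        /* The sum of segment lengths matches bio->bi_iter.bi_size. */
        return bytes;
}

The sector position is likewise read from bio->bi_iter.bi_sector rather than bio->bi_sector, which is why the hunks below rename those field accesses throughout.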

28 files changed:
block/blk-throttle.c
drivers/block/rbd.c
drivers/block/xen-blkfront.c
drivers/md/bcache/request.c
drivers/md/dm-bufio.c
drivers/md/dm-cache-policy-mq.c
drivers/md/dm-cache-target.c
drivers/md/dm-delay.c
drivers/md/dm-snap.c
drivers/md/dm-thin.c
drivers/md/dm.c
drivers/md/md.c
drivers/md/raid1.c
drivers/md/raid10.c
drivers/md/raid5.c
drivers/s390/block/xpram.c
drivers/scsi/sd.c
drivers/staging/lustre/lustre/llite/lloop.c
fs/btrfs/inode.c
fs/f2fs/data.c
fs/gfs2/lops.c
fs/gfs2/ops_fstype.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_buf.c
include/linux/ceph/messenger.h
include/trace/events/f2fs.h
mm/page_io.c
net/ceph/messenger.c

diff --combined block/blk-throttle.c
index a760857e6b62609dde239ad74aebe2b5ac2ebaac,20f82003777511798659a467666c3a8f6d4b8cbb..1474c3ab7e72cb85698ffe8bb3687df66729281b
@@@ -877,14 -877,14 +877,14 @@@ static bool tg_with_in_bps_limit(struc
        do_div(tmp, HZ);
        bytes_allowed = tmp;
  
-       if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
+       if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
                if (wait)
                        *wait = 0;
                return 1;
        }
  
        /* Calc approx time to dispatch */
-       extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
+       extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
        jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
  
        if (!jiffy_wait)
@@@ -987,7 -987,7 +987,7 @@@ static void throtl_charge_bio(struct th
        bool rw = bio_data_dir(bio);
  
        /* Charge the bio to the group */
-       tg->bytes_disp[rw] += bio->bi_size;
+       tg->bytes_disp[rw] += bio->bi_iter.bi_size;
        tg->io_disp[rw]++;
  
        /*
         */
        if (!(bio->bi_rw & REQ_THROTTLED)) {
                bio->bi_rw |= REQ_THROTTLED;
-               throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size,
-                                            bio->bi_rw);
+               throtl_update_dispatch_stats(tg_to_blkg(tg),
+                                            bio->bi_iter.bi_size, bio->bi_rw);
        }
  }
  
@@@ -1303,10 -1303,13 +1303,10 @@@ static u64 tg_prfill_cpu_rwstat(struct 
        return __blkg_prfill_rwstat(sf, pd, &rwstat);
  }
  
 -static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css,
 -                             struct cftype *cft, struct seq_file *sf)
 +static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
  {
 -      struct blkcg *blkcg = css_to_blkcg(css);
 -
 -      blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
 -                        cft->private, true);
 +      blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
 +                        &blkcg_policy_throtl, seq_cft(sf)->private, true);
        return 0;
  }
  
@@@ -1332,17 -1335,19 +1332,17 @@@ static u64 tg_prfill_conf_uint(struct s
        return __blkg_prfill_u64(sf, pd, v);
  }
  
 -static int tg_print_conf_u64(struct cgroup_subsys_state *css,
 -                           struct cftype *cft, struct seq_file *sf)
 +static int tg_print_conf_u64(struct seq_file *sf, void *v)
  {
 -      blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64,
 -                        &blkcg_policy_throtl, cft->private, false);
 +      blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
 +                        &blkcg_policy_throtl, seq_cft(sf)->private, false);
        return 0;
  }
  
 -static int tg_print_conf_uint(struct cgroup_subsys_state *css,
 -                            struct cftype *cft, struct seq_file *sf)
 +static int tg_print_conf_uint(struct seq_file *sf, void *v)
  {
 -      blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint,
 -                        &blkcg_policy_throtl, cft->private, false);
 +      blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
 +                        &blkcg_policy_throtl, seq_cft(sf)->private, false);
        return 0;
  }
  
@@@ -1423,40 -1428,40 +1423,40 @@@ static struct cftype throtl_files[] = 
        {
                .name = "throttle.read_bps_device",
                .private = offsetof(struct throtl_grp, bps[READ]),
 -              .read_seq_string = tg_print_conf_u64,
 +              .seq_show = tg_print_conf_u64,
                .write_string = tg_set_conf_u64,
                .max_write_len = 256,
        },
        {
                .name = "throttle.write_bps_device",
                .private = offsetof(struct throtl_grp, bps[WRITE]),
 -              .read_seq_string = tg_print_conf_u64,
 +              .seq_show = tg_print_conf_u64,
                .write_string = tg_set_conf_u64,
                .max_write_len = 256,
        },
        {
                .name = "throttle.read_iops_device",
                .private = offsetof(struct throtl_grp, iops[READ]),
 -              .read_seq_string = tg_print_conf_uint,
 +              .seq_show = tg_print_conf_uint,
                .write_string = tg_set_conf_uint,
                .max_write_len = 256,
        },
        {
                .name = "throttle.write_iops_device",
                .private = offsetof(struct throtl_grp, iops[WRITE]),
 -              .read_seq_string = tg_print_conf_uint,
 +              .seq_show = tg_print_conf_uint,
                .write_string = tg_set_conf_uint,
                .max_write_len = 256,
        },
        {
                .name = "throttle.io_service_bytes",
                .private = offsetof(struct tg_stats_cpu, service_bytes),
 -              .read_seq_string = tg_print_cpu_rwstat,
 +              .seq_show = tg_print_cpu_rwstat,
        },
        {
                .name = "throttle.io_serviced",
                .private = offsetof(struct tg_stats_cpu, serviced),
 -              .read_seq_string = tg_print_cpu_rwstat,
 +              .seq_show = tg_print_cpu_rwstat,
        },
        { }     /* terminate */
  };
@@@ -1503,7 -1508,7 +1503,7 @@@ bool blk_throtl_bio(struct request_queu
        if (tg) {
                if (!tg->has_rules[rw]) {
                        throtl_update_dispatch_stats(tg_to_blkg(tg),
-                                                    bio->bi_size, bio->bi_rw);
+                                       bio->bi_iter.bi_size, bio->bi_rw);
                        goto out_unlock_rcu;
                }
        }
        /* out-of-limit, queue to @tg */
        throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
                   rw == READ ? 'R' : 'W',
-                  tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
+                  tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw],
                   tg->io_disp[rw], tg->iops[rw],
                   sq->nr_queued[READ], sq->nr_queued[WRITE]);
  
diff --combined drivers/block/rbd.c
index 16cab6635163797da9414a27cb8634356d5cd999,3624368b910dd30fe841cb42e2f13a8ec5109bc8..b365e0dfccb66f7c256a9d07d7fd976fba17ae95
@@@ -41,7 -41,6 +41,7 @@@
  #include <linux/fs.h>
  #include <linux/blkdev.h>
  #include <linux/slab.h>
 +#include <linux/idr.h>
  
  #include "rbd_types.h"
  
@@@ -90,9 -89,9 +90,9 @@@ static int atomic_dec_return_safe(atomi
  }
  
  #define RBD_DRV_NAME "rbd"
 -#define RBD_DRV_NAME_LONG "rbd (rados block device)"
  
 -#define RBD_MINORS_PER_MAJOR  256             /* max minors per blkdev */
 +#define RBD_MINORS_PER_MAJOR          256
 +#define RBD_SINGLE_MAJOR_PART_SHIFT   4
  
  #define RBD_SNAP_DEV_NAME_PREFIX      "snap_"
  #define RBD_MAX_SNAP_NAME_LEN \
@@@ -324,7 -323,6 +324,7 @@@ struct rbd_device 
        int                     dev_id;         /* blkdev unique id */
  
        int                     major;          /* blkdev assigned major */
 +      int                     minor;
        struct gendisk          *disk;          /* blkdev's gendisk and rq */
  
        u32                     image_format;   /* Either 1 or 2 */
@@@ -388,17 -386,6 +388,17 @@@ static struct kmem_cache *rbd_img_reque
  static struct kmem_cache      *rbd_obj_request_cache;
  static struct kmem_cache      *rbd_segment_name_cache;
  
 +static int rbd_major;
 +static DEFINE_IDA(rbd_dev_id_ida);
 +
 +/*
 + * Default to false for now, as single-major requires >= 0.75 version of
 + * userspace rbd utility.
 + */
 +static bool single_major = false;
 +module_param(single_major, bool, S_IRUGO);
 +MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
 +
  static int rbd_img_request_submit(struct rbd_img_request *img_request);
  
  static void rbd_dev_device_release(struct device *dev);
@@@ -407,52 -394,18 +407,52 @@@ static ssize_t rbd_add(struct bus_type 
                       size_t count);
  static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
                          size_t count);
 +static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
 +                                  size_t count);
 +static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
 +                                     size_t count);
  static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
  static void rbd_spec_put(struct rbd_spec *spec);
  
 +static int rbd_dev_id_to_minor(int dev_id)
 +{
 +      return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
 +}
 +
 +static int minor_to_rbd_dev_id(int minor)
 +{
 +      return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
 +}
 +
  static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
  static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
 +static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
 +static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
  
  static struct attribute *rbd_bus_attrs[] = {
        &bus_attr_add.attr,
        &bus_attr_remove.attr,
 +      &bus_attr_add_single_major.attr,
 +      &bus_attr_remove_single_major.attr,
        NULL,
  };
 -ATTRIBUTE_GROUPS(rbd_bus);
 +
 +static umode_t rbd_bus_is_visible(struct kobject *kobj,
 +                                struct attribute *attr, int index)
 +{
 +      if (!single_major &&
 +          (attr == &bus_attr_add_single_major.attr ||
 +           attr == &bus_attr_remove_single_major.attr))
 +              return 0;
 +
 +      return attr->mode;
 +}
 +
 +static const struct attribute_group rbd_bus_group = {
 +      .attrs = rbd_bus_attrs,
 +      .is_visible = rbd_bus_is_visible,
 +};
 +__ATTRIBUTE_GROUPS(rbd_bus);
  
  static struct bus_type rbd_bus_type = {
        .name           = "rbd",
@@@ -1088,9 -1041,9 +1088,9 @@@ static const char *rbd_segment_name(str
        name_format = "%s.%012llx";
        if (rbd_dev->image_format == 2)
                name_format = "%s.%016llx";
 -      ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, name_format,
 +      ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
                        rbd_dev->header.object_prefix, segment);
 -      if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
 +      if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
                pr_err("error formatting segment name for #%llu (%d)\n",
                        segment, ret);
                kfree(name);
@@@ -1156,23 -1109,23 +1156,23 @@@ static void bio_chain_put(struct bio *c
   */
  static void zero_bio_chain(struct bio *chain, int start_ofs)
  {
-       struct bio_vec *bv;
+       struct bio_vec bv;
+       struct bvec_iter iter;
        unsigned long flags;
        void *buf;
-       int i;
        int pos = 0;
  
        while (chain) {
-               bio_for_each_segment(bv, chain, i) {
-                       if (pos + bv->bv_len > start_ofs) {
+               bio_for_each_segment(bv, chain, iter) {
+                       if (pos + bv.bv_len > start_ofs) {
                                int remainder = max(start_ofs - pos, 0);
-                               buf = bvec_kmap_irq(bv, &flags);
+                               buf = bvec_kmap_irq(&bv, &flags);
                                memset(buf + remainder, 0,
-                                      bv->bv_len - remainder);
-                               flush_dcache_page(bv->bv_page);
+                                      bv.bv_len - remainder);
+                               flush_dcache_page(bv.bv_page);
                                bvec_kunmap_irq(buf, &flags);
                        }
-                       pos += bv->bv_len;
+                       pos += bv.bv_len;
                }
  
                chain = chain->bi_next;
@@@ -1220,74 -1173,14 +1220,14 @@@ static struct bio *bio_clone_range(stru
                                        unsigned int len,
                                        gfp_t gfpmask)
  {
-       struct bio_vec *bv;
-       unsigned int resid;
-       unsigned short idx;
-       unsigned int voff;
-       unsigned short end_idx;
-       unsigned short vcnt;
        struct bio *bio;
  
-       /* Handle the easy case for the caller */
-       if (!offset && len == bio_src->bi_size)
-               return bio_clone(bio_src, gfpmask);
-       if (WARN_ON_ONCE(!len))
-               return NULL;
-       if (WARN_ON_ONCE(len > bio_src->bi_size))
-               return NULL;
-       if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
-               return NULL;
-       /* Find first affected segment... */
-       resid = offset;
-       bio_for_each_segment(bv, bio_src, idx) {
-               if (resid < bv->bv_len)
-                       break;
-               resid -= bv->bv_len;
-       }
-       voff = resid;
-       /* ...and the last affected segment */
-       resid += len;
-       __bio_for_each_segment(bv, bio_src, end_idx, idx) {
-               if (resid <= bv->bv_len)
-                       break;
-               resid -= bv->bv_len;
-       }
-       vcnt = end_idx - idx + 1;
-       /* Build the clone */
-       bio = bio_alloc(gfpmask, (unsigned int) vcnt);
+       bio = bio_clone(bio_src, gfpmask);
        if (!bio)
                return NULL;    /* ENOMEM */
  
-       bio->bi_bdev = bio_src->bi_bdev;
-       bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
-       bio->bi_rw = bio_src->bi_rw;
-       bio->bi_flags |= 1 << BIO_CLONED;
-       /*
-        * Copy over our part of the bio_vec, then update the first
-        * and last (or only) entries.
-        */
-       memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
-                       vcnt * sizeof (struct bio_vec));
-       bio->bi_io_vec[0].bv_offset += voff;
-       if (vcnt > 1) {
-               bio->bi_io_vec[0].bv_len -= voff;
-               bio->bi_io_vec[vcnt - 1].bv_len = resid;
-       } else {
-               bio->bi_io_vec[0].bv_len = len;
-       }
-       bio->bi_vcnt = vcnt;
-       bio->bi_size = len;
-       bio->bi_idx = 0;
+       bio_advance(bio, offset);
+       bio->bi_iter.bi_size = len;
  
        return bio;
  }
@@@ -1318,7 -1211,7 +1258,7 @@@ static struct bio *bio_chain_clone_rang
  
        /* Build up a chain of clone bios up to the limit */
  
-       if (!bi || off >= bi->bi_size || !len)
+       if (!bi || off >= bi->bi_iter.bi_size || !len)
                return NULL;            /* Nothing to clone */
  
        end = &chain;
                        rbd_warn(NULL, "bio_chain exhausted with %u left", len);
                        goto out_err;   /* EINVAL; ran out of bio's */
                }
-               bi_size = min_t(unsigned int, bi->bi_size - off, len);
+               bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
                bio = bio_clone_range(bi, off, bi_size, gfpmask);
                if (!bio)
                        goto out_err;   /* ENOMEM */
                end = &bio->bi_next;
  
                off += bi_size;
-               if (off == bi->bi_size) {
+               if (off == bi->bi_iter.bi_size) {
                        bi = bi->bi_next;
                        off = 0;
                }
@@@ -1808,8 -1701,11 +1748,8 @@@ static struct ceph_osd_request *rbd_osd
        osd_req->r_callback = rbd_osd_req_callback;
        osd_req->r_priv = obj_request;
  
 -      osd_req->r_oid_len = strlen(obj_request->object_name);
 -      rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
 -      memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
 -
 -      osd_req->r_file_layout = rbd_dev->layout;       /* struct */
 +      osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
 +      ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
  
        return osd_req;
  }
@@@ -1846,8 -1742,11 +1786,8 @@@ rbd_osd_req_create_copyup(struct rbd_ob
        osd_req->r_callback = rbd_osd_req_callback;
        osd_req->r_priv = obj_request;
  
 -      osd_req->r_oid_len = strlen(obj_request->object_name);
 -      rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
 -      memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
 -
 -      osd_req->r_file_layout = rbd_dev->layout;       /* struct */
 +      osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
 +      ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
  
        return osd_req;
  }
@@@ -2227,7 -2126,8 +2167,8 @@@ static int rbd_img_request_fill(struct 
  
        if (type == OBJ_REQUEST_BIO) {
                bio_list = data_desc;
-               rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
+               rbd_assert(img_offset ==
+                          bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
        } else {
                rbd_assert(type == OBJ_REQUEST_PAGES);
                pages = data_desc;
@@@ -2907,7 -2807,7 +2848,7 @@@ static void rbd_watch_cb(u64 ver, u64 n
   * Request sync osd watch/unwatch.  The value of "start" determines
   * whether a watch request is being initiated or torn down.
   */
 -static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
 +static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
  {
        struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        struct rbd_obj_request *obj_request;
@@@ -2982,22 -2882,6 +2923,22 @@@ out_cancel
        return ret;
  }
  
 +static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
 +{
 +      return __rbd_dev_header_watch_sync(rbd_dev, true);
 +}
 +
 +static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
 +{
 +      int ret;
 +
 +      ret = __rbd_dev_header_watch_sync(rbd_dev, false);
 +      if (ret) {
 +              rbd_warn(rbd_dev, "unable to tear down watch request: %d\n",
 +                       ret);
 +      }
 +}
 +
  /*
   * Synchronous osd object method call.  Returns the number of bytes
   * returned in the outbound buffer, or a negative error code.
@@@ -3445,18 -3329,14 +3386,18 @@@ static int rbd_init_disk(struct rbd_dev
        u64 segment_size;
  
        /* create gendisk info */
 -      disk = alloc_disk(RBD_MINORS_PER_MAJOR);
 +      disk = alloc_disk(single_major ?
 +                        (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
 +                        RBD_MINORS_PER_MAJOR);
        if (!disk)
                return -ENOMEM;
  
        snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
                 rbd_dev->dev_id);
        disk->major = rbd_dev->major;
 -      disk->first_minor = 0;
 +      disk->first_minor = rbd_dev->minor;
 +      if (single_major)
 +              disk->flags |= GENHD_FL_EXT_DEVT;
        disk->fops = &rbd_bd_ops;
        disk->private_data = rbd_dev;
  
@@@ -3528,14 -3408,7 +3469,14 @@@ static ssize_t rbd_major_show(struct de
                return sprintf(buf, "%d\n", rbd_dev->major);
  
        return sprintf(buf, "(none)\n");
 +}
  
 +static ssize_t rbd_minor_show(struct device *dev,
 +                            struct device_attribute *attr, char *buf)
 +{
 +      struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 +
 +      return sprintf(buf, "%d\n", rbd_dev->minor);
  }
  
  static ssize_t rbd_client_id_show(struct device *dev,
@@@ -3657,7 -3530,6 +3598,7 @@@ static ssize_t rbd_image_refresh(struc
  static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
  static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
  static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
 +static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
  static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
  static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
  static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
@@@ -3671,7 -3543,6 +3612,7 @@@ static struct attribute *rbd_attrs[] = 
        &dev_attr_size.attr,
        &dev_attr_features.attr,
        &dev_attr_major.attr,
 +      &dev_attr_minor.attr,
        &dev_attr_client_id.attr,
        &dev_attr_pool.attr,
        &dev_attr_pool_id.attr,
@@@ -4442,29 -4313,21 +4383,29 @@@ static void rbd_bus_del_dev(struct rbd_
        device_unregister(&rbd_dev->dev);
  }
  
 -static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
 -
  /*
   * Get a unique rbd identifier for the given new rbd_dev, and add
 - * the rbd_dev to the global list.  The minimum rbd id is 1.
 + * the rbd_dev to the global list.
   */
 -static void rbd_dev_id_get(struct rbd_device *rbd_dev)
 +static int rbd_dev_id_get(struct rbd_device *rbd_dev)
  {
 -      rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
 +      int new_dev_id;
 +
 +      new_dev_id = ida_simple_get(&rbd_dev_id_ida,
 +                                  0, minor_to_rbd_dev_id(1 << MINORBITS),
 +                                  GFP_KERNEL);
 +      if (new_dev_id < 0)
 +              return new_dev_id;
 +
 +      rbd_dev->dev_id = new_dev_id;
  
        spin_lock(&rbd_dev_list_lock);
        list_add_tail(&rbd_dev->node, &rbd_dev_list);
        spin_unlock(&rbd_dev_list_lock);
 -      dout("rbd_dev %p given dev id %llu\n", rbd_dev,
 -              (unsigned long long) rbd_dev->dev_id);
 +
 +      dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id);
 +
 +      return 0;
  }
  
  /*
   */
  static void rbd_dev_id_put(struct rbd_device *rbd_dev)
  {
 -      struct list_head *tmp;
 -      int rbd_id = rbd_dev->dev_id;
 -      int max_id;
 -
 -      rbd_assert(rbd_id > 0);
 -
 -      dout("rbd_dev %p released dev id %llu\n", rbd_dev,
 -              (unsigned long long) rbd_dev->dev_id);
        spin_lock(&rbd_dev_list_lock);
        list_del_init(&rbd_dev->node);
 -
 -      /*
 -       * If the id being "put" is not the current maximum, there
 -       * is nothing special we need to do.
 -       */
 -      if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
 -              spin_unlock(&rbd_dev_list_lock);
 -              return;
 -      }
 -
 -      /*
 -       * We need to update the current maximum id.  Search the
 -       * list to find out what it is.  We're more likely to find
 -       * the maximum at the end, so search the list backward.
 -       */
 -      max_id = 0;
 -      list_for_each_prev(tmp, &rbd_dev_list) {
 -              struct rbd_device *rbd_dev;
 -
 -              rbd_dev = list_entry(tmp, struct rbd_device, node);
 -              if (rbd_dev->dev_id > max_id)
 -                      max_id = rbd_dev->dev_id;
 -      }
        spin_unlock(&rbd_dev_list_lock);
  
 -      /*
 -       * The max id could have been updated by rbd_dev_id_get(), in
 -       * which case it now accurately reflects the new maximum.
 -       * Be careful not to overwrite the maximum value in that
 -       * case.
 -       */
 -      atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
 -      dout("  max dev id has been reset\n");
 +      ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
 +
 +      dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id);
  }
  
  /*
@@@ -4902,29 -4801,20 +4843,29 @@@ static int rbd_dev_device_setup(struct 
  {
        int ret;
  
 -      /* generate unique id: find highest unique id, add one */
 -      rbd_dev_id_get(rbd_dev);
 +      /* Get an id and fill in device name. */
 +
 +      ret = rbd_dev_id_get(rbd_dev);
 +      if (ret)
 +              return ret;
  
 -      /* Fill in the device name, now that we have its id. */
        BUILD_BUG_ON(DEV_NAME_LEN
                        < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
        sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
  
 -      /* Get our block major device number. */
 +      /* Record our major and minor device numbers. */
  
 -      ret = register_blkdev(0, rbd_dev->name);
 -      if (ret < 0)
 -              goto err_out_id;
 -      rbd_dev->major = ret;
 +      if (!single_major) {
 +              ret = register_blkdev(0, rbd_dev->name);
 +              if (ret < 0)
 +                      goto err_out_id;
 +
 +              rbd_dev->major = ret;
 +              rbd_dev->minor = 0;
 +      } else {
 +              rbd_dev->major = rbd_major;
 +              rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
 +      }
  
        /* Set up the blkdev mapping. */
  
@@@ -4956,8 -4846,7 +4897,8 @@@ err_out_mapping
  err_out_disk:
        rbd_free_disk(rbd_dev);
  err_out_blkdev:
 -      unregister_blkdev(rbd_dev->major, rbd_dev->name);
 +      if (!single_major)
 +              unregister_blkdev(rbd_dev->major, rbd_dev->name);
  err_out_id:
        rbd_dev_id_put(rbd_dev);
        rbd_dev_mapping_clear(rbd_dev);
@@@ -5013,6 -4902,7 +4954,6 @@@ static void rbd_dev_image_release(struc
  static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
  {
        int ret;
 -      int tmp;
  
        /*
         * Get the id from the image id object.  Unless there's an
                goto err_out_format;
  
        if (mapping) {
 -              ret = rbd_dev_header_watch_sync(rbd_dev, true);
 +              ret = rbd_dev_header_watch_sync(rbd_dev);
                if (ret)
                        goto out_header_name;
        }
  err_out_probe:
        rbd_dev_unprobe(rbd_dev);
  err_out_watch:
 -      if (mapping) {
 -              tmp = rbd_dev_header_watch_sync(rbd_dev, false);
 -              if (tmp)
 -                      rbd_warn(rbd_dev, "unable to tear down "
 -                                      "watch request (%d)\n", tmp);
 -      }
 +      if (mapping)
 +              rbd_dev_header_unwatch_sync(rbd_dev);
  out_header_name:
        kfree(rbd_dev->header_name);
        rbd_dev->header_name = NULL;
@@@ -5073,9 -4967,9 +5014,9 @@@ err_out_format
        return ret;
  }
  
 -static ssize_t rbd_add(struct bus_type *bus,
 -                     const char *buf,
 -                     size_t count)
 +static ssize_t do_rbd_add(struct bus_type *bus,
 +                        const char *buf,
 +                        size_t count)
  {
        struct rbd_device *rbd_dev = NULL;
        struct ceph_options *ceph_opts = NULL;
  
        rc = rbd_dev_device_setup(rbd_dev);
        if (rc) {
 +              /*
 +               * rbd_dev_header_unwatch_sync() can't be moved into
 +               * rbd_dev_image_release() without refactoring, see
 +               * commit 1f3ef78861ac.
 +               */
 +              rbd_dev_header_unwatch_sync(rbd_dev);
                rbd_dev_image_release(rbd_dev);
                goto err_out_module;
        }
@@@ -5163,23 -5051,6 +5104,23 @@@ err_out_module
        return (ssize_t)rc;
  }
  
 +static ssize_t rbd_add(struct bus_type *bus,
 +                     const char *buf,
 +                     size_t count)
 +{
 +      if (single_major)
 +              return -EINVAL;
 +
 +      return do_rbd_add(bus, buf, count);
 +}
 +
 +static ssize_t rbd_add_single_major(struct bus_type *bus,
 +                                  const char *buf,
 +                                  size_t count)
 +{
 +      return do_rbd_add(bus, buf, count);
 +}
 +
  static void rbd_dev_device_release(struct device *dev)
  {
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
        rbd_free_disk(rbd_dev);
        clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
        rbd_dev_mapping_clear(rbd_dev);
 -      unregister_blkdev(rbd_dev->major, rbd_dev->name);
 -      rbd_dev->major = 0;
 +      if (!single_major)
 +              unregister_blkdev(rbd_dev->major, rbd_dev->name);
        rbd_dev_id_put(rbd_dev);
        rbd_dev_mapping_clear(rbd_dev);
  }
@@@ -5219,9 -5090,9 +5160,9 @@@ static void rbd_dev_remove_parent(struc
        }
  }
  
 -static ssize_t rbd_remove(struct bus_type *bus,
 -                        const char *buf,
 -                        size_t count)
 +static ssize_t do_rbd_remove(struct bus_type *bus,
 +                           const char *buf,
 +                           size_t count)
  {
        struct rbd_device *rbd_dev = NULL;
        struct list_head *tmp;
        if (ret < 0 || already)
                return ret;
  
 -      ret = rbd_dev_header_watch_sync(rbd_dev, false);
 -      if (ret)
 -              rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
 -
 +      rbd_dev_header_unwatch_sync(rbd_dev);
        /*
         * flush remaining watch callbacks - these must be complete
         * before the osd_client is shutdown
         */
        dout("%s: flushing notifies", __func__);
        ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
 +
        /*
         * Don't free anything from rbd_dev->disk until after all
         * notifies are completely processed. Otherwise
        return count;
  }
  
 +static ssize_t rbd_remove(struct bus_type *bus,
 +                        const char *buf,
 +                        size_t count)
 +{
 +      if (single_major)
 +              return -EINVAL;
 +
 +      return do_rbd_remove(bus, buf, count);
 +}
 +
 +static ssize_t rbd_remove_single_major(struct bus_type *bus,
 +                                     const char *buf,
 +                                     size_t count)
 +{
 +      return do_rbd_remove(bus, buf, count);
 +}
 +
  /*
   * create control files in sysfs
   * /sys/bus/rbd/...
@@@ -5344,7 -5200,7 +5285,7 @@@ static int rbd_slab_init(void
  
        rbd_assert(!rbd_segment_name_cache);
        rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
 -                                      MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL);
 +                                      CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
        if (rbd_segment_name_cache)
                return 0;
  out_err:
@@@ -5380,45 -5236,24 +5321,45 @@@ static int __init rbd_init(void
  
        if (!libceph_compatible(NULL)) {
                rbd_warn(NULL, "libceph incompatibility (quitting)");
 -
                return -EINVAL;
        }
 +
        rc = rbd_slab_init();
        if (rc)
                return rc;
 +
 +      if (single_major) {
 +              rbd_major = register_blkdev(0, RBD_DRV_NAME);
 +              if (rbd_major < 0) {
 +                      rc = rbd_major;
 +                      goto err_out_slab;
 +              }
 +      }
 +
        rc = rbd_sysfs_init();
        if (rc)
 -              rbd_slab_exit();
 +              goto err_out_blkdev;
 +
 +      if (single_major)
 +              pr_info("loaded (major %d)\n", rbd_major);
        else
 -              pr_info("loaded " RBD_DRV_NAME_LONG "\n");
 +              pr_info("loaded\n");
 +
 +      return 0;
  
 +err_out_blkdev:
 +      if (single_major)
 +              unregister_blkdev(rbd_major, RBD_DRV_NAME);
 +err_out_slab:
 +      rbd_slab_exit();
        return rc;
  }
  
  static void __exit rbd_exit(void)
  {
        rbd_sysfs_cleanup();
 +      if (single_major)
 +              unregister_blkdev(rbd_major, RBD_DRV_NAME);
        rbd_slab_exit();
  }
  
@@@ -5428,8 -5263,9 +5369,8 @@@ module_exit(rbd_exit)
  MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
  MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
  MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
 -MODULE_DESCRIPTION("rados block device");
 -
  /* following authorship retained from original osdblk.c */
  MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
  
 +MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
  MODULE_LICENSE("GPL");
diff --combined drivers/block/xen-blkfront.c
index f9c43f91f03e5de68bff030b663f094e56fc1f9f,26ad7923e3319c802a4ab252001ecdf8785b8d60..8dcfb54f160302e0e1d91c232387f758b2f8e0f6
@@@ -1356,7 -1356,7 +1356,7 @@@ static int blkfront_probe(struct xenbus
                char *type;
                int len;
                /* no unplug has been done: do not hook devices != xen vbds */
 -              if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) {
 +              if (xen_has_pv_and_legacy_disk_devices()) {
                        int major;
  
                        if (!VDEV_IS_EXTENDED(vdevice))
@@@ -1547,7 -1547,7 +1547,7 @@@ static int blkif_recover(struct blkfron
                        for (i = 0; i < pending; i++) {
                                offset = (i * segs * PAGE_SIZE) >> 9;
                                size = min((unsigned int)(segs * PAGE_SIZE) >> 9,
-                                          (unsigned int)(bio->bi_size >> 9) - offset);
+                                          (unsigned int)bio_sectors(bio) - offset);
                                cloned_bio = bio_clone(bio, GFP_NOIO);
                                BUG_ON(cloned_bio == NULL);
                                bio_trim(cloned_bio, offset, size);
@@@ -2079,7 -2079,7 +2079,7 @@@ static int __init xlblk_init(void
        if (!xen_domain())
                return -ENODEV;
  
 -      if (xen_hvm_domain() && !xen_platform_pci_unplug)
 +      if (!xen_has_pv_disk_devices())
                return -ENODEV;
  
        if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
diff --combined drivers/md/bcache/request.c
index 61bcfc21d2a0f4972b581a689fd1c3c929f7bd38,5878cdb3952948d78e029f4ed0a343d7a7807cab..c906571997d7a4ab256188f05f4a8c11ea5928f8
@@@ -163,6 -163,7 +163,6 @@@ static struct cgroup_subsys_state *bcac
  static void bcachecg_destroy(struct cgroup *cgroup)
  {
        struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
 -      free_css_id(&bcache_subsys, &cg->css);
        kfree(cg);
  }
  
@@@ -197,14 -198,14 +197,14 @@@ static bool verify(struct cached_dev *d
  
  static void bio_csum(struct bio *bio, struct bkey *k)
  {
-       struct bio_vec *bv;
+       struct bio_vec bv;
+       struct bvec_iter iter;
        uint64_t csum = 0;
-       int i;
  
-       bio_for_each_segment(bv, bio, i) {
-               void *d = kmap(bv->bv_page) + bv->bv_offset;
-               csum = bch_crc64_update(csum, d, bv->bv_len);
-               kunmap(bv->bv_page);
+       bio_for_each_segment(bv, bio, iter) {
+               void *d = kmap(bv.bv_page) + bv.bv_offset;
+               csum = bch_crc64_update(csum, d, bv.bv_len);
+               kunmap(bv.bv_page);
        }
  
        k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
@@@ -260,7 -261,7 +260,7 @@@ static void bch_data_invalidate(struct 
        struct bio *bio = op->bio;
  
        pr_debug("invalidating %i sectors from %llu",
-                bio_sectors(bio), (uint64_t) bio->bi_sector);
+                bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector);
  
        while (bio_sectors(bio)) {
                unsigned sectors = min(bio_sectors(bio),
                if (bch_keylist_realloc(&op->insert_keys, 0, op->c))
                        goto out;
  
-               bio->bi_sector  += sectors;
-               bio->bi_size    -= sectors << 9;
+               bio->bi_iter.bi_sector  += sectors;
+               bio->bi_iter.bi_size    -= sectors << 9;
  
                bch_keylist_add(&op->insert_keys,
-                               &KEY(op->inode, bio->bi_sector, sectors));
+                               &KEY(op->inode, bio->bi_iter.bi_sector, sectors));
        }
  
        op->insert_data_done = true;
@@@ -363,14 -364,14 +363,14 @@@ static void bch_data_insert_start(struc
                k = op->insert_keys.top;
                bkey_init(k);
                SET_KEY_INODE(k, op->inode);
-               SET_KEY_OFFSET(k, bio->bi_sector);
+               SET_KEY_OFFSET(k, bio->bi_iter.bi_sector);
  
                if (!bch_alloc_sectors(op->c, k, bio_sectors(bio),
                                       op->write_point, op->write_prio,
                                       op->writeback))
                        goto err;
  
-               n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
+               n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);
  
                n->bi_end_io    = bch_data_insert_endio;
                n->bi_private   = cl;
@@@ -521,7 -522,7 +521,7 @@@ static bool check_should_bypass(struct 
             (bio->bi_rw & REQ_WRITE)))
                goto skip;
  
-       if (bio->bi_sector & (c->sb.block_size - 1) ||
+       if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
            bio_sectors(bio) & (c->sb.block_size - 1)) {
                pr_debug("skipping unaligned io");
                goto skip;
  
        spin_lock(&dc->io_lock);
  
-       hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
-               if (i->last == bio->bi_sector &&
+       hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
+               if (i->last == bio->bi_iter.bi_sector &&
                    time_before(jiffies, i->jiffies))
                        goto found;
  
        add_sequential(task);
        i->sequential = 0;
  found:
-       if (i->sequential + bio->bi_size > i->sequential)
-               i->sequential   += bio->bi_size;
+       if (i->sequential + bio->bi_iter.bi_size > i->sequential)
+               i->sequential   += bio->bi_iter.bi_size;
  
        i->last                  = bio_end_sector(bio);
        i->jiffies               = jiffies + msecs_to_jiffies(5000);
@@@ -605,7 -606,6 +605,6 @@@ struct search 
        unsigned                insert_bio_sectors;
  
        unsigned                recoverable:1;
-       unsigned                unaligned_bvec:1;
        unsigned                write:1;
        unsigned                read_dirty_data:1;
  
@@@ -649,15 -649,15 +648,15 @@@ static int cache_lookup_fn(struct btree
        struct bkey *bio_key;
        unsigned ptr;
  
-       if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_sector, 0)) <= 0)
+       if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0)
                return MAP_CONTINUE;
  
        if (KEY_INODE(k) != s->iop.inode ||
-           KEY_START(k) > bio->bi_sector) {
+           KEY_START(k) > bio->bi_iter.bi_sector) {
                unsigned bio_sectors = bio_sectors(bio);
                unsigned sectors = KEY_INODE(k) == s->iop.inode
                        ? min_t(uint64_t, INT_MAX,
-                               KEY_START(k) - bio->bi_sector)
+                               KEY_START(k) - bio->bi_iter.bi_sector)
                        : INT_MAX;
  
                int ret = s->d->cache_miss(b, s, bio, sectors);
        if (KEY_DIRTY(k))
                s->read_dirty_data = true;
  
-       n = bch_bio_split(bio, min_t(uint64_t, INT_MAX,
-                                    KEY_OFFSET(k) - bio->bi_sector),
-                         GFP_NOIO, s->d->bio_split);
+       n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
+                                     KEY_OFFSET(k) - bio->bi_iter.bi_sector),
+                          GFP_NOIO, s->d->bio_split);
  
        bio_key = &container_of(n, struct bbio, bio)->key;
        bch_bkey_copy_single_ptr(bio_key, k, ptr);
  
-       bch_cut_front(&KEY(s->iop.inode, n->bi_sector, 0), bio_key);
+       bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key);
        bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);
  
        n->bi_end_io    = bch_cache_read_endio;
@@@ -713,7 -713,7 +712,7 @@@ static void cache_lookup(struct closur
        struct bio *bio = &s->bio.bio;
  
        int ret = bch_btree_map_keys(&s->op, s->iop.c,
-                                    &KEY(s->iop.inode, bio->bi_sector, 0),
+                                    &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
                                     cache_lookup_fn, MAP_END_KEY);
        if (ret == -EAGAIN)
                continue_at(cl, cache_lookup, bcache_wq);
@@@ -758,10 -758,12 +757,12 @@@ static void bio_complete(struct search 
  static void do_bio_hook(struct search *s)
  {
        struct bio *bio = &s->bio.bio;
-       memcpy(bio, s->orig_bio, sizeof(struct bio));
  
+       bio_init(bio);
+       __bio_clone_fast(bio, s->orig_bio);
        bio->bi_end_io          = request_endio;
        bio->bi_private         = &s->cl;
        atomic_set(&bio->bi_cnt, 3);
  }
  
@@@ -773,9 -775,6 +774,6 @@@ static void search_free(struct closure 
        if (s->iop.bio)
                bio_put(s->iop.bio);
  
-       if (s->unaligned_bvec)
-               mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec);
        closure_debug_destroy(cl);
        mempool_free(s, s->d->c->search);
  }
  static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
  {
        struct search *s;
-       struct bio_vec *bv;
  
        s = mempool_alloc(d->c->search, GFP_NOIO);
        memset(s, 0, offsetof(struct search, iop.insert_keys));
        s->start_time           = jiffies;
        do_bio_hook(s);
  
-       if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) {
-               bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO);
-               memcpy(bv, bio_iovec(bio),
-                      sizeof(struct bio_vec) * bio_segments(bio));
-               s->bio.bio.bi_io_vec    = bv;
-               s->unaligned_bvec       = 1;
-       }
        return s;
  }
  
@@@ -849,26 -838,13 +837,13 @@@ static void cached_dev_read_error(struc
  {
        struct search *s = container_of(cl, struct search, cl);
        struct bio *bio = &s->bio.bio;
-       struct bio_vec *bv;
-       int i;
  
        if (s->recoverable) {
                /* Retry from the backing device: */
                trace_bcache_read_retry(s->orig_bio);
  
                s->iop.error = 0;
-               bv = s->bio.bio.bi_io_vec;
                do_bio_hook(s);
-               s->bio.bio.bi_io_vec = bv;
-               if (!s->unaligned_bvec)
-                       bio_for_each_segment(bv, s->orig_bio, i)
-                               bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
-               else
-                       memcpy(s->bio.bio.bi_io_vec,
-                              bio_iovec(s->orig_bio),
-                              sizeof(struct bio_vec) *
-                              bio_segments(s->orig_bio));
  
                /* XXX: invalidate cache */
  
@@@ -893,9 -869,9 +868,9 @@@ static void cached_dev_read_done(struc
  
        if (s->iop.bio) {
                bio_reset(s->iop.bio);
-               s->iop.bio->bi_sector = s->cache_miss->bi_sector;
+               s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector;
                s->iop.bio->bi_bdev = s->cache_miss->bi_bdev;
-               s->iop.bio->bi_size = s->insert_bio_sectors << 9;
+               s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
                bch_bio_map(s->iop.bio, NULL);
  
                bio_copy_data(s->cache_miss, s->iop.bio);
                s->cache_miss = NULL;
        }
  
-       if (verify(dc, &s->bio.bio) && s->recoverable &&
-           !s->unaligned_bvec && !s->read_dirty_data)
+       if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data)
                bch_data_verify(dc, s->orig_bio);
  
        bio_complete(s);
@@@ -945,7 -920,7 +919,7 @@@ static int cached_dev_cache_miss(struc
        struct bio *miss, *cache_bio;
  
        if (s->cache_miss || s->iop.bypass) {
-               miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+               miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
                ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
                goto out_submit;
        }
        s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada);
  
        s->iop.replace_key = KEY(s->iop.inode,
-                                bio->bi_sector + s->insert_bio_sectors,
+                                bio->bi_iter.bi_sector + s->insert_bio_sectors,
                                 s->insert_bio_sectors);
  
        ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key);
  
        s->iop.replace = true;
  
-       miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+       miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
  
        /* btree_search_recurse()'s btree iterator is no good anymore */
        ret = miss == bio ? MAP_DONE : -EINTR;
        if (!cache_bio)
                goto out_submit;
  
-       cache_bio->bi_sector    = miss->bi_sector;
-       cache_bio->bi_bdev      = miss->bi_bdev;
-       cache_bio->bi_size      = s->insert_bio_sectors << 9;
+       cache_bio->bi_iter.bi_sector    = miss->bi_iter.bi_sector;
+       cache_bio->bi_bdev              = miss->bi_bdev;
+       cache_bio->bi_iter.bi_size      = s->insert_bio_sectors << 9;
  
        cache_bio->bi_end_io    = request_endio;
        cache_bio->bi_private   = &s->cl;
@@@ -1031,7 -1006,7 +1005,7 @@@ static void cached_dev_write(struct cac
  {
        struct closure *cl = &s->cl;
        struct bio *bio = &s->bio.bio;
-       struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0);
+       struct bkey start = KEY(dc->disk.id, bio->bi_iter.bi_sector, 0);
        struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0);
  
        bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end);
                        closure_bio_submit(flush, cl, s->d);
                }
        } else {
-               s->iop.bio = bio_clone_bioset(bio, GFP_NOIO,
-                                             dc->disk.bio_split);
+               s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split);
  
                closure_bio_submit(bio, cl, s->d);
        }
@@@ -1126,13 -1100,13 +1099,13 @@@ static void cached_dev_make_request(str
        part_stat_unlock();
  
        bio->bi_bdev = dc->bdev;
-       bio->bi_sector += dc->sb.data_offset;
+       bio->bi_iter.bi_sector += dc->sb.data_offset;
  
        if (cached_dev_get(dc)) {
                s = search_alloc(bio, d);
                trace_bcache_request_start(s->d, bio);
  
-               if (!bio->bi_size) {
+               if (!bio->bi_iter.bi_size) {
                        /*
                         * can't call bch_journal_meta from under
                         * generic_make_request
@@@ -1204,24 -1178,24 +1177,24 @@@ void bch_cached_dev_request_init(struc
  static int flash_dev_cache_miss(struct btree *b, struct search *s,
                                struct bio *bio, unsigned sectors)
  {
-       struct bio_vec *bv;
-       int i;
+       struct bio_vec bv;
+       struct bvec_iter iter;
  
        /* Zero fill bio */
  
-       bio_for_each_segment(bv, bio, i) {
-               unsigned j = min(bv->bv_len >> 9, sectors);
+       bio_for_each_segment(bv, bio, iter) {
+               unsigned j = min(bv.bv_len >> 9, sectors);
  
-               void *p = kmap(bv->bv_page);
-               memset(p + bv->bv_offset, 0, j << 9);
-               kunmap(bv->bv_page);
+               void *p = kmap(bv.bv_page);
+               memset(p + bv.bv_offset, 0, j << 9);
+               kunmap(bv.bv_page);
  
                sectors -= j;
        }
  
-       bio_advance(bio, min(sectors << 9, bio->bi_size));
+       bio_advance(bio, min(sectors << 9, bio->bi_iter.bi_size));
  
-       if (!bio->bi_size)
+       if (!bio->bi_iter.bi_size)
                return MAP_DONE;
  
        return MAP_CONTINUE;
@@@ -1255,7 -1229,7 +1228,7 @@@ static void flash_dev_make_request(stru
  
        trace_bcache_request_start(s->d, bio);
  
-       if (!bio->bi_size) {
+       if (!bio->bi_iter.bi_size) {
                /*
                 * can't call bch_journal_meta from under
                 * generic_make_request
                                      bcache_wq);
        } else if (rw) {
                bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
-                                       &KEY(d->id, bio->bi_sector, 0),
+                                       &KEY(d->id, bio->bi_iter.bi_sector, 0),
                                        &KEY(d->id, bio_end_sector(bio), 0));
  
                s->iop.bypass           = (bio->bi_rw & REQ_DISCARD) != 0;
diff --combined drivers/md/dm-bufio.c
index 9ed42125514b38d560464e4dd3d741038db06858,a1b58a65d8ed849ecef2217e1b93f2a5d3e42ba1..66c5d130c8c24c4f3101ce78296460da4487f38b
@@@ -104,8 -104,6 +104,8 @@@ struct dm_bufio_client 
        struct list_head reserved_buffers;
        unsigned need_reserved_buffers;
  
 +      unsigned minimum_buffers;
 +
        struct hlist_head *cache_hash;
        wait_queue_head_t free_buffer_wait;
  
@@@ -540,7 -538,7 +540,7 @@@ static void use_inline_bio(struct dm_bu
        bio_init(&b->bio);
        b->bio.bi_io_vec = b->bio_vec;
        b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
-       b->bio.bi_sector = block << b->c->sectors_per_block_bits;
+       b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
        b->bio.bi_bdev = b->c->bdev;
        b->bio.bi_end_io = end_io;
  
@@@ -863,8 -861,8 +863,8 @@@ static void __get_memory_limit(struct d
        buffers = dm_bufio_cache_size_per_client >>
                  (c->sectors_per_block_bits + SECTOR_SHIFT);
  
 -      if (buffers < DM_BUFIO_MIN_BUFFERS)
 -              buffers = DM_BUFIO_MIN_BUFFERS;
 +      if (buffers < c->minimum_buffers)
 +              buffers = c->minimum_buffers;
  
        *limit_buffers = buffers;
        *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
@@@ -1352,34 -1350,6 +1352,34 @@@ retry
  }
  EXPORT_SYMBOL_GPL(dm_bufio_release_move);
  
 +/*
 + * Free the given buffer.
 + *
 + * This is just a hint, if the buffer is in use or dirty, this function
 + * does nothing.
 + */
 +void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
 +{
 +      struct dm_buffer *b;
 +
 +      dm_bufio_lock(c);
 +
 +      b = __find(c, block);
 +      if (b && likely(!b->hold_count) && likely(!b->state)) {
 +              __unlink_buffer(b);
 +              __free_buffer_wake(b);
 +      }
 +
 +      dm_bufio_unlock(c);
 +}
 +EXPORT_SYMBOL(dm_bufio_forget);
 +
 +void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
 +{
 +      c->minimum_buffers = n;
 +}
 +EXPORT_SYMBOL(dm_bufio_set_minimum_buffers);
 +
  unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
  {
        return c->block_size;
@@@ -1576,8 -1546,6 +1576,8 @@@ struct dm_bufio_client *dm_bufio_client
        INIT_LIST_HEAD(&c->reserved_buffers);
        c->need_reserved_buffers = reserved_buffers;
  
 +      c->minimum_buffers = DM_BUFIO_MIN_BUFFERS;
 +
        init_waitqueue_head(&c->free_buffer_wait);
        c->async_write_error = 0;
  
diff --combined drivers/md/dm-cache-policy-mq.c
index 930e8c3d73e985b1e75769a9894f13ffd32d756a,d13a16865d03ddc4ec418618d3d8fbdb4909ddc8..1e018e986610a57ef9f82a818aa1f70a8c364e30
@@@ -72,7 -72,7 +72,7 @@@ static enum io_pattern iot_pattern(stru
  
  static void iot_update_stats(struct io_tracker *t, struct bio *bio)
  {
-       if (bio->bi_sector == from_oblock(t->last_end_oblock) + 1)
+       if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1)
                t->nr_seq_samples++;
        else {
                /*
@@@ -87,7 -87,7 +87,7 @@@
                t->nr_rand_samples++;
        }
  
-       t->last_end_oblock = to_oblock(bio->bi_sector + bio_sectors(bio) - 1);
+       t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1);
  }
  
  static void iot_check_for_pattern_switch(struct io_tracker *t)
@@@ -287,8 -287,9 +287,8 @@@ static struct entry *alloc_entry(struc
  static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
  {
        struct entry *e = ep->entries + from_cblock(cblock);
 -      list_del(&e->list);
  
 -      INIT_LIST_HEAD(&e->list);
 +      list_del_init(&e->list);
        INIT_HLIST_NODE(&e->hlist);
        ep->nr_allocated++;
  
@@@ -390,10 -391,6 +390,10 @@@ struct mq_policy 
         */
        unsigned promote_threshold;
  
 +      unsigned discard_promote_adjustment;
 +      unsigned read_promote_adjustment;
 +      unsigned write_promote_adjustment;
 +
        /*
         * The hash table allows us to quickly find an entry by origin
         * block.  Both pre_cache and cache entries are in here.
        struct hlist_head *table;
  };
  
 +#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1
 +#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4
 +#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8
 +
  /*----------------------------------------------------------------*/
  
  /*
@@@ -649,21 -642,25 +649,21 @@@ static int demote_cblock(struct mq_poli
   * We bias towards reads, since they can be demoted at no cost if they
   * haven't been dirtied.
   */
 -#define DISCARDED_PROMOTE_THRESHOLD 1
 -#define READ_PROMOTE_THRESHOLD 4
 -#define WRITE_PROMOTE_THRESHOLD 8
 -
  static unsigned adjusted_promote_threshold(struct mq_policy *mq,
                                           bool discarded_oblock, int data_dir)
  {
        if (data_dir == READ)
 -              return mq->promote_threshold + READ_PROMOTE_THRESHOLD;
 +              return mq->promote_threshold + mq->read_promote_adjustment;
  
        if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
                /*
                 * We don't need to do any copying at all, so give this a
                 * very low threshold.
                 */
 -              return DISCARDED_PROMOTE_THRESHOLD;
 +              return mq->discard_promote_adjustment;
        }
  
 -      return mq->promote_threshold + WRITE_PROMOTE_THRESHOLD;
 +      return mq->promote_threshold + mq->write_promote_adjustment;
  }
  
  static bool should_promote(struct mq_policy *mq, struct entry *e,
@@@ -812,7 -809,7 +812,7 @@@ static int no_entry_found(struct mq_pol
                          bool can_migrate, bool discarded_oblock,
                          int data_dir, struct policy_result *result)
  {
 -      if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) == 1) {
 +      if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
                if (can_migrate)
                        insert_in_cache(mq, oblock, result);
                else
@@@ -1138,28 -1135,20 +1138,28 @@@ static int mq_set_config_value(struct d
                               const char *key, const char *value)
  {
        struct mq_policy *mq = to_mq_policy(p);
 -      enum io_pattern pattern;
        unsigned long tmp;
  
 -      if (!strcasecmp(key, "random_threshold"))
 -              pattern = PATTERN_RANDOM;
 -      else if (!strcasecmp(key, "sequential_threshold"))
 -              pattern = PATTERN_SEQUENTIAL;
 -      else
 -              return -EINVAL;
 -
        if (kstrtoul(value, 10, &tmp))
                return -EINVAL;
  
 -      mq->tracker.thresholds[pattern] = tmp;
 +      if (!strcasecmp(key, "random_threshold")) {
 +              mq->tracker.thresholds[PATTERN_RANDOM] = tmp;
 +
 +      } else if (!strcasecmp(key, "sequential_threshold")) {
 +              mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp;
 +
 +      } else if (!strcasecmp(key, "discard_promote_adjustment"))
 +              mq->discard_promote_adjustment = tmp;
 +
 +      else if (!strcasecmp(key, "read_promote_adjustment"))
 +              mq->read_promote_adjustment = tmp;
 +
 +      else if (!strcasecmp(key, "write_promote_adjustment"))
 +              mq->write_promote_adjustment = tmp;
 +
 +      else
 +              return -EINVAL;
  
        return 0;
  }
@@@ -1169,16 -1158,9 +1169,16 @@@ static int mq_emit_config_values(struc
        ssize_t sz = 0;
        struct mq_policy *mq = to_mq_policy(p);
  
 -      DMEMIT("4 random_threshold %u sequential_threshold %u",
 +      DMEMIT("10 random_threshold %u "
 +             "sequential_threshold %u "
 +             "discard_promote_adjustment %u "
 +             "read_promote_adjustment %u "
 +             "write_promote_adjustment %u",
               mq->tracker.thresholds[PATTERN_RANDOM],
 -             mq->tracker.thresholds[PATTERN_SEQUENTIAL]);
 +             mq->tracker.thresholds[PATTERN_SEQUENTIAL],
 +             mq->discard_promote_adjustment,
 +             mq->read_promote_adjustment,
 +             mq->write_promote_adjustment);
  
        return 0;
  }
@@@ -1231,9 -1213,6 +1231,9 @@@ static struct dm_cache_policy *mq_creat
        mq->hit_count = 0;
        mq->generation = 0;
        mq->promote_threshold = 0;
 +      mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT;
 +      mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT;
 +      mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT;
        mutex_init(&mq->lock);
        spin_lock_init(&mq->tick_lock);
  
@@@ -1265,7 -1244,7 +1265,7 @@@ bad_pre_cache_init
  
  static struct dm_cache_policy_type mq_policy_type = {
        .name = "mq",
 -      .version = {1, 1, 0},
 +      .version = {1, 2, 0},
        .hint_size = 4,
        .owner = THIS_MODULE,
        .create = mq_create
  
  static struct dm_cache_policy_type default_policy_type = {
        .name = "default",
 -      .version = {1, 1, 0},
 +      .version = {1, 2, 0},
        .hint_size = 4,
        .owner = THIS_MODULE,
 -      .create = mq_create
 +      .create = mq_create,
 +      .real = &mq_policy_type
  };
  
  static int __init mq_init(void)
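The recurring change across these hunks is the immutable biovec conversion: the cursor state that used to live directly on struct bio (bi_sector, bi_size, bi_idx) now lives in bio->bi_iter, and open-coded arithmetic gives way to the bio_sectors()/bio_end_sector() helpers, as in the iot_update_stats() hunk above. A minimal sketch of the accessor change, assuming only the 3.14 struct bio layout and not tied to any one driver:

#include <linux/bio.h>

/* 3.14+: the advancing position lives in bio->bi_iter. */
static sector_t sketch_start_sector(struct bio *bio)
{
        return bio->bi_iter.bi_sector;          /* was bio->bi_sector */
}

static unsigned sketch_bytes_remaining(struct bio *bio)
{
        return bio->bi_iter.bi_size;            /* was bio->bi_size */
}

static sector_t sketch_end_sector(struct bio *bio)
{
        /* bio_end_sector() == bi_iter.bi_sector + bio_sectors(bio) */
        return bio_end_sector(bio);
}
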
index 09334c275c79e91c7bf4fd41e18e641b2196073a,99f91628a33aa6b6969d2b87b61a861e85d16de6..ffd472e015caa918facaed4f65a621c0f61e58a9
@@@ -85,6 -85,12 +85,12 @@@ static void dm_unhook_bio(struct dm_hoo
  {
        bio->bi_end_io = h->bi_end_io;
        bio->bi_private = h->bi_private;
+       /*
+        * Must bump bi_remaining to allow bio to complete with
+        * restored bi_end_io.
+        */
+       atomic_inc(&bio->bi_remaining);
  }
  
  /*----------------------------------------------------------------*/
@@@ -664,15 -670,17 +670,17 @@@ static void remap_to_origin(struct cach
  static void remap_to_cache(struct cache *cache, struct bio *bio,
                           dm_cblock_t cblock)
  {
-       sector_t bi_sector = bio->bi_sector;
+       sector_t bi_sector = bio->bi_iter.bi_sector;
  
        bio->bi_bdev = cache->cache_dev->bdev;
        if (!block_size_is_power_of_two(cache))
-               bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
-                               sector_div(bi_sector, cache->sectors_per_block);
+               bio->bi_iter.bi_sector =
+                       (from_cblock(cblock) * cache->sectors_per_block) +
+                       sector_div(bi_sector, cache->sectors_per_block);
        else
-               bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
-                               (bi_sector & (cache->sectors_per_block - 1));
+               bio->bi_iter.bi_sector =
+                       (from_cblock(cblock) << cache->sectors_per_block_shift) |
+                       (bi_sector & (cache->sectors_per_block - 1));
  }
  
  static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
@@@ -712,7 -720,7 +720,7 @@@ static void remap_to_cache_dirty(struc
  
  static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
  {
-       sector_t block_nr = bio->bi_sector;
+       sector_t block_nr = bio->bi_iter.bi_sector;
  
        if (!block_size_is_power_of_two(cache))
                (void) sector_div(block_nr, cache->sectors_per_block);
@@@ -1027,7 -1035,7 +1035,7 @@@ static void issue_overwrite(struct dm_c
  static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
  {
        return (bio_data_dir(bio) == WRITE) &&
-               (bio->bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
+               (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
  }
  
  static void avoid_copy(struct dm_cache_migration *mg)
@@@ -1252,7 -1260,7 +1260,7 @@@ static void process_flush_bio(struct ca
        size_t pb_data_size = get_per_bio_data_size(cache);
        struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
  
-       BUG_ON(bio->bi_size);
+       BUG_ON(bio->bi_iter.bi_size);
        if (!pb->req_nr)
                remap_to_origin(cache, bio);
        else
   */
  static void process_discard_bio(struct cache *cache, struct bio *bio)
  {
-       dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
+       dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector,
                                                  cache->discard_block_size);
-       dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
+       dm_block_t end_block = bio_end_sector(bio);
        dm_block_t b;
  
        end_block = block_div(end_block, cache->discard_block_size);
@@@ -2826,13 -2834,12 +2834,13 @@@ static void cache_resume(struct dm_targ
  /*
   * Status format:
   *
 - * <#used metadata blocks>/<#total metadata blocks>
 + * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
 + * <cache block size> <#used cache blocks>/<#total cache blocks>
   * <#read hits> <#read misses> <#write hits> <#write misses>
 - * <#demotions> <#promotions> <#blocks in cache> <#dirty>
 + * <#demotions> <#promotions> <#dirty>
   * <#features> <features>*
   * <#core args> <core args>
 - * <#policy args> <policy args>*
 + * <policy name> <#policy args> <policy args>*
   */
  static void cache_status(struct dm_target *ti, status_type_t type,
                         unsigned status_flags, char *result, unsigned maxlen)
  
                residency = policy_residency(cache->policy);
  
 -              DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
 +              DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %llu ",
 +                     (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
                       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
                       (unsigned long long)nr_blocks_metadata,
 +                     cache->sectors_per_block,
 +                     (unsigned long long) from_cblock(residency),
 +                     (unsigned long long) from_cblock(cache->cache_size),
                       (unsigned) atomic_read(&cache->stats.read_hit),
                       (unsigned) atomic_read(&cache->stats.read_miss),
                       (unsigned) atomic_read(&cache->stats.write_hit),
                       (unsigned) atomic_read(&cache->stats.write_miss),
                       (unsigned) atomic_read(&cache->stats.demotion),
                       (unsigned) atomic_read(&cache->stats.promotion),
 -                     (unsigned long long) from_cblock(residency),
 -                     cache->nr_dirty);
 +                     (unsigned long long) from_cblock(cache->nr_dirty));
  
                if (writethrough_mode(&cache->features))
                        DMEMIT("1 writethrough ");
                }
  
                DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
 +
 +              DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
                if (sz < maxlen) {
                        r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
                        if (r)
@@@ -3135,7 -3137,7 +3143,7 @@@ static void cache_io_hints(struct dm_ta
  
  static struct target_type cache_target = {
        .name = "cache",
 -      .version = {1, 2, 0},
 +      .version = {1, 3, 0},
        .module = THIS_MODULE,
        .ctr = cache_ctr,
        .dtr = cache_dtr,
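dm-cache (and dm-thin and dm-snap below) temporarily hooks a bio's bi_end_io/bi_private and restores them later; with the new bio chaining, bio_endio() decrements bi_remaining, so the restore path must bump bi_remaining back up or the original completion handler never runs, which is exactly what the dm_unhook_bio() hunk adds. A minimal sketch of that save/override/restore pattern; my_hook_info, my_hook_bio and my_unhook_bio are illustrative names, not the driver's helpers:

#include <linux/bio.h>

struct my_hook_info {
        bio_end_io_t *bi_end_io;
        void *bi_private;
};

static void my_hook_bio(struct my_hook_info *h, struct bio *bio,
                        bio_end_io_t *endio, void *private)
{
        h->bi_end_io = bio->bi_end_io;
        h->bi_private = bio->bi_private;
        bio->bi_end_io = endio;
        bio->bi_private = private;
}

static void my_unhook_bio(struct my_hook_info *h, struct bio *bio)
{
        bio->bi_end_io = h->bi_end_io;
        bio->bi_private = h->bi_private;
        /*
         * The hooked endio already consumed one bi_remaining reference
         * when it completed; take it back so the restored bi_end_io can
         * still be called.
         */
        atomic_inc(&bio->bi_remaining);
}
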
diff --combined drivers/md/dm-delay.c
index a8a511c053a5d5fda6574933e616719256768d31,fc8482a65dd27d0083e1c0f226c619c4cc3142f8..42c3a27a14cc3a906b5f892a6206de348b6b58ee
@@@ -24,6 -24,7 +24,6 @@@ struct delay_c 
        struct work_struct flush_expired_bios;
        struct list_head delayed_bios;
        atomic_t may_delay;
 -      mempool_t *delayed_pool;
  
        struct dm_dev *dev_read;
        sector_t start_read;
  struct dm_delay_info {
        struct delay_c *context;
        struct list_head list;
 -      struct bio *bio;
        unsigned long expires;
  };
  
  static DEFINE_MUTEX(delayed_bios_lock);
  
 -static struct kmem_cache *delayed_cache;
 -
  static void handle_delayed_timer(unsigned long data)
  {
        struct delay_c *dc = (struct delay_c *)data;
@@@ -83,14 -87,13 +83,14 @@@ static struct bio *flush_delayed_bios(s
        mutex_lock(&delayed_bios_lock);
        list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
                if (flush_all || time_after_eq(jiffies, delayed->expires)) {
 +                      struct bio *bio = dm_bio_from_per_bio_data(delayed,
 +                                              sizeof(struct dm_delay_info));
                        list_del(&delayed->list);
 -                      bio_list_add(&flush_bios, delayed->bio);
 -                      if ((bio_data_dir(delayed->bio) == WRITE))
 +                      bio_list_add(&flush_bios, bio);
 +                      if ((bio_data_dir(bio) == WRITE))
                                delayed->context->writes--;
                        else
                                delayed->context->reads--;
 -                      mempool_free(delayed, dc->delayed_pool);
                        continue;
                }
  
@@@ -182,6 -185,12 +182,6 @@@ static int delay_ctr(struct dm_target *
        }
  
  out:
 -      dc->delayed_pool = mempool_create_slab_pool(128, delayed_cache);
 -      if (!dc->delayed_pool) {
 -              DMERR("Couldn't create delayed bio pool.");
 -              goto bad_dev_write;
 -      }
 -
        dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
        if (!dc->kdelayd_wq) {
                DMERR("Couldn't start kdelayd");
  
        ti->num_flush_bios = 1;
        ti->num_discard_bios = 1;
 +      ti->per_bio_data_size = sizeof(struct dm_delay_info);
        ti->private = dc;
        return 0;
  
  bad_queue:
 -      mempool_destroy(dc->delayed_pool);
 -bad_dev_write:
        if (dc->dev_write)
                dm_put_device(ti, dc->dev_write);
  bad_dev_read:
@@@ -222,6 -232,7 +222,6 @@@ static void delay_dtr(struct dm_target 
        if (dc->dev_write)
                dm_put_device(ti, dc->dev_write);
  
 -      mempool_destroy(dc->delayed_pool);
        kfree(dc);
  }
  
@@@ -233,9 -244,10 +233,9 @@@ static int delay_bio(struct delay_c *dc
        if (!delay || !atomic_read(&dc->may_delay))
                return 1;
  
 -      delayed = mempool_alloc(dc->delayed_pool, GFP_NOIO);
 +      delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
  
        delayed->context = dc;
 -      delayed->bio = bio;
        delayed->expires = expires = jiffies + (delay * HZ / 1000);
  
        mutex_lock(&delayed_bios_lock);
@@@ -277,14 -289,15 +277,15 @@@ static int delay_map(struct dm_target *
        if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) {
                bio->bi_bdev = dc->dev_write->bdev;
                if (bio_sectors(bio))
-                       bio->bi_sector = dc->start_write +
-                                        dm_target_offset(ti, bio->bi_sector);
+                       bio->bi_iter.bi_sector = dc->start_write +
+                               dm_target_offset(ti, bio->bi_iter.bi_sector);
  
                return delay_bio(dc, dc->write_delay, bio);
        }
  
        bio->bi_bdev = dc->dev_read->bdev;
-       bio->bi_sector = dc->start_read + dm_target_offset(ti, bio->bi_sector);
+       bio->bi_iter.bi_sector = dc->start_read +
+               dm_target_offset(ti, bio->bi_iter.bi_sector);
  
        return delay_bio(dc, dc->read_delay, bio);
  }
@@@ -344,7 -357,13 +345,7 @@@ static struct target_type delay_target 
  
  static int __init dm_delay_init(void)
  {
 -      int r = -ENOMEM;
 -
 -      delayed_cache = KMEM_CACHE(dm_delay_info, 0);
 -      if (!delayed_cache) {
 -              DMERR("Couldn't create delayed bio cache.");
 -              goto bad_memcache;
 -      }
 +      int r;
  
        r = dm_register_target(&delay_target);
        if (r < 0) {
        return 0;
  
  bad_register:
 -      kmem_cache_destroy(delayed_cache);
 -bad_memcache:
        return r;
  }
  
  static void __exit dm_delay_exit(void)
  {
        dm_unregister_target(&delay_target);
 -      kmem_cache_destroy(delayed_cache);
  }
  
  /* Module hooks */
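dm-delay no longer needs its own kmem_cache and mempool because struct dm_delay_info now rides in the per-bio data that device-mapper allocates with every cloned bio: the target declares the size in its constructor and converts between payload and bio with dm_per_bio_data()/dm_bio_from_per_bio_data(). A minimal sketch of that pattern for a hypothetical target (my_info, my_ctr and my_map are illustrative names):

#include <linux/device-mapper.h>
#include <linux/list.h>
#include <linux/jiffies.h>

struct my_info {
        struct list_head list;
        unsigned long expires;
};

static int my_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
        /* ... parse arguments, acquire devices ... */
        ti->per_bio_data_size = sizeof(struct my_info);  /* DM allocates this per bio */
        return 0;
}

static int my_map(struct dm_target *ti, struct bio *bio)
{
        struct my_info *info = dm_per_bio_data(bio, sizeof(struct my_info));

        INIT_LIST_HEAD(&info->list);
        info->expires = jiffies + HZ;

        /* Later, the bio can be recovered from the payload alone: */
        /* bio = dm_bio_from_per_bio_data(info, sizeof(struct my_info)); */

        return DM_MAPIO_REMAPPED;
}
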
diff --combined drivers/md/dm-snap.c
index 717718558bd9908469b23bbb9b3cd0223ac243f3,01b6a11813f29e5ab32e3af144714ab28dc26089..ebddef5237e4b28e6254e486b3267dbccca9864e
@@@ -610,12 -610,12 +610,12 @@@ static struct dm_exception *dm_lookup_e
        return NULL;
  }
  
 -static struct dm_exception *alloc_completed_exception(void)
 +static struct dm_exception *alloc_completed_exception(gfp_t gfp)
  {
        struct dm_exception *e;
  
 -      e = kmem_cache_alloc(exception_cache, GFP_NOIO);
 -      if (!e)
 +      e = kmem_cache_alloc(exception_cache, gfp);
 +      if (!e && gfp == GFP_NOIO)
                e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
  
        return e;
@@@ -697,7 -697,7 +697,7 @@@ static int dm_add_exception(void *conte
        struct dm_snapshot *s = context;
        struct dm_exception *e;
  
 -      e = alloc_completed_exception();
 +      e = alloc_completed_exception(GFP_KERNEL);
        if (!e)
                return -ENOMEM;
  
@@@ -1405,7 -1405,7 +1405,7 @@@ static void pending_complete(struct dm_
                goto out;
        }
  
 -      e = alloc_completed_exception();
 +      e = alloc_completed_exception(GFP_NOIO);
        if (!e) {
                down_write(&s->lock);
                __invalidate_snapshot(s, -ENOMEM);
@@@ -1438,6 -1438,7 +1438,7 @@@ out
        if (full_bio) {
                full_bio->bi_end_io = pe->full_bio_end_io;
                full_bio->bi_private = pe->full_bio_private;
+               atomic_inc(&full_bio->bi_remaining);
        }
        free_pending_exception(pe);
  
@@@ -1619,11 -1620,10 +1620,10 @@@ static void remap_exception(struct dm_s
                            struct bio *bio, chunk_t chunk)
  {
        bio->bi_bdev = s->cow->bdev;
-       bio->bi_sector = chunk_to_sector(s->store,
-                                        dm_chunk_number(e->new_chunk) +
-                                        (chunk - e->old_chunk)) +
-                                        (bio->bi_sector &
-                                         s->store->chunk_mask);
+       bio->bi_iter.bi_sector =
+               chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) +
+                               (chunk - e->old_chunk)) +
+               (bio->bi_iter.bi_sector & s->store->chunk_mask);
  }
  
  static int snapshot_map(struct dm_target *ti, struct bio *bio)
                return DM_MAPIO_REMAPPED;
        }
  
-       chunk = sector_to_chunk(s->store, bio->bi_sector);
+       chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
  
        /* Full snapshots are not usable */
        /* To get here the table must be live so s->active is always set. */
                r = DM_MAPIO_SUBMITTED;
  
                if (!pe->started &&
-                   bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) {
+                   bio->bi_iter.bi_size ==
+                   (s->store->chunk_size << SECTOR_SHIFT)) {
                        pe->started = 1;
                        up_write(&s->lock);
                        start_full_bio(pe, bio);
@@@ -1758,7 -1759,7 +1759,7 @@@ static int snapshot_merge_map(struct dm
                return DM_MAPIO_REMAPPED;
        }
  
-       chunk = sector_to_chunk(s->store, bio->bi_sector);
+       chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
  
        down_write(&s->lock);
  
@@@ -2095,7 -2096,7 +2096,7 @@@ static int do_origin(struct dm_dev *ori
        down_read(&_origins_lock);
        o = __lookup_origin(origin->bdev);
        if (o)
-               r = __origin_write(&o->snapshots, bio->bi_sector, bio);
+               r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
        up_read(&_origins_lock);
  
        return r;
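alloc_completed_exception() now takes the gfp flags from its caller, so dm_add_exception() (table-load/process context) can use GFP_KERNEL while pending_complete() on the I/O path keeps GFP_NOIO with a GFP_ATOMIC fallback. A minimal standalone sketch of that allocation pattern; exception_cache here stands in for the driver's slab cache:

#include <linux/slab.h>
#include <linux/gfp.h>

static struct kmem_cache *exception_cache;      /* assumed created at module init */

static void *alloc_exception(gfp_t gfp)
{
        void *e = kmem_cache_alloc(exception_cache, gfp);

        /*
         * Only GFP_NOIO callers (the I/O path) dip into the atomic
         * reserves as a fallback; GFP_KERNEL callers can simply sleep
         * until memory is available.
         */
        if (!e && gfp == GFP_NOIO)
                e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);

        return e;
}
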
diff --combined drivers/md/dm-thin.c
index 726228b33a012f9994fc2f8843b25a0ca46ef966,357eb272dbd9d3337bf5d1a13db443261eb2b24b..faaf944597ab7669b90f3ecb85152fbcd16cbe33
@@@ -144,7 -144,6 +144,7 @@@ struct pool_features 
        bool zero_new_blocks:1;
        bool discard_enabled:1;
        bool discard_passdown:1;
 +      bool error_if_no_space:1;
  };
  
  struct thin_c;
@@@ -164,7 -163,8 +164,7 @@@ struct pool 
        int sectors_per_block_shift;
  
        struct pool_features pf;
 -      unsigned low_water_triggered:1; /* A dm event has been sent */
 -      unsigned no_free_space:1;       /* A -ENOSPC warning has been issued */
 +      bool low_water_triggered:1;     /* A dm event has been sent */
  
        struct dm_bio_prison *prison;
        struct dm_kcopyd_client *copier;
  };
  
  static enum pool_mode get_pool_mode(struct pool *pool);
 -static void set_pool_mode(struct pool *pool, enum pool_mode mode);
 +static void out_of_data_space(struct pool *pool);
 +static void metadata_operation_failed(struct pool *pool, const char *op, int r);
  
  /*
   * Target context for a pool.
@@@ -414,7 -413,7 +414,7 @@@ static bool block_size_is_power_of_two(
  static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
  {
        struct pool *pool = tc->pool;
-       sector_t block_nr = bio->bi_sector;
+       sector_t block_nr = bio->bi_iter.bi_sector;
  
        if (block_size_is_power_of_two(pool))
                block_nr >>= pool->sectors_per_block_shift;
  static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
  {
        struct pool *pool = tc->pool;
-       sector_t bi_sector = bio->bi_sector;
+       sector_t bi_sector = bio->bi_iter.bi_sector;
  
        bio->bi_bdev = tc->pool_dev->bdev;
        if (block_size_is_power_of_two(pool))
-               bio->bi_sector = (block << pool->sectors_per_block_shift) |
-                               (bi_sector & (pool->sectors_per_block - 1));
+               bio->bi_iter.bi_sector =
+                       (block << pool->sectors_per_block_shift) |
+                       (bi_sector & (pool->sectors_per_block - 1));
        else
-               bio->bi_sector = (block * pool->sectors_per_block) +
+               bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
                                 sector_div(bi_sector, pool->sectors_per_block);
  }
  
@@@ -510,16 -510,15 +511,16 @@@ static void remap_and_issue(struct thin
  struct dm_thin_new_mapping {
        struct list_head list;
  
 -      unsigned quiesced:1;
 -      unsigned prepared:1;
 -      unsigned pass_discard:1;
 +      bool quiesced:1;
 +      bool prepared:1;
 +      bool pass_discard:1;
 +      bool definitely_not_shared:1;
  
 +      int err;
        struct thin_c *tc;
        dm_block_t virt_block;
        dm_block_t data_block;
        struct dm_bio_prison_cell *cell, *cell2;
 -      int err;
  
        /*
         * If the bio covers the whole area of a block then we can avoid
@@@ -536,7 -535,7 +537,7 @@@ static void __maybe_add_mapping(struct 
        struct pool *pool = m->tc->pool;
  
        if (m->quiesced && m->prepared) {
 -              list_add(&m->list, &pool->prepared_mappings);
 +              list_add_tail(&m->list, &pool->prepared_mappings);
                wake_worker(pool);
        }
  }
@@@ -550,7 -549,7 +551,7 @@@ static void copy_complete(int read_err
        m->err = read_err || write_err ? -EIO : 0;
  
        spin_lock_irqsave(&pool->lock, flags);
 -      m->prepared = 1;
 +      m->prepared = true;
        __maybe_add_mapping(m);
        spin_unlock_irqrestore(&pool->lock, flags);
  }
@@@ -565,7 -564,7 +566,7 @@@ static void overwrite_endio(struct bio 
        m->err = err;
  
        spin_lock_irqsave(&pool->lock, flags);
 -      m->prepared = 1;
 +      m->prepared = true;
        __maybe_add_mapping(m);
        spin_unlock_irqrestore(&pool->lock, flags);
  }
@@@ -612,8 -611,10 +613,10 @@@ static void cell_defer_no_holder(struc
  
  static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
  {
-       if (m->bio)
+       if (m->bio) {
                m->bio->bi_end_io = m->saved_bi_end_io;
+               atomic_inc(&m->bio->bi_remaining);
+       }
        cell_error(m->tc->pool, m->cell);
        list_del(&m->list);
        mempool_free(m, m->tc->pool->mapping_pool);
@@@ -627,8 -628,10 +630,10 @@@ static void process_prepared_mapping(st
        int r;
  
        bio = m->bio;
-       if (bio)
+       if (bio) {
                bio->bi_end_io = m->saved_bi_end_io;
+               atomic_inc(&bio->bi_remaining);
+       }
  
        if (m->err) {
                cell_error(pool, m->cell);
         */
        r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
        if (r) {
 -              DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d",
 -                          dm_device_name(pool->pool_md), r);
 -              set_pool_mode(pool, PM_READ_ONLY);
 +              metadata_operation_failed(pool, "dm_thin_insert_block", r);
                cell_error(pool, m->cell);
                goto out;
        }
@@@ -683,15 -688,7 +688,15 @@@ static void process_prepared_discard_pa
        cell_defer_no_holder(tc, m->cell2);
  
        if (m->pass_discard)
 -              remap_and_issue(tc, m->bio, m->data_block);
 +              if (m->definitely_not_shared)
 +                      remap_and_issue(tc, m->bio, m->data_block);
 +              else {
 +                      bool used = false;
 +                      if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
 +                              bio_endio(m->bio, 0);
 +                      else
 +                              remap_and_issue(tc, m->bio, m->data_block);
 +              }
        else
                bio_endio(m->bio, 0);
  
@@@ -731,7 -728,8 +736,8 @@@ static void process_prepared(struct poo
   */
  static int io_overlaps_block(struct pool *pool, struct bio *bio)
  {
-       return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
+       return bio->bi_iter.bi_size ==
+               (pool->sectors_per_block << SECTOR_SHIFT);
  }
  
  static int io_overwrites_block(struct pool *pool, struct bio *bio)
@@@ -759,17 -757,13 +765,17 @@@ static int ensure_next_mapping(struct p
  
  static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
  {
 -      struct dm_thin_new_mapping *r = pool->next_mapping;
 +      struct dm_thin_new_mapping *m = pool->next_mapping;
  
        BUG_ON(!pool->next_mapping);
  
 +      memset(m, 0, sizeof(struct dm_thin_new_mapping));
 +      INIT_LIST_HEAD(&m->list);
 +      m->bio = NULL;
 +
        pool->next_mapping = NULL;
  
 -      return r;
 +      return m;
  }
  
  static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
        struct pool *pool = tc->pool;
        struct dm_thin_new_mapping *m = get_next_mapping(pool);
  
 -      INIT_LIST_HEAD(&m->list);
 -      m->quiesced = 0;
 -      m->prepared = 0;
        m->tc = tc;
        m->virt_block = virt_block;
        m->data_block = data_dest;
        m->cell = cell;
 -      m->err = 0;
 -      m->bio = NULL;
  
        if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
 -              m->quiesced = 1;
 +              m->quiesced = true;
  
        /*
         * IO to pool_dev remaps to the pool target's data_dev.
@@@ -847,12 -846,15 +853,12 @@@ static void schedule_zero(struct thin_
        struct pool *pool = tc->pool;
        struct dm_thin_new_mapping *m = get_next_mapping(pool);
  
 -      INIT_LIST_HEAD(&m->list);
 -      m->quiesced = 1;
 -      m->prepared = 0;
 +      m->quiesced = true;
 +      m->prepared = false;
        m->tc = tc;
        m->virt_block = virt_block;
        m->data_block = data_block;
        m->cell = cell;
 -      m->err = 0;
 -      m->bio = NULL;
  
        /*
         * If the whole block of data is being overwritten or we are not
@@@ -899,42 -901,41 +905,42 @@@ static int commit(struct pool *pool
                return -EINVAL;
  
        r = dm_pool_commit_metadata(pool->pmd);
 -      if (r) {
 -              DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d",
 -                          dm_device_name(pool->pool_md), r);
 -              set_pool_mode(pool, PM_READ_ONLY);
 -      }
 +      if (r)
 +              metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
  
        return r;
  }
  
 -static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 +static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
  {
 -      int r;
 -      dm_block_t free_blocks;
        unsigned long flags;
 -      struct pool *pool = tc->pool;
 -
 -      /*
 -       * Once no_free_space is set we must not allow allocation to succeed.
 -       * Otherwise it is difficult to explain, debug, test and support.
 -       */
 -      if (pool->no_free_space)
 -              return -ENOSPC;
 -
 -      r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
 -      if (r)
 -              return r;
  
        if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
                DMWARN("%s: reached low water mark for data device: sending event.",
                       dm_device_name(pool->pool_md));
                spin_lock_irqsave(&pool->lock, flags);
 -              pool->low_water_triggered = 1;
 +              pool->low_water_triggered = true;
                spin_unlock_irqrestore(&pool->lock, flags);
                dm_table_event(pool->ti->table);
        }
 +}
 +
 +static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 +{
 +      int r;
 +      dm_block_t free_blocks;
 +      struct pool *pool = tc->pool;
 +
 +      if (get_pool_mode(pool) != PM_WRITE)
 +              return -EINVAL;
 +
 +      r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
 +      if (r) {
 +              metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
 +              return r;
 +      }
 +
 +      check_low_water_mark(pool, free_blocks);
  
        if (!free_blocks) {
                /*
                        return r;
  
                r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
 -              if (r)
 +              if (r) {
 +                      metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
                        return r;
 +              }
  
 -              /*
 -               * If we still have no space we set a flag to avoid
 -               * doing all this checking and return -ENOSPC.  This
 -               * flag serves as a latch that disallows allocations from
 -               * this pool until the admin takes action (e.g. resize or
 -               * table reload).
 -               */
                if (!free_blocks) {
 -                      DMWARN("%s: no free data space available.",
 -                             dm_device_name(pool->pool_md));
 -                      spin_lock_irqsave(&pool->lock, flags);
 -                      pool->no_free_space = 1;
 -                      spin_unlock_irqrestore(&pool->lock, flags);
 +                      out_of_data_space(pool);
                        return -ENOSPC;
                }
        }
  
        r = dm_pool_alloc_data_block(pool->pmd, result);
        if (r) {
 -              if (r == -ENOSPC &&
 -                  !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
 -                  !free_blocks) {
 -                      DMWARN("%s: no free metadata space available.",
 -                             dm_device_name(pool->pool_md));
 -                      set_pool_mode(pool, PM_READ_ONLY);
 -              }
 +              metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
                return r;
        }
  
@@@ -982,21 -998,7 +988,21 @@@ static void retry_on_resume(struct bio 
        spin_unlock_irqrestore(&pool->lock, flags);
  }
  
 -static void no_space(struct pool *pool, struct dm_bio_prison_cell *cell)
 +static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
 +{
 +      /*
 +       * When pool is read-only, no cell locking is needed because
 +       * nothing is changing.
 +       */
 +      WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY);
 +
 +      if (pool->pf.error_if_no_space)
 +              bio_io_error(bio);
 +      else
 +              retry_on_resume(bio);
 +}
 +
 +static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
  {
        struct bio *bio;
        struct bio_list bios;
        cell_release(pool, cell, &bios);
  
        while ((bio = bio_list_pop(&bios)))
 -              retry_on_resume(bio);
 +              handle_unserviceable_bio(pool, bio);
  }
  
  static void process_discard(struct thin_c *tc, struct bio *bio)
                         */
                        m = get_next_mapping(pool);
                        m->tc = tc;
 -                      m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
 +                      m->pass_discard = pool->pf.discard_passdown;
 +                      m->definitely_not_shared = !lookup_result.shared;
                        m->virt_block = block;
                        m->data_block = lookup_result.block;
                        m->cell = cell;
                        m->cell2 = cell2;
 -                      m->err = 0;
                        m->bio = bio;
  
                        if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
                                spin_lock_irqsave(&pool->lock, flags);
 -                              list_add(&m->list, &pool->prepared_discards);
 +                              list_add_tail(&m->list, &pool->prepared_discards);
                                spin_unlock_irqrestore(&pool->lock, flags);
                                wake_worker(pool);
                        }
@@@ -1109,12 -1111,13 +1115,12 @@@ static void break_sharing(struct thin_
                break;
  
        case -ENOSPC:
 -              no_space(pool, cell);
 +              retry_bios_on_resume(pool, cell);
                break;
  
        default:
                DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
                            __func__, r);
 -              set_pool_mode(pool, PM_READ_ONLY);
                cell_error(pool, cell);
                break;
        }
@@@ -1136,7 -1139,7 +1142,7 @@@ static void process_shared_bio(struct t
        if (bio_detain(pool, &key, bio, &cell))
                return;
  
-       if (bio_data_dir(bio) == WRITE && bio->bi_size)
+       if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size)
                break_sharing(tc, bio, block, &key, lookup_result, cell);
        else {
                struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@@ -1159,7 -1162,7 +1165,7 @@@ static void provision_block(struct thin
        /*
         * Remap empty bios (flushes) immediately, without provisioning.
         */
-       if (!bio->bi_size) {
+       if (!bio->bi_iter.bi_size) {
                inc_all_io_entry(pool, bio);
                cell_defer_no_holder(tc, cell);
  
                break;
  
        case -ENOSPC:
 -              no_space(pool, cell);
 +              retry_bios_on_resume(pool, cell);
                break;
  
        default:
                DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
                            __func__, r);
 -              set_pool_mode(pool, PM_READ_ONLY);
                cell_error(pool, cell);
                break;
        }
@@@ -1258,8 -1262,8 +1264,8 @@@ static void process_bio_read_only(struc
        r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
        switch (r) {
        case 0:
-               if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
+               if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size)
 -                      bio_io_error(bio);
 +                      handle_unserviceable_bio(tc->pool, bio);
                else {
                        inc_all_io_entry(tc->pool, bio);
                        remap_and_issue(tc, bio, lookup_result.block);
  
        case -ENODATA:
                if (rw != READ) {
 -                      bio_io_error(bio);
 +                      handle_unserviceable_bio(tc->pool, bio);
                        break;
                }
  
@@@ -1392,16 -1396,16 +1398,16 @@@ static enum pool_mode get_pool_mode(str
        return pool->pf.mode;
  }
  
 -static void set_pool_mode(struct pool *pool, enum pool_mode mode)
 +static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
  {
        int r;
 +      enum pool_mode old_mode = pool->pf.mode;
  
 -      pool->pf.mode = mode;
 -
 -      switch (mode) {
 +      switch (new_mode) {
        case PM_FAIL:
 -              DMERR("%s: switching pool to failure mode",
 -                    dm_device_name(pool->pool_md));
 +              if (old_mode != new_mode)
 +                      DMERR("%s: switching pool to failure mode",
 +                            dm_device_name(pool->pool_md));
                dm_pool_metadata_read_only(pool->pmd);
                pool->process_bio = process_bio_fail;
                pool->process_discard = process_bio_fail;
                break;
  
        case PM_READ_ONLY:
 -              DMERR("%s: switching pool to read-only mode",
 -                    dm_device_name(pool->pool_md));
 +              if (old_mode != new_mode)
 +                      DMERR("%s: switching pool to read-only mode",
 +                            dm_device_name(pool->pool_md));
                r = dm_pool_abort_metadata(pool->pmd);
                if (r) {
                        DMERR("%s: aborting transaction failed",
                              dm_device_name(pool->pool_md));
 -                      set_pool_mode(pool, PM_FAIL);
 +                      new_mode = PM_FAIL;
 +                      set_pool_mode(pool, new_mode);
                } else {
                        dm_pool_metadata_read_only(pool->pmd);
                        pool->process_bio = process_bio_read_only;
                break;
  
        case PM_WRITE:
 +              if (old_mode != new_mode)
 +                      DMINFO("%s: switching pool to write mode",
 +                             dm_device_name(pool->pool_md));
                dm_pool_metadata_read_write(pool->pmd);
                pool->process_bio = process_bio;
                pool->process_discard = process_discard;
                pool->process_prepared_discard = process_prepared_discard;
                break;
        }
 +
 +      pool->pf.mode = new_mode;
 +}
 +
 +/*
 + * Rather than calling set_pool_mode directly, use these which describe the
 + * reason for mode degradation.
 + */
 +static void out_of_data_space(struct pool *pool)
 +{
 +      DMERR_LIMIT("%s: no free data space available.",
 +                  dm_device_name(pool->pool_md));
 +      set_pool_mode(pool, PM_READ_ONLY);
 +}
 +
 +static void metadata_operation_failed(struct pool *pool, const char *op, int r)
 +{
 +      dm_block_t free_blocks;
 +
 +      DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
 +                  dm_device_name(pool->pool_md), op, r);
 +
 +      if (r == -ENOSPC &&
 +          !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
 +          !free_blocks)
 +              DMERR_LIMIT("%s: no free metadata space available.",
 +                          dm_device_name(pool->pool_md));
 +
 +      set_pool_mode(pool, PM_READ_ONLY);
  }
  
  /*----------------------------------------------------------------*/
@@@ -1574,9 -1544,9 +1580,9 @@@ static int thin_bio_map(struct dm_targe
                if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
                        /*
                         * This block isn't provisioned, and we have no way
 -                       * of doing so.  Just error it.
 +                       * of doing so.
                         */
 -                      bio_io_error(bio);
 +                      handle_unserviceable_bio(tc->pool, bio);
                        return DM_MAPIO_SUBMITTED;
                }
                /* fall through */
@@@ -1683,17 -1653,6 +1689,17 @@@ static int bind_control_target(struct p
        enum pool_mode old_mode = pool->pf.mode;
        enum pool_mode new_mode = pt->adjusted_pf.mode;
  
 +      /*
 +       * Don't change the pool's mode until set_pool_mode() below.
 +       * Otherwise the pool's process_* function pointers may
 +       * not match the desired pool mode.
 +       */
 +      pt->adjusted_pf.mode = old_mode;
 +
 +      pool->ti = ti;
 +      pool->pf = pt->adjusted_pf;
 +      pool->low_water_blocks = pt->low_water_blocks;
 +
        /*
         * If we were in PM_FAIL mode, rollback of metadata failed.  We're
         * not going to recover without a thin_repair.  So we never let the
        if (old_mode == PM_FAIL)
                new_mode = old_mode;
  
 -      pool->ti = ti;
 -      pool->low_water_blocks = pt->low_water_blocks;
 -      pool->pf = pt->adjusted_pf;
 -
        set_pool_mode(pool, new_mode);
  
        return 0;
@@@ -1725,7 -1688,6 +1731,7 @@@ static void pool_features_init(struct p
        pf->zero_new_blocks = true;
        pf->discard_enabled = true;
        pf->discard_passdown = true;
 +      pf->error_if_no_space = false;
  }
  
  static void __pool_destroy(struct pool *pool)
@@@ -1816,7 -1778,8 +1822,7 @@@ static struct pool *pool_create(struct 
        bio_list_init(&pool->deferred_flush_bios);
        INIT_LIST_HEAD(&pool->prepared_mappings);
        INIT_LIST_HEAD(&pool->prepared_discards);
 -      pool->low_water_triggered = 0;
 -      pool->no_free_space = 0;
 +      pool->low_water_triggered = false;
        bio_list_init(&pool->retry_on_resume_list);
  
        pool->shared_read_ds = dm_deferred_set_create();
@@@ -1941,7 -1904,7 +1947,7 @@@ static int parse_pool_features(struct d
        const char *arg_name;
  
        static struct dm_arg _args[] = {
 -              {0, 3, "Invalid number of pool feature arguments"},
 +              {0, 4, "Invalid number of pool feature arguments"},
        };
  
        /*
                else if (!strcasecmp(arg_name, "read_only"))
                        pf->mode = PM_READ_ONLY;
  
 +              else if (!strcasecmp(arg_name, "error_if_no_space"))
 +                      pf->error_if_no_space = true;
 +
                else {
                        ti->error = "Unrecognised pool feature requested";
                        r = -EINVAL;
@@@ -2043,8 -2003,6 +2049,8 @@@ static dm_block_t calc_metadata_thresho
   *         skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
   *         ignore_discard: disable discard
   *         no_discard_passdown: don't pass discards down to the data device
 + *         read_only: Don't allow any changes to be made to the pool metadata.
 + *         error_if_no_space: error IOs, instead of queueing, if no space.
   */
  static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
  {
@@@ -2240,13 -2198,11 +2246,13 @@@ static int maybe_resize_data_dev(struc
                return -EINVAL;
  
        } else if (data_size > sb_data_size) {
 +              if (sb_data_size)
 +                      DMINFO("%s: growing the data device from %llu to %llu blocks",
 +                             dm_device_name(pool->pool_md),
 +                             sb_data_size, (unsigned long long)data_size);
                r = dm_pool_resize_data_dev(pool->pmd, data_size);
                if (r) {
 -                      DMERR("%s: failed to resize data device",
 -                            dm_device_name(pool->pool_md));
 -                      set_pool_mode(pool, PM_READ_ONLY);
 +                      metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
                        return r;
                }
  
@@@ -2281,12 -2237,10 +2287,12 @@@ static int maybe_resize_metadata_dev(st
                return -EINVAL;
  
        } else if (metadata_dev_size > sb_metadata_dev_size) {
 +              DMINFO("%s: growing the metadata device from %llu to %llu blocks",
 +                     dm_device_name(pool->pool_md),
 +                     sb_metadata_dev_size, metadata_dev_size);
                r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
                if (r) {
 -                      DMERR("%s: failed to resize metadata device",
 -                            dm_device_name(pool->pool_md));
 +                      metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
                        return r;
                }
  
@@@ -2342,7 -2296,8 +2348,7 @@@ static void pool_resume(struct dm_targe
        unsigned long flags;
  
        spin_lock_irqsave(&pool->lock, flags);
 -      pool->low_water_triggered = 0;
 -      pool->no_free_space = 0;
 +      pool->low_water_triggered = false;
        __requeue_bios(pool);
        spin_unlock_irqrestore(&pool->lock, flags);
  
@@@ -2561,8 -2516,7 +2567,8 @@@ static void emit_flags(struct pool_feat
                       unsigned sz, unsigned maxlen)
  {
        unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
 -              !pf->discard_passdown + (pf->mode == PM_READ_ONLY);
 +              !pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
 +              pf->error_if_no_space;
        DMEMIT("%u ", count);
  
        if (!pf->zero_new_blocks)
  
        if (pf->mode == PM_READ_ONLY)
                DMEMIT("read_only ");
 +
 +      if (pf->error_if_no_space)
 +              DMEMIT("error_if_no_space ");
  }
  
  /*
@@@ -2673,16 -2624,11 +2679,16 @@@ static void pool_status(struct dm_targe
                        DMEMIT("rw ");
  
                if (!pool->pf.discard_enabled)
 -                      DMEMIT("ignore_discard");
 +                      DMEMIT("ignore_discard ");
                else if (pool->pf.discard_passdown)
 -                      DMEMIT("discard_passdown");
 +                      DMEMIT("discard_passdown ");
 +              else
 +                      DMEMIT("no_discard_passdown ");
 +
 +              if (pool->pf.error_if_no_space)
 +                      DMEMIT("error_if_no_space ");
                else
 -                      DMEMIT("no_discard_passdown");
 +                      DMEMIT("queue_if_no_space ");
  
                break;
  
@@@ -2781,7 -2727,7 +2787,7 @@@ static struct target_type pool_target 
        .name = "thin-pool",
        .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
                    DM_TARGET_IMMUTABLE,
 -      .version = {1, 9, 0},
 +      .version = {1, 10, 0},
        .module = THIS_MODULE,
        .ctr = pool_ctr,
        .dtr = pool_dtr,
@@@ -2939,7 -2885,7 +2945,7 @@@ out_unlock
  
  static int thin_map(struct dm_target *ti, struct bio *bio)
  {
-       bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
+       bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
  
        return thin_bio_map(ti, bio);
  }
@@@ -2959,7 -2905,7 +2965,7 @@@ static int thin_endio(struct dm_target 
                spin_lock_irqsave(&pool->lock, flags);
                list_for_each_entry_safe(m, tmp, &work, list) {
                        list_del(&m->list);
 -                      m->quiesced = 1;
 +                      m->quiesced = true;
                        __maybe_add_mapping(m);
                }
                spin_unlock_irqrestore(&pool->lock, flags);
                if (!list_empty(&work)) {
                        spin_lock_irqsave(&pool->lock, flags);
                        list_for_each_entry_safe(m, tmp, &work, list)
 -                              list_add(&m->list, &pool->prepared_discards);
 +                              list_add_tail(&m->list, &pool->prepared_discards);
                        spin_unlock_irqrestore(&pool->lock, flags);
                        wake_worker(pool);
                }
@@@ -3068,7 -3014,7 +3074,7 @@@ static int thin_iterate_devices(struct 
  
  static struct target_type thin_target = {
        .name = "thin",
 -      .version = {1, 9, 0},
 +      .version = {1, 10, 0},
        .module = THIS_MODULE,
        .ctr = thin_ctr,
        .dtr = thin_dtr,
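Two themes run through the thin-pool hunks: metadata failures now funnel through metadata_operation_failed(), which logs the failing operation once and drops the pool to read-only, and the new error_if_no_space pool feature chooses whether a bio that cannot be serviced is failed immediately or parked until the pool is resized and resumed. A self-contained sketch of that second decision, with stand-in types (no_space_policy mirrors pool->pf in the hunk above):

#include <linux/bio.h>

struct no_space_policy {
        bool error_if_no_space;
};

/*
 * Default behaviour queues the bio so it can be retried after the admin
 * grows the data device and resumes the pool; error_if_no_space fails it
 * straight away with -EIO instead.
 */
static void sketch_handle_unserviceable(struct no_space_policy *pf,
                                        struct bio *bio,
                                        struct bio_list *retry_on_resume)
{
        if (pf->error_if_no_space)
                bio_io_error(bio);
        else
                bio_list_add(retry_on_resume, bio);
}
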
diff --combined drivers/md/dm.c
index b49c7628424171f0622ed4446e5c4111b00ba418,44a2fa6814ce97cbd05d3e3e34c65672c6dbbebe..8c53b09b9a2c5a3050b22f4fba82af5563f1d59a
@@@ -200,8 -200,8 +200,8 @@@ struct mapped_device 
        /* forced geometry settings */
        struct hd_geometry geometry;
  
 -      /* sysfs handle */
 -      struct kobject kobj;
 +      /* kobject and completion */
 +      struct dm_kobject_holder kobj_holder;
  
        /* zero-length flush that will be cloned and submitted to targets */
        struct bio flush_bio;
@@@ -575,7 -575,7 +575,7 @@@ static void start_io_acct(struct dm_io 
                atomic_inc_return(&md->pending[rw]));
  
        if (unlikely(dm_stats_used(&md->stats)))
-               dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
+               dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
                                    bio_sectors(bio), false, 0, &io->stats_aux);
  }
  
@@@ -593,7 -593,7 +593,7 @@@ static void end_io_acct(struct dm_io *i
        part_stat_unlock();
  
        if (unlikely(dm_stats_used(&md->stats)))
-               dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
+               dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
                                    bio_sectors(bio), true, duration, &io->stats_aux);
  
        /*
@@@ -742,7 -742,7 +742,7 @@@ static void dec_pending(struct dm_io *i
                if (io_error == DM_ENDIO_REQUEUE)
                        return;
  
-               if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
+               if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) {
                        /*
                         * Preflush done for flush with data, reissue
                         * without REQ_FLUSH.
@@@ -797,7 -797,7 +797,7 @@@ static void end_clone_bio(struct bio *c
        struct dm_rq_clone_bio_info *info = clone->bi_private;
        struct dm_rq_target_io *tio = info->tio;
        struct bio *bio = info->orig;
-       unsigned int nr_bytes = info->orig->bi_size;
+       unsigned int nr_bytes = info->orig->bi_iter.bi_size;
  
        bio_put(clone);
  
@@@ -1128,7 -1128,7 +1128,7 @@@ static void __map_bio(struct dm_target_
         * this io.
         */
        atomic_inc(&tio->io->io_count);
-       sector = clone->bi_sector;
+       sector = clone->bi_iter.bi_sector;
        r = ti->type->map(ti, clone);
        if (r == DM_MAPIO_REMAPPED) {
                /* the bio has been remapped so dispatch it */
@@@ -1155,76 -1155,32 +1155,32 @@@ struct clone_info 
        struct dm_io *io;
        sector_t sector;
        sector_t sector_count;
-       unsigned short idx;
  };
  
  static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len)
  {
-       bio->bi_sector = sector;
-       bio->bi_size = to_bytes(len);
- }
- static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count)
- {
-       bio->bi_idx = idx;
-       bio->bi_vcnt = idx + bv_count;
-       bio->bi_flags &= ~(1 << BIO_SEG_VALID);
- }
- static void clone_bio_integrity(struct bio *bio, struct bio *clone,
-                               unsigned short idx, unsigned len, unsigned offset,
-                               unsigned trim)
- {
-       if (!bio_integrity(bio))
-               return;
-       bio_integrity_clone(clone, bio, GFP_NOIO);
-       if (trim)
-               bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len);
- }
- /*
-  * Creates a little bio that just does part of a bvec.
-  */
- static void clone_split_bio(struct dm_target_io *tio, struct bio *bio,
-                           sector_t sector, unsigned short idx,
-                           unsigned offset, unsigned len)
- {
-       struct bio *clone = &tio->clone;
-       struct bio_vec *bv = bio->bi_io_vec + idx;
-       *clone->bi_io_vec = *bv;
-       bio_setup_sector(clone, sector, len);
-       clone->bi_bdev = bio->bi_bdev;
-       clone->bi_rw = bio->bi_rw;
-       clone->bi_vcnt = 1;
-       clone->bi_io_vec->bv_offset = offset;
-       clone->bi_io_vec->bv_len = clone->bi_size;
-       clone->bi_flags |= 1 << BIO_CLONED;
-       clone_bio_integrity(bio, clone, idx, len, offset, 1);
+       bio->bi_iter.bi_sector = sector;
+       bio->bi_iter.bi_size = to_bytes(len);
  }
  
  /*
   * Creates a bio that consists of range of complete bvecs.
   */
  static void clone_bio(struct dm_target_io *tio, struct bio *bio,
-                     sector_t sector, unsigned short idx,
-                     unsigned short bv_count, unsigned len)
+                     sector_t sector, unsigned len)
  {
        struct bio *clone = &tio->clone;
-       unsigned trim = 0;
  
-       __bio_clone(clone, bio);
-       bio_setup_sector(clone, sector, len);
-       bio_setup_bv(clone, idx, bv_count);
+       __bio_clone_fast(clone, bio);
+       if (bio_integrity(bio))
+               bio_integrity_clone(clone, bio, GFP_NOIO);
  
-       if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
-               trim = 1;
-       clone_bio_integrity(bio, clone, idx, len, 0, trim);
+       bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
+       clone->bi_iter.bi_size = to_bytes(len);
+       if (bio_integrity(bio))
+               bio_integrity_trim(clone, 0, len);
  }
  
  static struct dm_target_io *alloc_tio(struct clone_info *ci,
@@@ -1257,7 -1213,7 +1213,7 @@@ static void __clone_and_map_simple_bio(
         * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
         * and discard, so no need for concern about wasted bvec allocations.
         */
-        __bio_clone(clone, ci->bio);
+        __bio_clone_fast(clone, ci->bio);
        if (len)
                bio_setup_sector(clone, ci->sector, len);
  
@@@ -1286,10 -1242,7 +1242,7 @@@ static int __send_empty_flush(struct cl
  }
  
  static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
-                                    sector_t sector, int nr_iovecs,
-                                    unsigned short idx, unsigned short bv_count,
-                                    unsigned offset, unsigned len,
-                                    unsigned split_bvec)
+                                    sector_t sector, unsigned len)
  {
        struct bio *bio = ci->bio;
        struct dm_target_io *tio;
                num_target_bios = ti->num_write_bios(ti, bio);
  
        for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
-               tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr);
-               if (split_bvec)
-                       clone_split_bio(tio, bio, sector, idx, offset, len);
-               else
-                       clone_bio(tio, bio, sector, idx, bv_count, len);
+               tio = alloc_tio(ci, ti, 0, target_bio_nr);
+               clone_bio(tio, bio, sector, len);
                __map_bio(tio);
        }
  }
@@@ -1378,60 -1328,6 +1328,6 @@@ static int __send_write_same(struct clo
        return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
  }
  
- /*
-  * Find maximum number of sectors / bvecs we can process with a single bio.
-  */
- static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx)
- {
-       struct bio *bio = ci->bio;
-       sector_t bv_len, total_len = 0;
-       for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) {
-               bv_len = to_sector(bio->bi_io_vec[*idx].bv_len);
-               if (bv_len > max)
-                       break;
-               max -= bv_len;
-               total_len += bv_len;
-       }
-       return total_len;
- }
- static int __split_bvec_across_targets(struct clone_info *ci,
-                                      struct dm_target *ti, sector_t max)
- {
-       struct bio *bio = ci->bio;
-       struct bio_vec *bv = bio->bi_io_vec + ci->idx;
-       sector_t remaining = to_sector(bv->bv_len);
-       unsigned offset = 0;
-       sector_t len;
-       do {
-               if (offset) {
-                       ti = dm_table_find_target(ci->map, ci->sector);
-                       if (!dm_target_is_valid(ti))
-                               return -EIO;
-                       max = max_io_len(ci->sector, ti);
-               }
-               len = min(remaining, max);
-               __clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0,
-                                        bv->bv_offset + offset, len, 1);
-               ci->sector += len;
-               ci->sector_count -= len;
-               offset += to_bytes(len);
-       } while (remaining -= len);
-       ci->idx++;
-       return 0;
- }
  /*
   * Select the correct strategy for processing a non-flush bio.
   */
@@@ -1439,8 -1335,7 +1335,7 @@@ static int __split_and_process_non_flus
  {
        struct bio *bio = ci->bio;
        struct dm_target *ti;
-       sector_t len, max;
-       int idx;
+       unsigned len;
  
        if (unlikely(bio->bi_rw & REQ_DISCARD))
                return __send_discard(ci);
        if (!dm_target_is_valid(ti))
                return -EIO;
  
-       max = max_io_len(ci->sector, ti);
-       /*
-        * Optimise for the simple case where we can do all of
-        * the remaining io with a single clone.
-        */
-       if (ci->sector_count <= max) {
-               __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
-                                        ci->idx, bio->bi_vcnt - ci->idx, 0,
-                                        ci->sector_count, 0);
-               ci->sector_count = 0;
-               return 0;
-       }
+       len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
  
-       /*
-        * There are some bvecs that don't span targets.
-        * Do as many of these as possible.
-        */
-       if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
-               len = __len_within_target(ci, max, &idx);
+       __clone_and_map_data_bio(ci, ti, ci->sector, len);
  
-               __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
-                                        ci->idx, idx - ci->idx, 0, len, 0);
+       ci->sector += len;
+       ci->sector_count -= len;
  
-               ci->sector += len;
-               ci->sector_count -= len;
-               ci->idx = idx;
-               return 0;
-       }
-       /*
-        * Handle a bvec that must be split between two or more targets.
-        */
-       return __split_bvec_across_targets(ci, ti, max);
+       return 0;
  }
  
  /*
@@@ -1510,8 -1378,7 +1378,7 @@@ static void __split_and_process_bio(str
        ci.io->bio = bio;
        ci.io->md = md;
        spin_lock_init(&ci.io->endio_lock);
-       ci.sector = bio->bi_sector;
-       ci.idx = bio->bi_idx;
+       ci.sector = bio->bi_iter.bi_sector;
  
        start_io_acct(ci.io);
  
@@@ -2041,7 -1908,6 +1908,7 @@@ static struct mapped_device *alloc_dev(
        init_waitqueue_head(&md->wait);
        INIT_WORK(&md->work, dm_wq_work);
        init_waitqueue_head(&md->eventq);
 +      init_completion(&md->kobj_holder.completion);
  
        md->disk->major = _major;
        md->disk->first_minor = minor;
@@@ -2903,14 -2769,20 +2770,14 @@@ struct gendisk *dm_disk(struct mapped_d
  
  struct kobject *dm_kobject(struct mapped_device *md)
  {
 -      return &md->kobj;
 +      return &md->kobj_holder.kobj;
  }
  
 -/*
 - * struct mapped_device should not be exported outside of dm.c
 - * so use this check to verify that kobj is part of md structure
 - */
  struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
  {
        struct mapped_device *md;
  
 -      md = container_of(kobj, struct mapped_device, kobj);
 -      if (&md->kobj != kobj)
 -              return NULL;
 +      md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
  
        if (test_bit(DMF_FREEING, &md->flags) ||
            dm_deleting_md(md))
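
The dm.c hunks above replace the bare kobject embedded in struct mapped_device with a holder that also carries a completion (note the new init_completion(&md->kobj_holder.completion) in alloc_dev), so the release of the kobject can be waited on before the device is torn down. Because dm_kobject() only ever hands out &md->kobj_holder.kobj, dm_get_from_kobject() can rely on container_of() and the old pointer sanity check is dropped. A minimal sketch of that pattern follows; the "owner" structure and helper name are placeholders, not the exact dm.h layout.

    #include <linux/kobject.h>
    #include <linux/completion.h>
    #include <linux/kernel.h>

    /* Sketch only: the shape of the holder used above, not the real dm.h definition. */
    struct dm_kobject_holder {
    	struct kobject kobj;
    	struct completion completion;	/* signalled when the kobject is released */
    };

    /* Hypothetical owner standing in for struct mapped_device. */
    struct owner {
    	struct dm_kobject_holder kobj_holder;
    };

    static struct owner *owner_from_kobj(struct kobject *kobj)
    {
    	/*
    	 * No extra pointer check is needed: the only kobject ever exported
    	 * for the owner is &owner->kobj_holder.kobj.
    	 */
    	return container_of(kobj, struct owner, kobj_holder.kobj);
    }
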
diff --combined drivers/md/md.c
index 40c531359a15af61ad9c3ba70506d1863085dffe,16d84e091e2d199222f3c58203a07f3b696249c9..4ad5cc4e63e8438ca3c32fea1f40f69ec71657fb
@@@ -393,7 -393,7 +393,7 @@@ static void md_submit_flush_data(struc
        struct mddev *mddev = container_of(ws, struct mddev, flush_work);
        struct bio *bio = mddev->flush_bio;
  
-       if (bio->bi_size == 0)
+       if (bio->bi_iter.bi_size == 0)
                /* an empty barrier - all done */
                bio_endio(bio, 0);
        else {
@@@ -754,7 -754,7 +754,7 @@@ void md_super_write(struct mddev *mddev
        struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
  
        bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
-       bio->bi_sector = sector;
+       bio->bi_iter.bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        bio->bi_private = rdev;
        bio->bi_end_io = super_written;
@@@ -782,18 -782,16 +782,16 @@@ int sync_page_io(struct md_rdev *rdev, 
        struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
        int ret;
  
-       rw |= REQ_SYNC;
        bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
                rdev->meta_bdev : rdev->bdev;
        if (metadata_op)
-               bio->bi_sector = sector + rdev->sb_start;
+               bio->bi_iter.bi_sector = sector + rdev->sb_start;
        else if (rdev->mddev->reshape_position != MaxSector &&
                 (rdev->mddev->reshape_backwards ==
                  (sector >= rdev->mddev->reshape_position)))
-               bio->bi_sector = sector + rdev->new_data_offset;
+               bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
        else
-               bio->bi_sector = sector + rdev->data_offset;
+               bio->bi_iter.bi_sector = sector + rdev->data_offset;
        bio_add_page(bio, page, size, 0);
        submit_bio_wait(rw, bio);
  
@@@ -1077,7 -1075,6 +1075,7 @@@ static int super_90_validate(struct mdd
        rdev->raid_disk = -1;
        clear_bit(Faulty, &rdev->flags);
        clear_bit(In_sync, &rdev->flags);
 +      clear_bit(Bitmap_sync, &rdev->flags);
        clear_bit(WriteMostly, &rdev->flags);
  
        if (mddev->raid_disks == 0) {
                 */
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
 +              if (ev1 < mddev->events)
 +                      set_bit(Bitmap_sync, &rdev->flags);
        } else {
                if (ev1 < mddev->events)
                        /* just a hot-add of a new device, leave raid_disk at -1 */
                            desc->raid_disk < mddev->raid_disks */) {
                        set_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = desc->raid_disk;
 +                      rdev->saved_raid_disk = desc->raid_disk;
                } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
                        /* active but not in sync implies recovery up to
                         * reshape position.  We don't know exactly where
@@@ -1567,7 -1561,6 +1565,7 @@@ static int super_1_validate(struct mdde
        rdev->raid_disk = -1;
        clear_bit(Faulty, &rdev->flags);
        clear_bit(In_sync, &rdev->flags);
 +      clear_bit(Bitmap_sync, &rdev->flags);
        clear_bit(WriteMostly, &rdev->flags);
  
        if (mddev->raid_disks == 0) {
                 */
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
 +              if (ev1 < mddev->events)
 +                      set_bit(Bitmap_sync, &rdev->flags);
        } else {
                if (ev1 < mddev->events)
                        /* just a hot-add of a new device, leave raid_disk at -1 */
                        set_bit(Faulty, &rdev->flags);
                        break;
                default:
 +                      rdev->saved_raid_disk = role;
                        if ((le32_to_cpu(sb->feature_map) &
 -                           MD_FEATURE_RECOVERY_OFFSET))
 +                           MD_FEATURE_RECOVERY_OFFSET)) {
                                rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
 -                      else
 +                              if (!(le32_to_cpu(sb->feature_map) &
 +                                    MD_FEATURE_RECOVERY_BITMAP))
 +                                      rdev->saved_raid_disk = -1;
 +                      } else
                                set_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = role;
                        break;
@@@ -1741,9 -1728,6 +1739,9 @@@ static void super_1_sync(struct mddev *
                        cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
                sb->recovery_offset =
                        cpu_to_le64(rdev->recovery_offset);
 +              if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
 +                      sb->feature_map |=
 +                              cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
        }
        if (test_bit(Replacement, &rdev->flags))
                sb->feature_map |=
@@@ -2485,7 -2469,8 +2483,7 @@@ repeat
                if (rdev->sb_loaded != 1)
                        continue; /* no noise on spare devices */
  
 -              if (!test_bit(Faulty, &rdev->flags) &&
 -                  rdev->saved_raid_disk == -1) {
 +              if (!test_bit(Faulty, &rdev->flags)) {
                        md_super_write(mddev,rdev,
                                       rdev->sb_start, rdev->sb_size,
                                       rdev->sb_page);
                                rdev->badblocks.size = 0;
                        }
  
 -              } else if (test_bit(Faulty, &rdev->flags))
 +              } else
                        pr_debug("md: %s (skipping faulty)\n",
                                 bdevname(rdev->bdev, b));
 -              else
 -                      pr_debug("(skipping incremental s/r ");
  
                if (mddev->level == LEVEL_MULTIPATH)
                        /* only need to write one superblock... */
@@@ -2619,8 -2606,6 +2617,8 @@@ state_store(struct md_rdev *rdev, cons
         *  blocked - sets the Blocked flags
         *  -blocked - clears the Blocked and possibly simulates an error
         *  insync - sets Insync providing device isn't active
 +       *  -insync - clear Insync for a device with a slot assigned,
 +       *            so that it gets rebuilt based on bitmap
         *  write_error - sets WriteErrorSeen
         *  -write_error - clears WriteErrorSeen
         */
        } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
                set_bit(In_sync, &rdev->flags);
                err = 0;
 +      } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) {
 +              clear_bit(In_sync, &rdev->flags);
 +              rdev->saved_raid_disk = rdev->raid_disk;
 +              rdev->raid_disk = -1;
 +              err = 0;
        } else if (cmd_match(buf, "write_error")) {
                set_bit(WriteErrorSeen, &rdev->flags);
                err = 0;
@@@ -2806,7 -2786,6 +2804,7 @@@ slot_store(struct md_rdev *rdev, const 
                else
                        rdev->saved_raid_disk = -1;
                clear_bit(In_sync, &rdev->flags);
 +              clear_bit(Bitmap_sync, &rdev->flags);
                err = rdev->mddev->pers->
                        hot_add_disk(rdev->mddev, rdev);
                if (err) {
@@@ -3601,8 -3580,6 +3599,8 @@@ level_store(struct mddev *mddev, const 
        pers->run(mddev);
        set_bit(MD_CHANGE_DEVS, &mddev->flags);
        mddev_resume(mddev);
 +      if (!mddev->thread)
 +              md_update_sb(mddev, 1);
        sysfs_notify(&mddev->kobj, NULL, "level");
        md_new_event(mddev);
        return rv;
@@@ -5781,10 -5758,8 +5779,10 @@@ static int add_new_disk(struct mddev * 
                            info->raid_disk < mddev->raid_disks) {
                                rdev->raid_disk = info->raid_disk;
                                set_bit(In_sync, &rdev->flags);
 +                              clear_bit(Bitmap_sync, &rdev->flags);
                        } else
                                rdev->raid_disk = -1;
 +                      rdev->saved_raid_disk = rdev->raid_disk;
                } else
                        super_types[mddev->major_version].
                                validate_super(mddev, rdev);
                        return -EINVAL;
                }
  
 -              if (test_bit(In_sync, &rdev->flags))
 -                      rdev->saved_raid_disk = rdev->raid_disk;
 -              else
 -                      rdev->saved_raid_disk = -1;
 -
                clear_bit(In_sync, &rdev->flags); /* just to be sure */
                if (info->state & (1<<MD_DISK_WRITEMOSTLY))
                        set_bit(WriteMostly, &rdev->flags);
@@@ -6346,32 -6326,6 +6344,32 @@@ static int md_getgeo(struct block_devic
        return 0;
  }
  
 +static inline bool md_ioctl_valid(unsigned int cmd)
 +{
 +      switch (cmd) {
 +      case ADD_NEW_DISK:
 +      case BLKROSET:
 +      case GET_ARRAY_INFO:
 +      case GET_BITMAP_FILE:
 +      case GET_DISK_INFO:
 +      case HOT_ADD_DISK:
 +      case HOT_REMOVE_DISK:
 +      case PRINT_RAID_DEBUG:
 +      case RAID_AUTORUN:
 +      case RAID_VERSION:
 +      case RESTART_ARRAY_RW:
 +      case RUN_ARRAY:
 +      case SET_ARRAY_INFO:
 +      case SET_BITMAP_FILE:
 +      case SET_DISK_FAULTY:
 +      case STOP_ARRAY:
 +      case STOP_ARRAY_RO:
 +              return true;
 +      default:
 +              return false;
 +      }
 +}
 +
  static int md_ioctl(struct block_device *bdev, fmode_t mode,
                        unsigned int cmd, unsigned long arg)
  {
        struct mddev *mddev = NULL;
        int ro;
  
 +      if (!md_ioctl_valid(cmd))
 +              return -ENOTTY;
 +
        switch (cmd) {
        case RAID_VERSION:
        case GET_ARRAY_INFO:
@@@ -7753,12 -7704,10 +7751,12 @@@ static int remove_and_add_spares(struc
                if (test_bit(Faulty, &rdev->flags))
                        continue;
                if (mddev->ro &&
 -                  rdev->saved_raid_disk < 0)
 +                  ! (rdev->saved_raid_disk >= 0 &&
 +                     !test_bit(Bitmap_sync, &rdev->flags)))
                        continue;
  
 -              rdev->recovery_offset = 0;
 +              if (rdev->saved_raid_disk < 0)
 +                      rdev->recovery_offset = 0;
                if (mddev->pers->
                    hot_add_disk(mddev, rdev) == 0) {
                        if (sysfs_link_rdev(mddev, rdev))
@@@ -7836,12 -7785,9 +7834,12 @@@ void md_check_recovery(struct mddev *md
                         * As we only add devices that are already in-sync,
                         * we can activate the spares immediately.
                         */
 -                      clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
                        remove_and_add_spares(mddev, NULL);
 -                      mddev->pers->spare_active(mddev);
 +                      /* There is no thread, but we need to call
 +                       * ->spare_active and clear saved_raid_disk
 +                       */
 +                      md_reap_sync_thread(mddev);
 +                      clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
                        goto unlock;
                }
  
@@@ -7978,10 -7924,14 +7976,10 @@@ void md_reap_sync_thread(struct mddev *
                mddev->pers->finish_reshape(mddev);
  
        /* If array is no-longer degraded, then any saved_raid_disk
 -       * information must be scrapped.  Also if any device is now
 -       * In_sync we must scrape the saved_raid_disk for that device
 -       * do the superblock for an incrementally recovered device
 -       * written out.
 +       * information must be scrapped.
         */
 -      rdev_for_each(rdev, mddev)
 -              if (!mddev->degraded ||
 -                  test_bit(In_sync, &rdev->flags))
 +      if (!mddev->degraded)
 +              rdev_for_each(rdev, mddev)
                        rdev->saved_raid_disk = -1;
  
        md_update_sb(mddev, 1);
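
Most of the md.c conversion above is mechanical: with the immutable biovec series, a bio's current sector and remaining size live in the embedded iterator bio->bi_iter rather than directly in struct bio (bi_sector becomes bi_iter.bi_sector, bi_size becomes bi_iter.bi_size). A hedged sketch of preparing a one-page write the way md_super_write() does it after this change; the device, page and completion callback are placeholders rather than md code.

    #include <linux/bio.h>
    #include <linux/blkdev.h>

    /* Sketch: queue a single-page write at 'sector' on 'bdev' (placeholders). */
    static void write_one_page(struct block_device *bdev, sector_t sector,
    			   struct page *page, unsigned int len,
    			   bio_end_io_t *done, void *private)
    {
    	struct bio *bio = bio_alloc(GFP_NOIO, 1);

    	bio->bi_bdev = bdev;
    	bio->bi_iter.bi_sector = sector;	/* was bio->bi_sector before 3.14 */
    	bio_add_page(bio, page, len, 0);
    	bio->bi_private = private;
    	bio->bi_end_io = done;

    	submit_bio(WRITE, bio);
    }
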
diff --combined drivers/md/raid1.c
index a49cfcc7a343188a5579350886795ce6fef35c4f,db3b9d7314f1835def74642faaff39af892d1d8c..fd3a2a14b587da5e3bb5046b0017ed7bd46f67a1
@@@ -229,7 -229,7 +229,7 @@@ static void call_bio_endio(struct r1bi
        int done;
        struct r1conf *conf = r1_bio->mddev->private;
        sector_t start_next_window = r1_bio->start_next_window;
-       sector_t bi_sector = bio->bi_sector;
+       sector_t bi_sector = bio->bi_iter.bi_sector;
  
        if (bio->bi_phys_segments) {
                unsigned long flags;
@@@ -265,9 -265,8 +265,8 @@@ static void raid_end_bio_io(struct r1bi
        if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
                pr_debug("raid1: sync end %s on sectors %llu-%llu\n",
                         (bio_data_dir(bio) == WRITE) ? "write" : "read",
-                        (unsigned long long) bio->bi_sector,
-                        (unsigned long long) bio->bi_sector +
-                        bio_sectors(bio) - 1);
+                        (unsigned long long) bio->bi_iter.bi_sector,
+                        (unsigned long long) bio_end_sector(bio) - 1);
  
                call_bio_endio(r1_bio);
        }
@@@ -466,9 -465,8 +465,8 @@@ static void raid1_end_write_request(str
                                struct bio *mbio = r1_bio->master_bio;
                                pr_debug("raid1: behind end write sectors"
                                         " %llu-%llu\n",
-                                        (unsigned long long) mbio->bi_sector,
-                                        (unsigned long long) mbio->bi_sector +
-                                        bio_sectors(mbio) - 1);
+                                        (unsigned long long) mbio->bi_iter.bi_sector,
+                                        (unsigned long long) bio_end_sector(mbio) - 1);
                                call_bio_endio(r1_bio);
                        }
                }
@@@ -875,7 -873,7 +873,7 @@@ static bool need_to_wait_for_sync(struc
                else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
                                >= bio_end_sector(bio)) ||
                         (conf->next_resync + NEXT_NORMALIO_DISTANCE
-                               <= bio->bi_sector))
+                               <= bio->bi_iter.bi_sector))
                        wait = false;
                else
                        wait = true;
@@@ -913,19 -911,20 +911,19 @@@ static sector_t wait_barrier(struct r1c
  
        if (bio && bio_data_dir(bio) == WRITE) {
                if (conf->next_resync + NEXT_NORMALIO_DISTANCE
-                   <= bio->bi_sector) {
+                   <= bio->bi_iter.bi_sector) {
                        if (conf->start_next_window == MaxSector)
                                conf->start_next_window =
                                        conf->next_resync +
                                        NEXT_NORMALIO_DISTANCE;
  
                        if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
-                           <= bio->bi_sector)
+                           <= bio->bi_iter.bi_sector)
                                conf->next_window_requests++;
                        else
                                conf->current_window_requests++;
 -              }
 -              if (bio->bi_iter.bi_sector >= conf->start_next_window)
                        sector = conf->start_next_window;
 +              }
        }
  
        conf->nr_pending++;
@@@ -1027,7 -1026,8 +1025,8 @@@ do_sync_io
                if (bvecs[i].bv_page)
                        put_page(bvecs[i].bv_page);
        kfree(bvecs);
-       pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
+       pr_debug("%dB behind alloc failed, doing sync I/O\n",
+                bio->bi_iter.bi_size);
  }
  
  struct raid1_plug_cb {
@@@ -1107,7 -1107,7 +1106,7 @@@ static void make_request(struct mddev *
  
        if (bio_data_dir(bio) == WRITE &&
            bio_end_sector(bio) > mddev->suspend_lo &&
-           bio->bi_sector < mddev->suspend_hi) {
+           bio->bi_iter.bi_sector < mddev->suspend_hi) {
                /* As the suspend_* range is controlled by
                 * userspace, we want an interruptible
                 * wait.
                        prepare_to_wait(&conf->wait_barrier,
                                        &w, TASK_INTERRUPTIBLE);
                        if (bio_end_sector(bio) <= mddev->suspend_lo ||
-                           bio->bi_sector >= mddev->suspend_hi)
+                           bio->bi_iter.bi_sector >= mddev->suspend_hi)
                                break;
                        schedule();
                }
        r1_bio->sectors = bio_sectors(bio);
        r1_bio->state = 0;
        r1_bio->mddev = mddev;
-       r1_bio->sector = bio->bi_sector;
+       r1_bio->sector = bio->bi_iter.bi_sector;
  
        /* We might need to issue multiple reads to different
         * devices if there are bad blocks around, so we keep
@@@ -1180,12 -1180,13 +1179,13 @@@ read_again
                r1_bio->read_disk = rdisk;
  
                read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-               bio_trim(read_bio, r1_bio->sector - bio->bi_sector,
+               bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
                         max_sectors);
  
                r1_bio->bios[rdisk] = read_bio;
  
-               read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
+               read_bio->bi_iter.bi_sector = r1_bio->sector +
+                       mirror->rdev->data_offset;
                read_bio->bi_bdev = mirror->rdev->bdev;
                read_bio->bi_end_io = raid1_end_read_request;
                read_bio->bi_rw = READ | do_sync;
                         */
  
                        sectors_handled = (r1_bio->sector + max_sectors
-                                          - bio->bi_sector);
+                                          - bio->bi_iter.bi_sector);
                        r1_bio->sectors = max_sectors;
                        spin_lock_irq(&conf->device_lock);
                        if (bio->bi_phys_segments == 0)
                        r1_bio->sectors = bio_sectors(bio) - sectors_handled;
                        r1_bio->state = 0;
                        r1_bio->mddev = mddev;
-                       r1_bio->sector = bio->bi_sector + sectors_handled;
+                       r1_bio->sector = bio->bi_iter.bi_sector +
+                               sectors_handled;
                        goto read_again;
                } else
                        generic_make_request(read_bio);
                        if (r1_bio->bios[j])
                                rdev_dec_pending(conf->mirrors[j].rdev, mddev);
                r1_bio->state = 0;
-               allow_barrier(conf, start_next_window, bio->bi_sector);
+               allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector);
                md_wait_for_blocked_rdev(blocked_rdev, mddev);
                start_next_window = wait_barrier(conf, bio);
                /*
                        bio->bi_phys_segments++;
                spin_unlock_irq(&conf->device_lock);
        }
-       sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector;
+       sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector;
  
        atomic_set(&r1_bio->remaining, 1);
        atomic_set(&r1_bio->behind_remaining, 0);
                        continue;
  
                mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-               bio_trim(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
+               bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors);
  
                if (first_clone) {
                        /* do behind I/O ?
  
                r1_bio->bios[i] = mbio;
  
-               mbio->bi_sector = (r1_bio->sector +
+               mbio->bi_iter.bi_sector = (r1_bio->sector +
                                   conf->mirrors[i].rdev->data_offset);
                mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
                mbio->bi_end_io = raid1_end_write_request;
                r1_bio->sectors = bio_sectors(bio) - sectors_handled;
                r1_bio->state = 0;
                r1_bio->mddev = mddev;
-               r1_bio->sector = bio->bi_sector + sectors_handled;
+               r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
                goto retry_write;
        }
  
@@@ -1958,14 -1960,14 +1959,14 @@@ static int process_checks(struct r1bio 
                /* fixup the bio for reuse */
                bio_reset(b);
                b->bi_vcnt = vcnt;
-               b->bi_size = r1_bio->sectors << 9;
-               b->bi_sector = r1_bio->sector +
+               b->bi_iter.bi_size = r1_bio->sectors << 9;
+               b->bi_iter.bi_sector = r1_bio->sector +
                        conf->mirrors[i].rdev->data_offset;
                b->bi_bdev = conf->mirrors[i].rdev->bdev;
                b->bi_end_io = end_sync_read;
                b->bi_private = r1_bio;
  
-               size = b->bi_size;
+               size = b->bi_iter.bi_size;
                for (j = 0; j < vcnt ; j++) {
                        struct bio_vec *bi;
                        bi = &b->bi_io_vec[j];
@@@ -2220,11 -2222,11 +2221,11 @@@ static int narrow_write_error(struct r1
                }
  
                wbio->bi_rw = WRITE;
-               wbio->bi_sector = r1_bio->sector;
-               wbio->bi_size = r1_bio->sectors << 9;
+               wbio->bi_iter.bi_sector = r1_bio->sector;
+               wbio->bi_iter.bi_size = r1_bio->sectors << 9;
  
                bio_trim(wbio, sector - r1_bio->sector, sectors);
-               wbio->bi_sector += rdev->data_offset;
+               wbio->bi_iter.bi_sector += rdev->data_offset;
                wbio->bi_bdev = rdev->bdev;
                if (submit_bio_wait(WRITE, wbio) == 0)
                        /* failure! */
@@@ -2338,7 -2340,8 +2339,8 @@@ read_more
                }
                r1_bio->read_disk = disk;
                bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
-               bio_trim(bio, r1_bio->sector - bio->bi_sector, max_sectors);
+               bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
+                        max_sectors);
                r1_bio->bios[r1_bio->read_disk] = bio;
                rdev = conf->mirrors[disk].rdev;
                printk_ratelimited(KERN_ERR
                                   mdname(mddev),
                                   (unsigned long long)r1_bio->sector,
                                   bdevname(rdev->bdev, b));
-               bio->bi_sector = r1_bio->sector + rdev->data_offset;
+               bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset;
                bio->bi_bdev = rdev->bdev;
                bio->bi_end_io = raid1_end_read_request;
                bio->bi_rw = READ | do_sync;
                        /* Drat - have to split this up more */
                        struct bio *mbio = r1_bio->master_bio;
                        int sectors_handled = (r1_bio->sector + max_sectors
-                                              - mbio->bi_sector);
+                                              - mbio->bi_iter.bi_sector);
                        r1_bio->sectors = max_sectors;
                        spin_lock_irq(&conf->device_lock);
                        if (mbio->bi_phys_segments == 0)
                        r1_bio->state = 0;
                        set_bit(R1BIO_ReadError, &r1_bio->state);
                        r1_bio->mddev = mddev;
-                       r1_bio->sector = mbio->bi_sector + sectors_handled;
+                       r1_bio->sector = mbio->bi_iter.bi_sector +
+                               sectors_handled;
  
                        goto read_more;
                } else
@@@ -2598,7 -2602,7 +2601,7 @@@ static sector_t sync_request(struct mdd
                }
                if (bio->bi_end_io) {
                        atomic_inc(&rdev->nr_pending);
-                       bio->bi_sector = sector_nr + rdev->data_offset;
+                       bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
                        bio->bi_bdev = rdev->bdev;
                        bio->bi_private = r1_bio;
                }
                                                        continue;
                                                /* remove last page from this bio */
                                                bio->bi_vcnt--;
-                                               bio->bi_size -= len;
+                                               bio->bi_iter.bi_size -= len;
                                                bio->bi_flags &= ~(1<< BIO_SEG_VALID);
                                        }
                                        goto bio_full;
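
raid1.c follows the same renaming, and the open-coded "bi_sector + bio_sectors(bio) - 1" computations in the debug prints are replaced by bio_end_sector(). For reference, a small sketch of expressing a bio's range with the new fields; the suspend_lo/suspend_hi test above uses exactly this shape. The helper name is illustrative only.

    #include <linux/bio.h>
    #include <linux/types.h>

    /*
     * Sketch: does 'bio' overlap the half-open sector range [lo, hi)?
     * bio_end_sector(bio) == bio->bi_iter.bi_sector + bio_sectors(bio).
     */
    static bool bio_overlaps(struct bio *bio, sector_t lo, sector_t hi)
    {
    	return bio_end_sector(bio) > lo && bio->bi_iter.bi_sector < hi;
    }
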
diff --combined drivers/md/raid10.c
index 8d39d63281b9b5441b3ec8e524955356c8690871,6d43d88657aa81982e1f31540878e822ffd44a39..33fc408e5eacef0a1dce55fd5c0d578fc244b663
@@@ -1152,14 -1152,12 +1152,12 @@@ static void raid10_unplug(struct blk_pl
        kfree(plug);
  }
  
- static void make_request(struct mddev *mddev, struct bio * bio)
+ static void __make_request(struct mddev *mddev, struct bio *bio)
  {
        struct r10conf *conf = mddev->private;
        struct r10bio *r10_bio;
        struct bio *read_bio;
        int i;
-       sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
-       int chunk_sects = chunk_mask + 1;
        const int rw = bio_data_dir(bio);
        const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
        const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
        int max_sectors;
        int sectors;
  
-       if (unlikely(bio->bi_rw & REQ_FLUSH)) {
-               md_flush_request(mddev, bio);
-               return;
-       }
-       /* If this request crosses a chunk boundary, we need to
-        * split it.  This will only happen for 1 PAGE (or less) requests.
-        */
-       if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio)
-                    > chunk_sects
-                    && (conf->geo.near_copies < conf->geo.raid_disks
-                        || conf->prev.near_copies < conf->prev.raid_disks))) {
-               struct bio_pair *bp;
-               /* Sanity check -- queue functions should prevent this happening */
-               if (bio_segments(bio) > 1)
-                       goto bad_map;
-               /* This is a one page bio that upper layers
-                * refuse to split for us, so we need to split it.
-                */
-               bp = bio_split(bio,
-                              chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
-               /* Each of these 'make_request' calls will call 'wait_barrier'.
-                * If the first succeeds but the second blocks due to the resync
-                * thread raising the barrier, we will deadlock because the
-                * IO to the underlying device will be queued in generic_make_request
-                * and will never complete, so will never reduce nr_pending.
-                * So increment nr_waiting here so no new raise_barriers will
-                * succeed, and so the second wait_barrier cannot block.
-                */
-               spin_lock_irq(&conf->resync_lock);
-               conf->nr_waiting++;
-               spin_unlock_irq(&conf->resync_lock);
-               make_request(mddev, &bp->bio1);
-               make_request(mddev, &bp->bio2);
-               spin_lock_irq(&conf->resync_lock);
-               conf->nr_waiting--;
-               wake_up(&conf->wait_barrier);
-               spin_unlock_irq(&conf->resync_lock);
-               bio_pair_release(bp);
-               return;
-       bad_map:
-               printk("md/raid10:%s: make_request bug: can't convert block across chunks"
-                      " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
-                      (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
-               bio_io_error(bio);
-               return;
-       }
-       md_write_start(mddev, bio);
-       /*
-        * Register the new request and wait if the reconstruction
-        * thread has put up a bar for new requests.
-        * Continue immediately if no resync is active currently.
-        */
-       wait_barrier(conf);
        sectors = bio_sectors(bio);
        while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
-           bio->bi_sector < conf->reshape_progress &&
-           bio->bi_sector + sectors > conf->reshape_progress) {
+           bio->bi_iter.bi_sector < conf->reshape_progress &&
+           bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
                /* IO spans the reshape position.  Need to wait for
                 * reshape to pass
                 */
                allow_barrier(conf);
                wait_event(conf->wait_barrier,
-                          conf->reshape_progress <= bio->bi_sector ||
-                          conf->reshape_progress >= bio->bi_sector + sectors);
+                          conf->reshape_progress <= bio->bi_iter.bi_sector ||
+                          conf->reshape_progress >= bio->bi_iter.bi_sector +
+                          sectors);
                wait_barrier(conf);
        }
        if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
            bio_data_dir(bio) == WRITE &&
            (mddev->reshape_backwards
-            ? (bio->bi_sector < conf->reshape_safe &&
-               bio->bi_sector + sectors > conf->reshape_progress)
-            : (bio->bi_sector + sectors > conf->reshape_safe &&
-               bio->bi_sector < conf->reshape_progress))) {
+            ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
+               bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
+            : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
+               bio->bi_iter.bi_sector < conf->reshape_progress))) {
                /* Need to update reshape_position in metadata */
                mddev->reshape_position = conf->reshape_progress;
                set_bit(MD_CHANGE_DEVS, &mddev->flags);
        r10_bio->sectors = sectors;
  
        r10_bio->mddev = mddev;
-       r10_bio->sector = bio->bi_sector;
+       r10_bio->sector = bio->bi_iter.bi_sector;
        r10_bio->state = 0;
  
        /* We might need to issue multiple reads to different
@@@ -1302,13 -1239,13 +1239,13 @@@ read_again
                slot = r10_bio->read_slot;
  
                read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-               bio_trim(read_bio, r10_bio->sector - bio->bi_sector,
+               bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
                         max_sectors);
  
                r10_bio->devs[slot].bio = read_bio;
                r10_bio->devs[slot].rdev = rdev;
  
-               read_bio->bi_sector = r10_bio->devs[slot].addr +
+               read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
                        choose_data_offset(r10_bio, rdev);
                read_bio->bi_bdev = rdev->bdev;
                read_bio->bi_end_io = raid10_end_read_request;
                        /* Could not read all from this device, so we will
                         * need another r10_bio.
                         */
 -                      sectors_handled = (r10_bio->sectors + max_sectors
 +                      sectors_handled = (r10_bio->sector + max_sectors
-                                          - bio->bi_sector);
+                                          - bio->bi_iter.bi_sector);
                        r10_bio->sectors = max_sectors;
                        spin_lock_irq(&conf->device_lock);
                        if (bio->bi_phys_segments == 0)
                                bio->bi_phys_segments = 2;
                        else
                                bio->bi_phys_segments++;
 -                      spin_unlock(&conf->device_lock);
 +                      spin_unlock_irq(&conf->device_lock);
                        /* Cannot call generic_make_request directly
                         * as that will be queued in __generic_make_request
                         * and subsequent mempool_alloc might block
                        r10_bio->sectors = bio_sectors(bio) - sectors_handled;
                        r10_bio->state = 0;
                        r10_bio->mddev = mddev;
-                       r10_bio->sector = bio->bi_sector + sectors_handled;
+                       r10_bio->sector = bio->bi_iter.bi_sector +
+                               sectors_handled;
                        goto read_again;
                } else
                        generic_make_request(read_bio);
@@@ -1499,7 -1437,8 +1437,8 @@@ retry_write
                        bio->bi_phys_segments++;
                spin_unlock_irq(&conf->device_lock);
        }
-       sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
+       sectors_handled = r10_bio->sector + max_sectors -
+               bio->bi_iter.bi_sector;
  
        atomic_set(&r10_bio->remaining, 1);
        bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
                if (r10_bio->devs[i].bio) {
                        struct md_rdev *rdev = conf->mirrors[d].rdev;
                        mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-                       bio_trim(mbio, r10_bio->sector - bio->bi_sector,
+                       bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
                                 max_sectors);
                        r10_bio->devs[i].bio = mbio;
  
-                       mbio->bi_sector = (r10_bio->devs[i].addr+
+                       mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
                                           choose_data_offset(r10_bio,
                                                              rdev));
                        mbio->bi_bdev = rdev->bdev;
                                rdev = conf->mirrors[d].rdev;
                        }
                        mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-                       bio_trim(mbio, r10_bio->sector - bio->bi_sector,
+                       bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
                                 max_sectors);
                        r10_bio->devs[i].repl_bio = mbio;
  
-                       mbio->bi_sector = (r10_bio->devs[i].addr +
+                       mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr +
                                           choose_data_offset(
                                                   r10_bio, rdev));
                        mbio->bi_bdev = rdev->bdev;
                r10_bio->sectors = bio_sectors(bio) - sectors_handled;
  
                r10_bio->mddev = mddev;
-               r10_bio->sector = bio->bi_sector + sectors_handled;
+               r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
                r10_bio->state = 0;
                goto retry_write;
        }
        one_write_done(r10_bio);
+ }
+ static void make_request(struct mddev *mddev, struct bio *bio)
+ {
+       struct r10conf *conf = mddev->private;
+       sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
+       int chunk_sects = chunk_mask + 1;
+       struct bio *split;
+       if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+               md_flush_request(mddev, bio);
+               return;
+       }
+       md_write_start(mddev, bio);
+       /*
+        * Register the new request and wait if the reconstruction
+        * thread has put up a bar for new requests.
+        * Continue immediately if no resync is active currently.
+        */
+       wait_barrier(conf);
+       do {
+               /*
+                * If this request crosses a chunk boundary, we need to split
+                * it.
+                */
+               if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
+                            bio_sectors(bio) > chunk_sects
+                            && (conf->geo.near_copies < conf->geo.raid_disks
+                                || conf->prev.near_copies <
+                                conf->prev.raid_disks))) {
+                       split = bio_split(bio, chunk_sects -
+                                         (bio->bi_iter.bi_sector &
+                                          (chunk_sects - 1)),
+                                         GFP_NOIO, fs_bio_set);
+                       bio_chain(split, bio);
+               } else {
+                       split = bio;
+               }
+               __make_request(mddev, split);
+       } while (split != bio);
  
        /* In case raid10d snuck in to freeze_array */
        wake_up(&conf->wait_barrier);
@@@ -2124,10 -2109,10 +2109,10 @@@ static void sync_request_write(struct m
                bio_reset(tbio);
  
                tbio->bi_vcnt = vcnt;
-               tbio->bi_size = r10_bio->sectors << 9;
+               tbio->bi_iter.bi_size = r10_bio->sectors << 9;
                tbio->bi_rw = WRITE;
                tbio->bi_private = r10_bio;
-               tbio->bi_sector = r10_bio->devs[i].addr;
+               tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
  
                for (j=0; j < vcnt ; j++) {
                        tbio->bi_io_vec[j].bv_offset = 0;
                atomic_inc(&r10_bio->remaining);
                md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
  
-               tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
+               tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
                tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
                generic_make_request(tbio);
        }
@@@ -2614,8 -2599,8 +2599,8 @@@ static int narrow_write_error(struct r1
                        sectors = sect_to_write;
                /* Write at 'sector' for 'sectors' */
                wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-               bio_trim(wbio, sector - bio->bi_sector, sectors);
-               wbio->bi_sector = (r10_bio->devs[i].addr+
+               bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
+               wbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
                                   choose_data_offset(r10_bio, rdev) +
                                   (sector - r10_bio->sector));
                wbio->bi_bdev = rdev->bdev;
@@@ -2687,10 -2672,10 +2672,10 @@@ read_more
                (unsigned long long)r10_bio->sector);
        bio = bio_clone_mddev(r10_bio->master_bio,
                              GFP_NOIO, mddev);
-       bio_trim(bio, r10_bio->sector - bio->bi_sector, max_sectors);
+       bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
        r10_bio->devs[slot].bio = bio;
        r10_bio->devs[slot].rdev = rdev;
-       bio->bi_sector = r10_bio->devs[slot].addr
+       bio->bi_iter.bi_sector = r10_bio->devs[slot].addr
                + choose_data_offset(r10_bio, rdev);
        bio->bi_bdev = rdev->bdev;
        bio->bi_rw = READ | do_sync;
                struct bio *mbio = r10_bio->master_bio;
                int sectors_handled =
                        r10_bio->sector + max_sectors
-                       - mbio->bi_sector;
+                       - mbio->bi_iter.bi_sector;
                r10_bio->sectors = max_sectors;
                spin_lock_irq(&conf->device_lock);
                if (mbio->bi_phys_segments == 0)
                set_bit(R10BIO_ReadError,
                        &r10_bio->state);
                r10_bio->mddev = mddev;
-               r10_bio->sector = mbio->bi_sector
+               r10_bio->sector = mbio->bi_iter.bi_sector
                        + sectors_handled;
  
                goto read_more;
@@@ -3157,7 -3142,8 +3142,8 @@@ static sector_t sync_request(struct mdd
                                bio->bi_end_io = end_sync_read;
                                bio->bi_rw = READ;
                                from_addr = r10_bio->devs[j].addr;
-                               bio->bi_sector = from_addr + rdev->data_offset;
+                               bio->bi_iter.bi_sector = from_addr +
+                                       rdev->data_offset;
                                bio->bi_bdev = rdev->bdev;
                                atomic_inc(&rdev->nr_pending);
                                /* and we write to 'i' (if not in_sync) */
                                        bio->bi_private = r10_bio;
                                        bio->bi_end_io = end_sync_write;
                                        bio->bi_rw = WRITE;
-                                       bio->bi_sector = to_addr
+                                       bio->bi_iter.bi_sector = to_addr
                                                + rdev->data_offset;
                                        bio->bi_bdev = rdev->bdev;
                                        atomic_inc(&r10_bio->remaining);
                                bio->bi_private = r10_bio;
                                bio->bi_end_io = end_sync_write;
                                bio->bi_rw = WRITE;
-                               bio->bi_sector = to_addr + rdev->data_offset;
+                               bio->bi_iter.bi_sector = to_addr +
+                                       rdev->data_offset;
                                bio->bi_bdev = rdev->bdev;
                                atomic_inc(&r10_bio->remaining);
                                break;
                        if (j == conf->copies) {
                                /* Cannot recover, so abort the recovery or
                                 * record a bad block */
 -                              put_buf(r10_bio);
 -                              if (rb2)
 -                                      atomic_dec(&rb2->remaining);
 -                              r10_bio = rb2;
                                if (any_working) {
                                        /* problem is that there are bad blocks
                                         * on other device(s)
                                        mirror->recovery_disabled
                                                = mddev->recovery_disabled;
                                }
 +                              put_buf(r10_bio);
 +                              if (rb2)
 +                                      atomic_dec(&rb2->remaining);
 +                              r10_bio = rb2;
                                break;
                        }
                }
                        bio->bi_private = r10_bio;
                        bio->bi_end_io = end_sync_read;
                        bio->bi_rw = READ;
-                       bio->bi_sector = sector +
+                       bio->bi_iter.bi_sector = sector +
                                conf->mirrors[d].rdev->data_offset;
                        bio->bi_bdev = conf->mirrors[d].rdev->bdev;
                        count++;
                        bio->bi_private = r10_bio;
                        bio->bi_end_io = end_sync_write;
                        bio->bi_rw = WRITE;
-                       bio->bi_sector = sector +
+                       bio->bi_iter.bi_sector = sector +
                                conf->mirrors[d].replacement->data_offset;
                        bio->bi_bdev = conf->mirrors[d].replacement->bdev;
                        count++;
                             bio2 = bio2->bi_next) {
                                /* remove last page from this bio */
                                bio2->bi_vcnt--;
-                               bio2->bi_size -= len;
+                               bio2->bi_iter.bi_size -= len;
                                bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
                        }
                        goto bio_full;
@@@ -3747,8 -3734,7 +3734,8 @@@ static int run(struct mddev *mddev
                    !test_bit(In_sync, &disk->rdev->flags)) {
                        disk->head_position = 0;
                        mddev->degraded++;
 -                      if (disk->rdev)
 +                      if (disk->rdev &&
 +                          disk->rdev->saved_raid_disk < 0)
                                conf->fullsync = 1;
                }
                disk->recovery_disabled = mddev->recovery_disabled - 1;
@@@ -4418,7 -4404,7 +4405,7 @@@ read_more
        read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
  
        read_bio->bi_bdev = rdev->bdev;
-       read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
+       read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
                               + rdev->data_offset);
        read_bio->bi_private = r10_bio;
        read_bio->bi_end_io = end_sync_read;
        read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
        read_bio->bi_flags |= 1 << BIO_UPTODATE;
        read_bio->bi_vcnt = 0;
-       read_bio->bi_size = 0;
+       read_bio->bi_iter.bi_size = 0;
        r10_bio->master_bio = read_bio;
        r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
  
  
                bio_reset(b);
                b->bi_bdev = rdev2->bdev;
-               b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
+               b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
+                       rdev2->new_data_offset;
                b->bi_private = r10_bio;
                b->bi_end_io = end_reshape_write;
                b->bi_rw = WRITE;
                             bio2 = bio2->bi_next) {
                                /* Remove last page from this bio */
                                bio2->bi_vcnt--;
-                               bio2->bi_size -= len;
+                               bio2->bi_iter.bi_size -= len;
                                bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
                        }
                        goto bio_full;
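
The rewritten raid10 make_request() above no longer relies on bio_pair_split(); with chained bios it can split at a chunk boundary any number of times, handing each front piece to __make_request() until the remainder fits. A reduced sketch of that split-and-chain loop follows; chunk_sectors (assumed to be a power of two, as raid10's chunk_mask implies) and the handle_one() callback are placeholders.

    #include <linux/bio.h>

    /* Placeholder for the per-piece work (raid10 uses __make_request()). */
    static void handle_one(struct bio *bio);

    /* Sketch: submit 'bio' so that no piece crosses a chunk_sectors boundary. */
    static void split_per_chunk(struct bio *bio, unsigned int chunk_sectors)
    {
    	struct bio *split;

    	do {
    		sector_t offset = bio->bi_iter.bi_sector & (chunk_sectors - 1);

    		if (offset + bio_sectors(bio) > chunk_sectors) {
    			/* Front piece up to the boundary; its completion is
    			 * chained back to the parent bio. */
    			split = bio_split(bio, chunk_sectors - offset,
    					  GFP_NOIO, fs_bio_set);
    			bio_chain(split, bio);
    		} else {
    			split = bio;	/* last (or only) piece */
    		}
    		handle_one(split);
    	} while (split != bio);
    }
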
diff --combined drivers/md/raid5.c
index 03f82ab87d9e73eb4fed4ede052c95fa5d891f09,eea63372e4d30533b2255159c8b428b2ad90acb3..67ca9c3d2939c5e4468d51f0ea0454dfdceac731
@@@ -133,7 -133,7 +133,7 @@@ static inline void unlock_all_device_ha
  static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
  {
        int sectors = bio_sectors(bio);
-       if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
+       if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
                return bio->bi_next;
        else
                return NULL;
@@@ -225,7 -225,7 +225,7 @@@ static void return_io(struct bio *retur
  
                return_bi = bi->bi_next;
                bi->bi_next = NULL;
-               bi->bi_size = 0;
+               bi->bi_iter.bi_size = 0;
                trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
                                         bi, 0);
                bio_endio(bi, 0);
@@@ -675,10 -675,8 +675,10 @@@ get_active_stripe(struct r5conf *conf, 
                                         || !conf->inactive_blocked),
                                        *(conf->hash_locks + hash));
                                conf->inactive_blocked = 0;
 -                      } else
 +                      } else {
                                init_stripe(sh, sector, previous);
 +                              atomic_inc(&sh->count);
 +                      }
                } else {
                        spin_lock(&conf->device_lock);
                        if (atomic_read(&sh->count)) {
                        } else {
                                if (!test_bit(STRIPE_HANDLE, &sh->state))
                                        atomic_inc(&conf->active_stripes);
 -                              BUG_ON(list_empty(&sh->lru));
 +                              BUG_ON(list_empty(&sh->lru) &&
 +                                     !test_bit(STRIPE_EXPANDING, &sh->state));
                                list_del_init(&sh->lru);
                                if (sh->group) {
                                        sh->group->stripes_cnt--;
                                        sh->group = NULL;
                                }
                        }
 +                      atomic_inc(&sh->count);
                        spin_unlock(&conf->device_lock);
                }
        } while (sh == NULL);
  
 -      if (sh)
 -              atomic_inc(&sh->count);
 -
        spin_unlock_irq(conf->hash_locks + hash);
        return sh;
  }
@@@ -852,10 -851,10 +852,10 @@@ static void ops_run_io(struct stripe_he
                                bi->bi_rw, i);
                        atomic_inc(&sh->count);
                        if (use_new_offset(conf, sh))
-                               bi->bi_sector = (sh->sector
+                               bi->bi_iter.bi_sector = (sh->sector
                                                 + rdev->new_data_offset);
                        else
-                               bi->bi_sector = (sh->sector
+                               bi->bi_iter.bi_sector = (sh->sector
                                                 + rdev->data_offset);
                        if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
                                bi->bi_rw |= REQ_NOMERGE;
                        bi->bi_vcnt = 1;
                        bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
                        bi->bi_io_vec[0].bv_offset = 0;
-                       bi->bi_size = STRIPE_SIZE;
+                       bi->bi_iter.bi_size = STRIPE_SIZE;
                        /*
                         * If this is discard request, set bi_vcnt 0. We don't
                         * want to confuse SCSI because SCSI will replace payload
                                rbi->bi_rw, i);
                        atomic_inc(&sh->count);
                        if (use_new_offset(conf, sh))
-                               rbi->bi_sector = (sh->sector
+                               rbi->bi_iter.bi_sector = (sh->sector
                                                  + rrdev->new_data_offset);
                        else
-                               rbi->bi_sector = (sh->sector
+                               rbi->bi_iter.bi_sector = (sh->sector
                                                  + rrdev->data_offset);
                        rbi->bi_vcnt = 1;
                        rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
                        rbi->bi_io_vec[0].bv_offset = 0;
-                       rbi->bi_size = STRIPE_SIZE;
+                       rbi->bi_iter.bi_size = STRIPE_SIZE;
                        /*
                         * If this is discard request, set bi_vcnt 0. We don't
                         * want to confuse SCSI because SCSI will replace payload
@@@ -935,24 -934,24 +935,24 @@@ static struct dma_async_tx_descriptor 
  async_copy_data(int frombio, struct bio *bio, struct page *page,
        sector_t sector, struct dma_async_tx_descriptor *tx)
  {
-       struct bio_vec *bvl;
+       struct bio_vec bvl;
+       struct bvec_iter iter;
        struct page *bio_page;
-       int i;
        int page_offset;
        struct async_submit_ctl submit;
        enum async_tx_flags flags = 0;
  
-       if (bio->bi_sector >= sector)
-               page_offset = (signed)(bio->bi_sector - sector) * 512;
+       if (bio->bi_iter.bi_sector >= sector)
+               page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
        else
-               page_offset = (signed)(sector - bio->bi_sector) * -512;
+               page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
  
        if (frombio)
                flags |= ASYNC_TX_FENCE;
        init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
  
-       bio_for_each_segment(bvl, bio, i) {
-               int len = bvl->bv_len;
+       bio_for_each_segment(bvl, bio, iter) {
+               int len = bvl.bv_len;
                int clen;
                int b_offset = 0;
  
                        clen = len;
  
                if (clen > 0) {
-                       b_offset += bvl->bv_offset;
-                       bio_page = bvl->bv_page;
+                       b_offset += bvl.bv_offset;
+                       bio_page = bvl.bv_page;
                        if (frombio)
                                tx = async_memcpy(page, bio_page, page_offset,
                                                  b_offset, clen, &submit);
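
This hunk shows the user-visible side of the immutable biovec work: bio_for_each_segment() now yields a struct bio_vec by value and is driven by a struct bvec_iter, instead of dereferencing bi_io_vec with an integer index. A minimal sketch of walking a bio's segments with the new iterator; the byte-counting body is just an example, not raid5 code.

    #include <linux/bio.h>

    /* Sketch: total up the payload bytes in a bio with the 3.14 iterator API. */
    static unsigned int count_bio_bytes(struct bio *bio)
    {
    	struct bio_vec bvec;		/* each segment is copied out by value */
    	struct bvec_iter iter;		/* replaces the old integer index */
    	unsigned int bytes = 0;

    	bio_for_each_segment(bvec, bio, iter)
    		bytes += bvec.bv_len;	/* bvec.bv_page / bvec.bv_offset are valid here too */

    	return bytes;
    }
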
@@@ -1012,7 -1011,7 +1012,7 @@@ static void ops_complete_biofill(void *
                        BUG_ON(!dev->read);
                        rbi = dev->read;
                        dev->read = NULL;
-                       while (rbi && rbi->bi_sector <
+                       while (rbi && rbi->bi_iter.bi_sector <
                                dev->sector + STRIPE_SECTORS) {
                                rbi2 = r5_next_bio(rbi, dev->sector);
                                if (!raid5_dec_bi_active_stripes(rbi)) {
@@@ -1048,7 -1047,7 +1048,7 @@@ static void ops_run_biofill(struct stri
                        dev->read = rbi = dev->toread;
                        dev->toread = NULL;
                        spin_unlock_irq(&sh->stripe_lock);
-                       while (rbi && rbi->bi_sector <
+                       while (rbi && rbi->bi_iter.bi_sector <
                                dev->sector + STRIPE_SECTORS) {
                                tx = async_copy_data(0, rbi, dev->page,
                                        dev->sector, tx);
@@@ -1390,7 -1389,7 +1390,7 @@@ ops_run_biodrain(struct stripe_head *sh
                        wbi = dev->written = chosen;
                        spin_unlock_irq(&sh->stripe_lock);
  
-                       while (wbi && wbi->bi_sector <
+                       while (wbi && wbi->bi_iter.bi_sector <
                                dev->sector + STRIPE_SECTORS) {
                                if (wbi->bi_rw & REQ_FUA)
                                        set_bit(R5_WantFUA, &dev->flags);
@@@ -2111,7 -2110,6 +2111,7 @@@ static void raid5_end_write_request(str
                        set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
        } else {
                if (!uptodate) {
 +                      set_bit(STRIPE_DEGRADED, &sh->state);
                        set_bit(WriteErrorSeen, &rdev->flags);
                        set_bit(R5_WriteError, &sh->dev[i].flags);
                        if (!test_and_set_bit(WantReplacement, &rdev->flags))
@@@ -2615,7 -2613,7 +2615,7 @@@ static int add_stripe_bio(struct stripe
        int firstwrite=0;
  
        pr_debug("adding bi b#%llu to stripe s#%llu\n",
-               (unsigned long long)bi->bi_sector,
+               (unsigned long long)bi->bi_iter.bi_sector,
                (unsigned long long)sh->sector);
  
        /*
                        firstwrite = 1;
        } else
                bip = &sh->dev[dd_idx].toread;
-       while (*bip && (*bip)->bi_sector < bi->bi_sector) {
-               if (bio_end_sector(*bip) > bi->bi_sector)
+       while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
+               if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
                        goto overlap;
                bip = & (*bip)->bi_next;
        }
-       if (*bip && (*bip)->bi_sector < bio_end_sector(bi))
+       if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
                goto overlap;
  
        BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
                sector_t sector = sh->dev[dd_idx].sector;
                for (bi=sh->dev[dd_idx].towrite;
                     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
-                            bi && bi->bi_sector <= sector;
+                            bi && bi->bi_iter.bi_sector <= sector;
                     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
                        if (bio_end_sector(bi) >= sector)
                                sector = bio_end_sector(bi);
        }
  
        pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
-               (unsigned long long)(*bip)->bi_sector,
+               (unsigned long long)(*bip)->bi_iter.bi_sector,
                (unsigned long long)sh->sector, dd_idx);
        spin_unlock_irq(&sh->stripe_lock);
  
@@@ -2737,7 -2735,7 +2737,7 @@@ handle_failed_stripe(struct r5conf *con
                if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
                        wake_up(&conf->wait_for_overlap);
  
-               while (bi && bi->bi_sector <
+               while (bi && bi->bi_iter.bi_sector <
                        sh->dev[i].sector + STRIPE_SECTORS) {
                        struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
                        clear_bit(BIO_UPTODATE, &bi->bi_flags);
                bi = sh->dev[i].written;
                sh->dev[i].written = NULL;
                if (bi) bitmap_end = 1;
-               while (bi && bi->bi_sector <
+               while (bi && bi->bi_iter.bi_sector <
                       sh->dev[i].sector + STRIPE_SECTORS) {
                        struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
                        clear_bit(BIO_UPTODATE, &bi->bi_flags);
                        spin_unlock_irq(&sh->stripe_lock);
                        if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
                                wake_up(&conf->wait_for_overlap);
-                       while (bi && bi->bi_sector <
+                       while (bi && bi->bi_iter.bi_sector <
                               sh->dev[i].sector + STRIPE_SECTORS) {
                                struct bio *nextbi =
                                        r5_next_bio(bi, sh->dev[i].sector);
@@@ -3004,7 -3002,7 +3004,7 @@@ static void handle_stripe_clean_event(s
                                        clear_bit(R5_UPTODATE, &dev->flags);
                                wbi = dev->written;
                                dev->written = NULL;
-                               while (wbi && wbi->bi_sector <
+                               while (wbi && wbi->bi_iter.bi_sector <
                                        dev->sector + STRIPE_SECTORS) {
                                        wbi2 = r5_next_bio(wbi, dev->sector);
                                        if (!raid5_dec_bi_active_stripes(wbi)) {
@@@ -3610,7 -3608,7 +3610,7 @@@ static void analyse_stripe(struct strip
                         */
                        set_bit(R5_Insync, &dev->flags);
  
 -              if (rdev && test_bit(R5_WriteError, &dev->flags)) {
 +              if (test_bit(R5_WriteError, &dev->flags)) {
                        /* This flag does not apply to '.replacement'
                         * only to .rdev, so make sure to check that*/
                        struct md_rdev *rdev2 = rcu_dereference(
                        } else
                                clear_bit(R5_WriteError, &dev->flags);
                }
 -              if (rdev && test_bit(R5_MadeGood, &dev->flags)) {
 +              if (test_bit(R5_MadeGood, &dev->flags)) {
                        /* This flag does not apply to '.replacement'
                         * only to .rdev, so make sure to check that*/
                        struct md_rdev *rdev2 = rcu_dereference(
@@@ -4096,7 -4094,7 +4096,7 @@@ static int raid5_mergeable_bvec(struct 
  
  static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
  {
-       sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+       sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
        unsigned int chunk_sectors = mddev->chunk_sectors;
        unsigned int bio_sectors = bio_sectors(bio);
  
@@@ -4233,9 -4231,9 +4233,9 @@@ static int chunk_aligned_read(struct md
        /*
         *      compute position
         */
-       align_bi->bi_sector =  raid5_compute_sector(conf, raid_bio->bi_sector,
-                                                   0,
-                                                   &dd_idx, NULL);
+       align_bi->bi_iter.bi_sector =
+               raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
+                                    0, &dd_idx, NULL);
  
        end_sector = bio_end_sector(align_bi);
        rcu_read_lock();
                align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
  
                if (!bio_fits_rdev(align_bi) ||
-                   is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi),
+                   is_badblock(rdev, align_bi->bi_iter.bi_sector,
+                               bio_sectors(align_bi),
                                &first_bad, &bad_sectors)) {
                        /* too big in some way, or has a known bad block */
                        bio_put(align_bi);
                }
  
                /* No reshape active, so we can trust rdev->data_offset */
-               align_bi->bi_sector += rdev->data_offset;
+               align_bi->bi_iter.bi_sector += rdev->data_offset;
  
                spin_lock_irq(&conf->device_lock);
                wait_event_lock_irq(conf->wait_for_stripe,
                if (mddev->gendisk)
                        trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
                                              align_bi, disk_devt(mddev->gendisk),
-                                             raid_bio->bi_sector);
+                                             raid_bio->bi_iter.bi_sector);
                generic_make_request(align_bi);
                return 1;
        } else {
@@@ -4464,8 -4463,8 +4465,8 @@@ static void make_discard_request(struc
                /* Skip discard while reshape is happening */
                return;
  
-       logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
-       last_sector = bi->bi_sector + (bi->bi_size>>9);
+       logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+       last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
  
        bi->bi_next = NULL;
        bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
@@@ -4569,7 -4568,7 +4570,7 @@@ static void make_request(struct mddev *
                return;
        }
  
-       logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+       logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
        last_sector = bio_end_sector(bi);
        bi->bi_next = NULL;
        bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
@@@ -5053,7 -5052,8 +5054,8 @@@ static int  retry_aligned_read(struct r
        int remaining;
        int handled = 0;
  
-       logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+       logical_sector = raid_bio->bi_iter.bi_sector &
+               ~((sector_t)STRIPE_SECTORS-1);
        sector = raid5_compute_sector(conf, logical_sector,
                                      0, &dd_idx, NULL);
        last_sector = bio_end_sector(raid_bio);
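
  The raid5 hunks above all apply the same immutable-biovec conversion: a bio's current position now lives in bio->bi_iter, so reads of bi_sector and bi_size become bi_iter.bi_sector and bi_iter.bi_size. A minimal sketch of the pattern (illustrative only, not part of the patch; it mirrors what bio_end_sector() computes):

	#include <linux/bio.h>

	/* sketch: end sector of a bio under the 3.14 bi_iter layout */
	static inline sector_t sketch_bio_end_sector(struct bio *bio)
	{
		return bio->bi_iter.bi_sector + (bio->bi_iter.bi_size >> 9);
	}
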
diff --combined drivers/s390/block/xpram.c
index 58141f0651f280b4cc48b510f09455edf8b6cc0a,3e530f9da8c48a42a2c11b1e4c58b04b284d161c..6969d39f1e2eba7de41856cabc0d1557b7f3efe4
@@@ -184,25 -184,26 +184,26 @@@ static unsigned long xpram_highest_page
  static void xpram_make_request(struct request_queue *q, struct bio *bio)
  {
        xpram_device_t *xdev = bio->bi_bdev->bd_disk->private_data;
-       struct bio_vec *bvec;
+       struct bio_vec bvec;
+       struct bvec_iter iter;
        unsigned int index;
        unsigned long page_addr;
        unsigned long bytes;
-       int i;
  
-       if ((bio->bi_sector & 7) != 0 || (bio->bi_size & 4095) != 0)
+       if ((bio->bi_iter.bi_sector & 7) != 0 ||
+           (bio->bi_iter.bi_size & 4095) != 0)
                /* Request is not page-aligned. */
                goto fail;
-       if ((bio->bi_size >> 12) > xdev->size)
+       if ((bio->bi_iter.bi_size >> 12) > xdev->size)
                /* Request size is no page-aligned. */
                goto fail;
-       if ((bio->bi_sector >> 3) > 0xffffffffU - xdev->offset)
+       if ((bio->bi_iter.bi_sector >> 3) > 0xffffffffU - xdev->offset)
                goto fail;
-       index = (bio->bi_sector >> 3) + xdev->offset;
-       bio_for_each_segment(bvec, bio, i) {
+       index = (bio->bi_iter.bi_sector >> 3) + xdev->offset;
+       bio_for_each_segment(bvec, bio, iter) {
                page_addr = (unsigned long)
-                       kmap(bvec->bv_page) + bvec->bv_offset;
-               bytes = bvec->bv_len;
+                       kmap(bvec.bv_page) + bvec.bv_offset;
+               bytes = bvec.bv_len;
                if ((page_addr & 4095) != 0 || (bytes & 4095) != 0)
                        /* More paranoia. */
                        goto fail;
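
  The xpram conversion above also shows the new iteration style: bio_for_each_segment() now takes a struct bvec_iter and yields each struct bio_vec by value, so segment fields are read with '.' rather than '->'. A self-contained sketch of the idiom (the helper name is hypothetical):

	#include <linux/bio.h>
	#include <linux/highmem.h>

	static void sketch_walk_bio(struct bio *bio)
	{
		struct bio_vec bvec;
		struct bvec_iter iter;

		bio_for_each_segment(bvec, bio, iter) {
			void *addr = kmap(bvec.bv_page) + bvec.bv_offset;
			/* process bvec.bv_len bytes starting at addr */
			kunmap(bvec.bv_page);
		}
	}
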
@@@ -257,7 -258,6 +258,7 @@@ static int __init xpram_setup_sizes(uns
        unsigned long mem_needed;
        unsigned long mem_auto;
        unsigned long long size;
 +      char *sizes_end;
        int mem_auto_no;
        int i;
  
        mem_auto_no = 0;
        for (i = 0; i < xpram_devs; i++) {
                if (sizes[i]) {
 -                      size = simple_strtoull(sizes[i], &sizes[i], 0);
 -                      switch (sizes[i][0]) {
 +                      size = simple_strtoull(sizes[i], &sizes_end, 0);
 +                      switch (*sizes_end) {
                        case 'g':
                        case 'G':
                                size <<= 20;
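
  The sizes[] hunk above fixes xpram_setup_sizes() so that simple_strtoull() no longer overwrites the module parameter string; parsing goes through a separate end pointer. Roughly (only the 'g'/'G' suffix is visible in the hunk, so other suffixes are omitted here):

	char *end;
	unsigned long long size = simple_strtoull(sizes[i], &end, 0);

	switch (*end) {
	case 'g':
	case 'G':
		size <<= 20;	/* matches the hunk above; sizes appear to be in KB */
		break;
	}
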
diff --combined drivers/scsi/sd.c
index 9846c6ab2aaa92eeab130a92fe4d7b8d539b624c,5c8a3b696a1dbf3ab18f914225e36f7c4d0a3a31..470954aba7289a758a650cd82b2f1dfe50ae54f1
@@@ -110,7 -110,7 +110,7 @@@ static int sd_suspend_runtime(struct de
  static int sd_resume(struct device *);
  static void sd_rescan(struct device *);
  static int sd_done(struct scsi_cmnd *);
 -static int sd_eh_action(struct scsi_cmnd *, unsigned char *, int, int);
 +static int sd_eh_action(struct scsi_cmnd *, int);
  static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer);
  static void scsi_disk_release(struct device *cdev);
  static void sd_print_sense_hdr(struct scsi_disk *, struct scsi_sense_hdr *);
@@@ -801,7 -801,7 +801,7 @@@ static int sd_setup_write_same_cmnd(str
        if (sdkp->device->no_write_same)
                return BLKPREP_KILL;
  
-       BUG_ON(bio_offset(bio) || bio_iovec(bio)->bv_len != sdp->sector_size);
+       BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size);
  
        sector >>= ilog2(sdp->sector_size) - 9;
        nr_sectors >>= ilog2(sdp->sector_size) - 9;
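
  The sd_setup_write_same_cmnd() change reflects that bio_iovec() now returns the first bio_vec by value rather than a pointer, hence the '.' access in the BUG_ON above. Sketch of the accessor change (variable names are illustrative):

	struct bio_vec first = bio_iovec(bio);

	/* first segment length and offset, read by value */
	unsigned int len = first.bv_len;
	unsigned int off = first.bv_offset;
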
@@@ -1551,23 -1551,23 +1551,23 @@@ static const struct block_device_operat
  /**
   *    sd_eh_action - error handling callback
   *    @scmd:          sd-issued command that has failed
 - *    @eh_cmnd:       The command that was sent during error handling
 - *    @eh_cmnd_len:   Length of eh_cmnd in bytes
   *    @eh_disp:       The recovery disposition suggested by the midlayer
   *
 - *    This function is called by the SCSI midlayer upon completion of
 - *    an error handling command (TEST UNIT READY, START STOP UNIT,
 - *    etc.) The command sent to the device by the error handler is
 - *    stored in eh_cmnd. The result of sending the eh command is
 - *    passed in eh_disp.
 + *    This function is called by the SCSI midlayer upon completion of an
 + *    error test command (currently TEST UNIT READY). The result of sending
 + *    the eh command is passed in eh_disp.  We're looking for devices that
 + *    fail medium access commands but are OK with non access commands like
 + *    test unit ready (so wrongly see the device as having a successful
 + *    recovery)
   **/
 -static int sd_eh_action(struct scsi_cmnd *scmd, unsigned char *eh_cmnd,
 -                      int eh_cmnd_len, int eh_disp)
 +static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp)
  {
        struct scsi_disk *sdkp = scsi_disk(scmd->request->rq_disk);
  
        if (!scsi_device_online(scmd->device) ||
 -          !scsi_medium_access_command(scmd))
 +          !scsi_medium_access_command(scmd) ||
 +          host_byte(scmd->result) != DID_TIME_OUT ||
 +          eh_disp != SUCCESS)
                return eh_disp;
  
        /*
         * process of recovering or has it suffered an internal failure
         * that prevents access to the storage medium.
         */
 -      if (host_byte(scmd->result) == DID_TIME_OUT && eh_disp == SUCCESS &&
 -          eh_cmnd_len && eh_cmnd[0] == TEST_UNIT_READY)
 -              sdkp->medium_access_timed_out++;
 +      sdkp->medium_access_timed_out++;
  
        /*
         * If the device keeps failing read/write commands but TEST UNIT
@@@ -1626,7 -1628,7 +1626,7 @@@ static unsigned int sd_completed_bytes(
                end_lba <<= 1;
        } else {
                /* be careful ... don't want any overflows */
 -              u64 factor = scmd->device->sector_size / 512;
 +              unsigned int factor = scmd->device->sector_size / 512;
                do_div(start_lba, factor);
                do_div(end_lba, factor);
        }
diff --combined drivers/staging/lustre/lustre/llite/lloop.c
index 5338e8d4c50fa998582fb86209f66c95a11419a8,581ff78be1a2a4b6e19d39d0cd36e5075fd997c0..0718905adeb256cb2a2dd12336f3dbb7db365d23
@@@ -194,10 -194,10 +194,10 @@@ static int do_bio_lustrebacked(struct l
        struct cl_object     *obj = ll_i2info(inode)->lli_clob;
        pgoff_t        offset;
        int                ret;
-       int                i;
        int                rw;
        obd_count            page_count = 0;
-       struct bio_vec       *bvec;
+       struct bio_vec       bvec;
+       struct bvec_iter   iter;
        struct bio         *bio;
        ssize_t        bytes;
  
        for (bio = head; bio != NULL; bio = bio->bi_next) {
                LASSERT(rw == bio->bi_rw);
  
-               offset = (pgoff_t)(bio->bi_sector << 9) + lo->lo_offset;
-               bio_for_each_segment(bvec, bio, i) {
-                       BUG_ON(bvec->bv_offset != 0);
-                       BUG_ON(bvec->bv_len != PAGE_CACHE_SIZE);
+               offset = (pgoff_t)(bio->bi_iter.bi_sector << 9) + lo->lo_offset;
+               bio_for_each_segment(bvec, bio, iter) {
+                       BUG_ON(bvec.bv_offset != 0);
+                       BUG_ON(bvec.bv_len != PAGE_CACHE_SIZE);
  
-                       pages[page_count] = bvec->bv_page;
+                       pages[page_count] = bvec.bv_page;
                        offsets[page_count] = offset;
                        page_count++;
-                       offset += bvec->bv_len;
+                       offset += bvec.bv_len;
                }
                LASSERT(page_count <= LLOOP_MAX_SEGMENTS);
        }
@@@ -313,7 -313,8 +313,8 @@@ static unsigned int loop_get_bio(struc
        bio = &lo->lo_bio;
        while (*bio && (*bio)->bi_rw == rw) {
                CDEBUG(D_INFO, "bio sector %llu size %u count %u vcnt%u \n",
-                      (unsigned long long)(*bio)->bi_sector, (*bio)->bi_size,
+                      (unsigned long long)(*bio)->bi_iter.bi_sector,
+                      (*bio)->bi_iter.bi_size,
                       page_count, (*bio)->bi_vcnt);
                if (page_count + (*bio)->bi_vcnt > LLOOP_MAX_SEGMENTS)
                        break;
@@@ -347,7 -348,8 +348,8 @@@ static void loop_make_request(struct re
                goto err;
  
        CDEBUG(D_INFO, "submit bio sector %llu size %u\n",
-              (unsigned long long)old_bio->bi_sector, old_bio->bi_size);
+              (unsigned long long)old_bio->bi_iter.bi_sector,
+              old_bio->bi_iter.bi_size);
  
        spin_lock_irq(&lo->lo_lock);
        inactive = (lo->lo_state != LLOOP_BOUND);
        loop_add_bio(lo, old_bio);
        return;
  err:
-       cfs_bio_io_error(old_bio, old_bio->bi_size);
+       cfs_bio_io_error(old_bio, old_bio->bi_iter.bi_size);
  }
  
  
@@@ -378,7 -380,7 +380,7 @@@ static inline void loop_handle_bio(stru
        while (bio) {
                struct bio *tmp = bio->bi_next;
                bio->bi_next = NULL;
-               cfs_bio_endio(bio, bio->bi_size, ret);
+               cfs_bio_endio(bio, bio->bi_iter.bi_size, ret);
                bio = tmp;
        }
  }
@@@ -856,8 -858,7 +858,8 @@@ static void lloop_exit(void
  module_init(lloop_init);
  module_exit(lloop_exit);
  
 -CFS_MODULE_PARM(max_loop, "i", int, 0444, "maximum of lloop_device");
 +module_param(max_loop, int, 0444);
 +MODULE_PARM_DESC(max_loop, "maximum of lloop_device");
  MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre virtual block device");
  MODULE_LICENSE("GPL");
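
  The last lloop hunk swaps the libcfs CFS_MODULE_PARM wrapper for the standard kernel macros. A minimal sketch of the replacement (the default value is an assumption, not taken from the patch):

	#include <linux/module.h>

	static int max_loop = 8;	/* assumed default */
	module_param(max_loop, int, 0444);
	MODULE_PARM_DESC(max_loop, "maximum of lloop_device");
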
diff --combined fs/btrfs/inode.c
index 514b291b135405dd1fbd21f9a8e4edc1b161f5af,7ab0e94ad49244e6167e2c758383c78c233f0cbb..d546d8c3038baa4451aa2f338a0c24592a3ea48f
@@@ -1577,7 -1577,7 +1577,7 @@@ int btrfs_merge_bio_hook(int rw, struc
                         unsigned long bio_flags)
  {
        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-       u64 logical = (u64)bio->bi_sector << 9;
+       u64 logical = (u64)bio->bi_iter.bi_sector << 9;
        u64 length = 0;
        u64 map_length;
        int ret;
        if (bio_flags & EXTENT_BIO_COMPRESSED)
                return 0;
  
-       length = bio->bi_size;
+       length = bio->bi_iter.bi_size;
        map_length = length;
        ret = btrfs_map_block(root->fs_info, rw, logical,
                              &map_length, NULL, 0);
@@@ -4354,12 -4354,8 +4354,12 @@@ static int btrfs_setsize(struct inode *
         * these flags set.  For all other operations the VFS set these flags
         * explicitly if it wants a timestamp update.
         */
 -      if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
 -              inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
 +      if (newsize != oldsize) {
 +              inode_inc_iversion(inode);
 +              if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
 +                      inode->i_ctime = inode->i_mtime =
 +                              current_fs_time(inode->i_sb);
 +      }
  
        if (newsize > oldsize) {
                truncate_pagecache(inode, newsize);
@@@ -4468,7 -4464,7 +4468,7 @@@ static int btrfs_setattr(struct dentry 
                err = btrfs_dirty_inode(inode);
  
                if (!err && attr->ia_valid & ATTR_MODE)
 -                      err = btrfs_acl_chmod(inode);
 +                      err = posix_acl_chmod(inode, inode->i_mode);
        }
  
        return err;
@@@ -6783,17 -6779,16 +6783,16 @@@ unlock_err
  static void btrfs_endio_direct_read(struct bio *bio, int err)
  {
        struct btrfs_dio_private *dip = bio->bi_private;
-       struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
-       struct bio_vec *bvec = bio->bi_io_vec;
+       struct bio_vec *bvec;
        struct inode *inode = dip->inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct bio *dio_bio;
        u32 *csums = (u32 *)dip->csum;
-       int index = 0;
        u64 start;
+       int i;
  
        start = dip->logical_offset;
-       do {
+       bio_for_each_segment_all(bvec, bio, i) {
                if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
                        struct page *page = bvec->bv_page;
                        char *kaddr;
                        local_irq_restore(flags);
  
                        flush_dcache_page(bvec->bv_page);
-                       if (csum != csums[index]) {
+                       if (csum != csums[i]) {
                                btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
                                          btrfs_ino(inode), start, csum,
-                                         csums[index]);
+                                         csums[i]);
                                err = -EIO;
                        }
                }
  
                start += bvec->bv_len;
-               bvec++;
-               index++;
-       } while (bvec <= bvec_end);
+       }
  
        unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
                      dip->logical_offset + dip->bytes - 1);
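
  The btrfs_endio_direct_read() rewrite above uses bio_for_each_segment_all(), which visits every segment of a completed bio and hands back a bio_vec pointer plus an index, replacing the manual bvec/bvec_end walk. Sketch of the completion-side idiom:

	struct bio_vec *bvec;
	int i;

	bio_for_each_segment_all(bvec, bio, i) {
		struct page *page = bvec->bv_page;
		/* per-page completion work, e.g. checksum verification */
	}
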
@@@ -6901,7 -6894,8 +6898,8 @@@ static void btrfs_end_dio_bio(struct bi
                printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu "
                      "sector %#Lx len %u err no %d\n",
                      btrfs_ino(dip->inode), bio->bi_rw,
-                     (unsigned long long)bio->bi_sector, bio->bi_size, err);
+                     (unsigned long long)bio->bi_iter.bi_sector,
+                     bio->bi_iter.bi_size, err);
                dip->errors = 1;
  
                /*
@@@ -6992,7 -6986,7 +6990,7 @@@ static int btrfs_submit_direct_hook(in
        struct bio *bio;
        struct bio *orig_bio = dip->orig_bio;
        struct bio_vec *bvec = orig_bio->bi_io_vec;
-       u64 start_sector = orig_bio->bi_sector;
+       u64 start_sector = orig_bio->bi_iter.bi_sector;
        u64 file_offset = dip->logical_offset;
        u64 submit_len = 0;
        u64 map_length;
        int ret = 0;
        int async_submit = 0;
  
-       map_length = orig_bio->bi_size;
+       map_length = orig_bio->bi_iter.bi_size;
        ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
                              &map_length, NULL, 0);
        if (ret) {
                return -EIO;
        }
  
-       if (map_length >= orig_bio->bi_size) {
+       if (map_length >= orig_bio->bi_iter.bi_size) {
                bio = orig_bio;
                goto submit;
        }
                        bio->bi_private = dip;
                        bio->bi_end_io = btrfs_end_dio_bio;
  
-                       map_length = orig_bio->bi_size;
+                       map_length = orig_bio->bi_iter.bi_size;
                        ret = btrfs_map_block(root->fs_info, rw,
                                              start_sector << 9,
                                              &map_length, NULL, 0);
@@@ -7118,7 -7112,8 +7116,8 @@@ static void btrfs_submit_direct(int rw
  
        if (!skip_sum && !write) {
                csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
-               sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits;
+               sum_len = dio_bio->bi_iter.bi_size >>
+                       inode->i_sb->s_blocksize_bits;
                sum_len *= csum_size;
        } else {
                sum_len = 0;
        dip->private = dio_bio->bi_private;
        dip->inode = inode;
        dip->logical_offset = file_offset;
-       dip->bytes = dio_bio->bi_size;
-       dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
+       dip->bytes = dio_bio->bi_iter.bi_size;
+       dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
        io_bio->bi_private = dip;
        dip->errors = 0;
        dip->orig_bio = io_bio;
@@@ -8653,14 -8648,12 +8652,14 @@@ static const struct inode_operations bt
        .removexattr    = btrfs_removexattr,
        .permission     = btrfs_permission,
        .get_acl        = btrfs_get_acl,
 +      .set_acl        = btrfs_set_acl,
        .update_time    = btrfs_update_time,
  };
  static const struct inode_operations btrfs_dir_ro_inode_operations = {
        .lookup         = btrfs_lookup,
        .permission     = btrfs_permission,
        .get_acl        = btrfs_get_acl,
 +      .set_acl        = btrfs_set_acl,
        .update_time    = btrfs_update_time,
  };
  
@@@ -8730,7 -8723,6 +8729,7 @@@ static const struct inode_operations bt
        .permission     = btrfs_permission,
        .fiemap         = btrfs_fiemap,
        .get_acl        = btrfs_get_acl,
 +      .set_acl        = btrfs_set_acl,
        .update_time    = btrfs_update_time,
  };
  static const struct inode_operations btrfs_special_inode_operations = {
        .listxattr      = btrfs_listxattr,
        .removexattr    = btrfs_removexattr,
        .get_acl        = btrfs_get_acl,
 +      .set_acl        = btrfs_set_acl,
        .update_time    = btrfs_update_time,
  };
  static const struct inode_operations btrfs_symlink_inode_operations = {
        .getxattr       = btrfs_getxattr,
        .listxattr      = btrfs_listxattr,
        .removexattr    = btrfs_removexattr,
 -      .get_acl        = btrfs_get_acl,
        .update_time    = btrfs_update_time,
  };
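
  The scattered .set_acl additions and the posix_acl_chmod() call in btrfs_setattr() above belong to the switch to the generic POSIX ACL helpers: the filesystem exposes .get_acl/.set_acl and lets the VFS helper rewrite the ACL on chmod. Sketch of the wiring (the structure name is illustrative):

	static const struct inode_operations sketch_dir_iops = {
		.get_acl	= btrfs_get_acl,
		.set_acl	= btrfs_set_acl,
		/* ... */
	};

	/* in ->setattr, after the mode change: */
	if (!err && attr->ia_valid & ATTR_MODE)
		err = posix_acl_chmod(inode, inode->i_mode);
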
  
diff --combined fs/f2fs/data.c
index 0ae558723506e1a8a96f5653444dc11f5a8feb27,a2c8de8ba6ce6d45e9450de5ab8b6c60fb01adea..2261ccdd0b5f04a37be390f1b28c8703fafa86b4
  #include "segment.h"
  #include <trace/events/f2fs.h>
  
-       const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-       struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 +static void f2fs_read_end_io(struct bio *bio, int err)
 +{
-       do {
++      struct bio_vec *bvec;
++      int i;
 +
-               if (--bvec >= bio->bi_io_vec)
-                       prefetchw(&bvec->bv_page->flags);
-               if (unlikely(!uptodate)) {
++      bio_for_each_segment_all(bvec, bio, i) {
 +              struct page *page = bvec->bv_page;
 +
-               } else {
-                       SetPageUptodate(page);
++              if (!err) {
++                      SetPageUptodate(page);
++              } else {
 +                      ClearPageUptodate(page);
 +                      SetPageError(page);
-       } while (bvec >= bio->bi_io_vec);
 +              }
 +              unlock_page(page);
-       const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-       struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-       struct f2fs_sb_info *sbi = F2FS_SB(bvec->bv_page->mapping->host->i_sb);
++      }
 +      bio_put(bio);
 +}
 +
 +static void f2fs_write_end_io(struct bio *bio, int err)
 +{
-       do {
++      struct f2fs_sb_info *sbi = F2FS_SB(bio->bi_io_vec->bv_page->mapping->host->i_sb);
++      struct bio_vec *bvec;
++      int i;
 +
-               if (--bvec >= bio->bi_io_vec)
-                       prefetchw(&bvec->bv_page->flags);
-               if (unlikely(!uptodate)) {
++      bio_for_each_segment_all(bvec, bio, i) {
 +              struct page *page = bvec->bv_page;
 +
-       } while (bvec >= bio->bi_io_vec);
++              if (unlikely(err)) {
 +                      SetPageError(page);
 +                      set_bit(AS_EIO, &page->mapping->flags);
 +                      set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
 +                      sbi->sb->s_flags |= MS_RDONLY;
 +              }
 +              end_page_writeback(page);
 +              dec_page_count(sbi, F2FS_WRITEBACK);
-       bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
++      }
 +
 +      if (bio->bi_private)
 +              complete(bio->bi_private);
 +
 +      if (!get_pages(sbi, F2FS_WRITEBACK) &&
 +                      !list_empty(&sbi->cp_wait.task_list))
 +              wake_up(&sbi->cp_wait);
 +
 +      bio_put(bio);
 +}
 +
 +/*
 + * Low-level block read/write IO operations.
 + */
 +static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
 +                              int npages, bool is_read)
 +{
 +      struct bio *bio;
 +
 +      /* No failure on bio allocation */
 +      bio = bio_alloc(GFP_NOIO, npages);
 +
 +      bio->bi_bdev = sbi->sb->s_bdev;
++      bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
 +      bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
 +
 +      return bio;
 +}
 +
 +static void __submit_merged_bio(struct f2fs_bio_info *io)
 +{
 +      struct f2fs_io_info *fio = &io->fio;
 +      int rw;
 +
 +      if (!io->bio)
 +              return;
 +
 +      rw = fio->rw;
 +
 +      if (is_read_io(rw)) {
 +              trace_f2fs_submit_read_bio(io->sbi->sb, rw,
 +                                              fio->type, io->bio);
 +              submit_bio(rw, io->bio);
 +      } else {
 +              trace_f2fs_submit_write_bio(io->sbi->sb, rw,
 +                                              fio->type, io->bio);
 +              /*
 +               * META_FLUSH is only from the checkpoint procedure, and we
 +               * should wait this metadata bio for FS consistency.
 +               */
 +              if (fio->type == META_FLUSH) {
 +                      DECLARE_COMPLETION_ONSTACK(wait);
 +                      io->bio->bi_private = &wait;
 +                      submit_bio(rw, io->bio);
 +                      wait_for_completion(&wait);
 +              } else {
 +                      submit_bio(rw, io->bio);
 +              }
 +      }
 +
 +      io->bio = NULL;
 +}
 +
 +void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
 +                              enum page_type type, int rw)
 +{
 +      enum page_type btype = PAGE_TYPE_OF_BIO(type);
 +      struct f2fs_bio_info *io;
 +
 +      io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype];
 +
 +      mutex_lock(&io->io_mutex);
 +
 +      /* change META to META_FLUSH in the checkpoint procedure */
 +      if (type >= META_FLUSH) {
 +              io->fio.type = META_FLUSH;
 +              io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
 +      }
 +      __submit_merged_bio(io);
 +      mutex_unlock(&io->io_mutex);
 +}
 +
 +/*
 + * Fill the locked page with data located in the block address.
 + * Return unlocked page.
 + */
 +int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
 +                                      block_t blk_addr, int rw)
 +{
 +      struct bio *bio;
 +
 +      trace_f2fs_submit_page_bio(page, blk_addr, rw);
 +
 +      /* Allocate a new bio */
 +      bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw));
 +
 +      if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
 +              bio_put(bio);
 +              f2fs_put_page(page, 1);
 +              return -EFAULT;
 +      }
 +
 +      submit_bio(rw, bio);
 +      return 0;
 +}
 +
 +void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
 +                      block_t blk_addr, struct f2fs_io_info *fio)
 +{
 +      enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
 +      struct f2fs_bio_info *io;
 +      bool is_read = is_read_io(fio->rw);
 +
 +      io = is_read ? &sbi->read_io : &sbi->write_io[btype];
 +
 +      verify_block_addr(sbi, blk_addr);
 +
 +      mutex_lock(&io->io_mutex);
 +
 +      if (!is_read)
 +              inc_page_count(sbi, F2FS_WRITEBACK);
 +
 +      if (io->bio && (io->last_block_in_bio != blk_addr - 1 ||
 +                                              io->fio.rw != fio->rw))
 +              __submit_merged_bio(io);
 +alloc_new:
 +      if (io->bio == NULL) {
 +              int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
 +
 +              io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
 +              io->fio = *fio;
 +      }
 +
 +      if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) <
 +                                                      PAGE_CACHE_SIZE) {
 +              __submit_merged_bio(io);
 +              goto alloc_new;
 +      }
 +
 +      io->last_block_in_bio = blk_addr;
 +
 +      mutex_unlock(&io->io_mutex);
 +      trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr);
 +}
 +
  /*
   * Lock ordering for the change of data block address:
   * ->data_page
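
  The new f2fs submission helpers above centralize bio construction; note that the starting sector is now written to bio->bi_iter.bi_sector. A hedged sketch of the allocation pattern (bdev, sector, rw and the end_io callback are placeholders, not patch code):

	struct bio *bio = bio_alloc(GFP_NOIO, npages);

	bio->bi_bdev = bdev;
	bio->bi_iter.bi_sector = sector;	/* was bio->bi_sector before 3.14 */
	bio->bi_end_io = my_end_io;		/* hypothetical completion callback */
	submit_bio(rw, bio);
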
@@@ -226,7 -37,7 +219,7 @@@ static void __set_data_blkaddr(struct d
        struct page *node_page = dn->node_page;
        unsigned int ofs_in_node = dn->ofs_in_node;
  
 -      f2fs_wait_on_page_writeback(node_page, NODE, false);
 +      f2fs_wait_on_page_writeback(node_page, NODE);
  
        rn = F2FS_NODE(node_page);
  
@@@ -240,39 -51,19 +233,39 @@@ int reserve_new_block(struct dnode_of_d
  {
        struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
  
 -      if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))
 +      if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
                return -EPERM;
 -      if (!inc_valid_block_count(sbi, dn->inode, 1))
 +      if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
                return -ENOSPC;
  
        trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
  
        __set_data_blkaddr(dn, NEW_ADDR);
        dn->data_blkaddr = NEW_ADDR;
 +      mark_inode_dirty(dn->inode);
        sync_inode_page(dn);
        return 0;
  }
  
 +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
 +{
 +      bool need_put = dn->inode_page ? false : true;
 +      int err;
 +
 +      /* if inode_page exists, index should be zero */
 +      f2fs_bug_on(!need_put && index);
 +
 +      err = get_dnode_of_data(dn, index, ALLOC_NODE);
 +      if (err)
 +              return err;
 +
 +      if (dn->data_blkaddr == NULL_ADDR)
 +              err = reserve_new_block(dn);
 +      if (err || need_put)
 +              f2fs_put_dnode(dn);
 +      return err;
 +}
 +
  static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
                                        struct buffer_head *bh_result)
  {
        pgoff_t start_fofs, end_fofs;
        block_t start_blkaddr;
  
 +      if (is_inode_flag_set(fi, FI_NO_EXTENT))
 +              return 0;
 +
        read_lock(&fi->ext.ext_lock);
        if (fi->ext.len == 0) {
                read_unlock(&fi->ext.ext_lock);
@@@ -321,7 -109,6 +314,7 @@@ void update_extent_cache(block_t blk_ad
        struct f2fs_inode_info *fi = F2FS_I(dn->inode);
        pgoff_t fofs, start_fofs, end_fofs;
        block_t start_blkaddr, end_blkaddr;
 +      int need_update = true;
  
        f2fs_bug_on(blk_addr == NEW_ADDR);
        fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
        /* Update the page address in the parent node */
        __set_data_blkaddr(dn, blk_addr);
  
 +      if (is_inode_flag_set(fi, FI_NO_EXTENT))
 +              return;
 +
        write_lock(&fi->ext.ext_lock);
  
        start_fofs = fi->ext.fofs;
                                        fofs - start_fofs + 1;
                        fi->ext.len -= fofs - start_fofs + 1;
                }
 -              goto end_update;
 +      } else {
 +              need_update = false;
        }
 -      write_unlock(&fi->ext.ext_lock);
 -      return;
  
 +      /* Finally, if the extent is very fragmented, let's drop the cache. */
 +      if (fi->ext.len < F2FS_MIN_EXTENT_LEN) {
 +              fi->ext.len = 0;
 +              set_inode_flag(fi, FI_NO_EXTENT);
 +              need_update = true;
 +      }
  end_update:
        write_unlock(&fi->ext.ext_lock);
 -      sync_inode_page(dn);
 +      if (need_update)
 +              sync_inode_page(dn);
 +      return;
  }
  
  struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
                return ERR_PTR(-ENOENT);
  
        /* By fallocate(), there is no cached page, but with NEW_ADDR */
 -      if (dn.data_blkaddr == NEW_ADDR)
 +      if (unlikely(dn.data_blkaddr == NEW_ADDR))
                return ERR_PTR(-EINVAL);
  
        page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
                return page;
        }
  
 -      err = f2fs_readpage(sbi, page, dn.data_blkaddr,
 +      err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
                                        sync ? READ_SYNC : READA);
 +      if (err)
 +              return ERR_PTR(err);
 +
        if (sync) {
                wait_on_page_locked(page);
 -              if (!PageUptodate(page)) {
 +              if (unlikely(!PageUptodate(page))) {
                        f2fs_put_page(page, 0);
                        return ERR_PTR(-EIO);
                }
@@@ -472,7 -246,7 +465,7 @@@ repeat
        }
        f2fs_put_dnode(&dn);
  
 -      if (dn.data_blkaddr == NULL_ADDR) {
 +      if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
                f2fs_put_page(page, 1);
                return ERR_PTR(-ENOENT);
        }
                return page;
        }
  
 -      err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
 +      err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC);
        if (err)
                return ERR_PTR(err);
  
        lock_page(page);
 -      if (!PageUptodate(page)) {
 +      if (unlikely(!PageUptodate(page))) {
                f2fs_put_page(page, 1);
                return ERR_PTR(-EIO);
        }
 -      if (page->mapping != mapping) {
 +      if (unlikely(page->mapping != mapping)) {
                f2fs_put_page(page, 1);
                goto repeat;
        }
   * Caller ensures that this data page is never allocated.
   * A new zero-filled data page is allocated in the page cache.
   *
 - * Also, caller should grab and release a mutex by calling mutex_lock_op() and
 - * mutex_unlock_op().
 - * Note that, npage is set only by make_empty_dir.
 + * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and
 + * f2fs_unlock_op().
 + * Note that, ipage is set only by make_empty_dir.
   */
  struct page *get_new_data_page(struct inode *inode,
 -              struct page *npage, pgoff_t index, bool new_i_size)
 +              struct page *ipage, pgoff_t index, bool new_i_size)
  {
        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
        struct address_space *mapping = inode->i_mapping;
        struct dnode_of_data dn;
        int err;
  
 -      set_new_dnode(&dn, inode, npage, npage, 0);
 -      err = get_dnode_of_data(&dn, index, ALLOC_NODE);
 +      set_new_dnode(&dn, inode, ipage, NULL, 0);
 +      err = f2fs_reserve_block(&dn, index);
        if (err)
                return ERR_PTR(err);
 -
 -      if (dn.data_blkaddr == NULL_ADDR) {
 -              if (reserve_new_block(&dn)) {
 -                      if (!npage)
 -                              f2fs_put_dnode(&dn);
 -                      return ERR_PTR(-ENOSPC);
 -              }
 -      }
 -      if (!npage)
 -              f2fs_put_dnode(&dn);
  repeat:
        page = grab_cache_page(mapping, index);
 -      if (!page)
 -              return ERR_PTR(-ENOMEM);
 +      if (!page) {
 +              err = -ENOMEM;
 +              goto put_err;
 +      }
  
        if (PageUptodate(page))
                return page;
                zero_user_segment(page, 0, PAGE_CACHE_SIZE);
                SetPageUptodate(page);
        } else {
 -              err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
 +              err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
 +                                                              READ_SYNC);
                if (err)
 -                      return ERR_PTR(err);
 +                      goto put_err;
 +
                lock_page(page);
 -              if (!PageUptodate(page)) {
 +              if (unlikely(!PageUptodate(page))) {
                        f2fs_put_page(page, 1);
 -                      return ERR_PTR(-EIO);
 +                      err = -EIO;
 +                      goto put_err;
                }
 -              if (page->mapping != mapping) {
 +              if (unlikely(page->mapping != mapping)) {
                        f2fs_put_page(page, 1);
                        goto repeat;
                }
                i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
                /* Only the directory inode sets new_i_size */
                set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
 -              mark_inode_dirty_sync(inode);
        }
        return page;
 -}
 -
 -static void read_end_io(struct bio *bio, int err)
 -{
 -      struct bio_vec *bvec;
 -      int i;
 -
 -      bio_for_each_segment_all(bvec, bio, i) {
 -              struct page *page = bvec->bv_page;
  
 -              if (!err) {
 -                      SetPageUptodate(page);
 -              } else {
 -                      ClearPageUptodate(page);
 -                      SetPageError(page);
 -              }
 -              unlock_page(page);
 -      }
 -      bio_put(bio);
 +put_err:
 +      f2fs_put_dnode(&dn);
 +      return ERR_PTR(err);
  }
  
 -/*
 - * Fill the locked page with data located in the block address.
 - * Return unlocked page.
 - */
 -int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
 -                                      block_t blk_addr, int type)
 +static int __allocate_data_block(struct dnode_of_data *dn)
  {
 -      struct block_device *bdev = sbi->sb->s_bdev;
 -      struct bio *bio;
 +      struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
 +      struct f2fs_summary sum;
 +      block_t new_blkaddr;
 +      struct node_info ni;
 +      int type;
  
 -      trace_f2fs_readpage(page, blk_addr, type);
 +      if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
 +              return -EPERM;
 +      if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
 +              return -ENOSPC;
  
 -      down_read(&sbi->bio_sem);
 +      __set_data_blkaddr(dn, NEW_ADDR);
 +      dn->data_blkaddr = NEW_ADDR;
  
 -      /* Allocate a new bio */
 -      bio = f2fs_bio_alloc(bdev, 1);
 +      get_node_info(sbi, dn->nid, &ni);
 +      set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
  
 -      /* Initialize the bio */
 -      bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
 -      bio->bi_end_io = read_end_io;
 +      type = CURSEG_WARM_DATA;
  
 -      if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
 -              bio_put(bio);
 -              up_read(&sbi->bio_sem);
 -              f2fs_put_page(page, 1);
 -              return -EFAULT;
 -      }
 +      allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type);
  
 -      submit_bio(type, bio);
 -      up_read(&sbi->bio_sem);
 +      /* direct IO doesn't use extent cache to maximize the performance */
 +      set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
 +      update_extent_cache(new_blkaddr, dn);
 +      clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
 +
 +      dn->data_blkaddr = new_blkaddr;
        return 0;
  }
  
  /*
 - * This function should be used by the data read flow only where it
 - * does not check the "create" flag that indicates block allocation.
 - * The reason for this special functionality is to exploit VFS readahead
 - * mechanism.
 + * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh.
 + * If original data blocks are allocated, then give them to blockdev.
 + * Otherwise,
 + *     a. preallocate requested block addresses
 + *     b. do not use extent cache for better performance
 + *     c. give the block addresses to blockdev
   */
 -static int get_data_block_ro(struct inode *inode, sector_t iblock,
 +static int get_data_block(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create)
  {
 +      struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
        unsigned int blkbits = inode->i_sb->s_blocksize_bits;
        unsigned maxblocks = bh_result->b_size >> blkbits;
        struct dnode_of_data dn;
 -      pgoff_t pgofs;
 -      int err;
 +      int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
 +      pgoff_t pgofs, end_offset;
 +      int err = 0, ofs = 1;
 +      bool allocated = false;
  
        /* Get the page offset from the block offset(iblock) */
        pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
  
 -      if (check_extent_cache(inode, pgofs, bh_result)) {
 -              trace_f2fs_get_data_block(inode, iblock, bh_result, 0);
 -              return 0;
 -      }
 +      if (check_extent_cache(inode, pgofs, bh_result))
 +              goto out;
 +
 +      if (create)
 +              f2fs_lock_op(sbi);
  
        /* When reading holes, we need its node page */
        set_new_dnode(&dn, inode, NULL, NULL, 0);
 -      err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
 +      err = get_dnode_of_data(&dn, pgofs, mode);
        if (err) {
 -              trace_f2fs_get_data_block(inode, iblock, bh_result, err);
 -              return (err == -ENOENT) ? 0 : err;
 +              if (err == -ENOENT)
 +                      err = 0;
 +              goto unlock_out;
        }
 +      if (dn.data_blkaddr == NEW_ADDR)
 +              goto put_out;
  
 -      /* It does not support data allocation */
 -      f2fs_bug_on(create);
 +      if (dn.data_blkaddr != NULL_ADDR) {
 +              map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
 +      } else if (create) {
 +              err = __allocate_data_block(&dn);
 +              if (err)
 +                      goto put_out;
 +              allocated = true;
 +              map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
 +      } else {
 +              goto put_out;
 +      }
  
 -      if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) {
 -              int i;
 -              unsigned int end_offset;
 +      end_offset = IS_INODE(dn.node_page) ?
 +                      ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
 +      bh_result->b_size = (((size_t)1) << blkbits);
 +      dn.ofs_in_node++;
 +      pgofs++;
 +
 +get_next:
 +      if (dn.ofs_in_node >= end_offset) {
 +              if (allocated)
 +                      sync_inode_page(&dn);
 +              allocated = false;
 +              f2fs_put_dnode(&dn);
  
 -              end_offset = IS_INODE(dn.node_page) ?
 -                              ADDRS_PER_INODE(F2FS_I(inode)) :
 -                              ADDRS_PER_BLOCK;
 +              set_new_dnode(&dn, inode, NULL, NULL, 0);
 +              err = get_dnode_of_data(&dn, pgofs, mode);
 +              if (err) {
 +                      if (err == -ENOENT)
 +                              err = 0;
 +                      goto unlock_out;
 +              }
 +              if (dn.data_blkaddr == NEW_ADDR)
 +                      goto put_out;
  
 -              clear_buffer_new(bh_result);
 +              end_offset = IS_INODE(dn.node_page) ?
 +                      ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
 +      }
  
 +      if (maxblocks > (bh_result->b_size >> blkbits)) {
 +              block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
 +              if (blkaddr == NULL_ADDR && create) {
 +                      err = __allocate_data_block(&dn);
 +                      if (err)
 +                              goto sync_out;
 +                      allocated = true;
 +                      blkaddr = dn.data_blkaddr;
 +              }
                /* Give more consecutive addresses for the read ahead */
 -              for (i = 0; i < end_offset - dn.ofs_in_node; i++)
 -                      if (((datablock_addr(dn.node_page,
 -                                                      dn.ofs_in_node + i))
 -                              != (dn.data_blkaddr + i)) || maxblocks == i)
 -                              break;
 -              map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
 -              bh_result->b_size = (i << blkbits);
 +              if (blkaddr == (bh_result->b_blocknr + ofs)) {
 +                      ofs++;
 +                      dn.ofs_in_node++;
 +                      pgofs++;
 +                      bh_result->b_size += (((size_t)1) << blkbits);
 +                      goto get_next;
 +              }
        }
 +sync_out:
 +      if (allocated)
 +              sync_inode_page(&dn);
 +put_out:
        f2fs_put_dnode(&dn);
 -      trace_f2fs_get_data_block(inode, iblock, bh_result, 0);
 -      return 0;
 +unlock_out:
 +      if (create)
 +              f2fs_unlock_op(sbi);
 +out:
 +      trace_f2fs_get_data_block(inode, iblock, bh_result, err);
 +      return err;
  }
  
  static int f2fs_read_data_page(struct file *file, struct page *page)
  {
 -      return mpage_readpage(page, get_data_block_ro);
 +      struct inode *inode = page->mapping->host;
 +      int ret;
 +
 +      /* If the file has inline data, try to read it directly */
 +      if (f2fs_has_inline_data(inode))
 +              ret = f2fs_read_inline_data(inode, page);
 +      else
 +              ret = mpage_readpage(page, get_data_block);
 +
 +      return ret;
  }
  
  static int f2fs_read_data_pages(struct file *file,
                        struct address_space *mapping,
                        struct list_head *pages, unsigned nr_pages)
  {
 -      return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro);
 +      struct inode *inode = file->f_mapping->host;
 +
 +      /* If the file has inline data, skip readpages */
 +      if (f2fs_has_inline_data(inode))
 +              return 0;
 +
 +      return mpage_readpages(mapping, pages, nr_pages, get_data_block);
  }
  
 -int do_write_data_page(struct page *page)
 +int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
  {
        struct inode *inode = page->mapping->host;
 -      block_t old_blk_addr, new_blk_addr;
 +      block_t old_blkaddr, new_blkaddr;
        struct dnode_of_data dn;
        int err = 0;
  
        if (err)
                return err;
  
 -      old_blk_addr = dn.data_blkaddr;
 +      old_blkaddr = dn.data_blkaddr;
  
        /* This page is already truncated */
 -      if (old_blk_addr == NULL_ADDR)
 +      if (old_blkaddr == NULL_ADDR)
                goto out_writepage;
  
        set_page_writeback(page);
         * If current allocation needs SSR,
         * it had better in-place writes for updated data.
         */
 -      if (unlikely(old_blk_addr != NEW_ADDR &&
 +      if (unlikely(old_blkaddr != NEW_ADDR &&
                        !is_cold_data(page) &&
                        need_inplace_update(inode))) {
 -              rewrite_data_page(F2FS_SB(inode->i_sb), page,
 -                                              old_blk_addr);
 +              rewrite_data_page(page, old_blkaddr, fio);
        } else {
 -              write_data_page(inode, page, &dn,
 -                              old_blk_addr, &new_blk_addr);
 -              update_extent_cache(new_blk_addr, &dn);
 +              write_data_page(page, &dn, &new_blkaddr, fio);
 +              update_extent_cache(new_blkaddr, &dn);
        }
  out_writepage:
        f2fs_put_dnode(&dn);
@@@ -787,13 -518,9 +780,13 @@@ static int f2fs_write_data_page(struct 
        loff_t i_size = i_size_read(inode);
        const pgoff_t end_index = ((unsigned long long) i_size)
                                                        >> PAGE_CACHE_SHIFT;
 -      unsigned offset;
 +      unsigned offset = 0;
        bool need_balance_fs = false;
        int err = 0;
 +      struct f2fs_io_info fio = {
 +              .type = DATA,
 +              .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
 +      };
  
        if (page->index < end_index)
                goto write;
  
        zero_user_segment(page, offset, PAGE_CACHE_SIZE);
  write:
 -      if (sbi->por_doing) {
 +      if (unlikely(sbi->por_doing)) {
                err = AOP_WRITEPAGE_ACTIVATE;
                goto redirty_out;
        }
        if (S_ISDIR(inode->i_mode)) {
                dec_page_count(sbi, F2FS_DIRTY_DENTS);
                inode_dec_dirty_dents(inode);
 -              err = do_write_data_page(page);
 +              err = do_write_data_page(page, &fio);
        } else {
                f2fs_lock_op(sbi);
 -              err = do_write_data_page(page);
 +
 +              if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) {
 +                      err = f2fs_write_inline_data(inode, page, offset);
 +                      f2fs_unlock_op(sbi);
 +                      goto out;
 +              } else {
 +                      err = do_write_data_page(page, &fio);
 +              }
 +
                f2fs_unlock_op(sbi);
                need_balance_fs = true;
        }
        else if (err)
                goto redirty_out;
  
 -      if (wbc->for_reclaim)
 -              f2fs_submit_bio(sbi, DATA, true);
 +      if (wbc->for_reclaim) {
 +              f2fs_submit_merged_bio(sbi, DATA, WRITE);
 +              need_balance_fs = false;
 +      }
  
        clear_cold_data(page);
  out:
@@@ -897,8 -614,7 +890,8 @@@ static int f2fs_write_data_pages(struc
        ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
        if (locked)
                mutex_unlock(&sbi->writepages);
 -      f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL));
 +
 +      f2fs_submit_merged_bio(sbi, DATA, WRITE);
  
        remove_dirty_dir_inode(inode);
  
@@@ -919,28 -635,27 +912,28 @@@ static int f2fs_write_begin(struct fil
  
        f2fs_balance_fs(sbi);
  repeat:
 +      err = f2fs_convert_inline_data(inode, pos + len);
 +      if (err)
 +              return err;
 +
        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
        *pagep = page;
  
 -      f2fs_lock_op(sbi);
 +      if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA)
 +              goto inline_data;
  
 +      f2fs_lock_op(sbi);
        set_new_dnode(&dn, inode, NULL, NULL, 0);
 -      err = get_dnode_of_data(&dn, index, ALLOC_NODE);
 -      if (err)
 -              goto err;
 -
 -      if (dn.data_blkaddr == NULL_ADDR)
 -              err = reserve_new_block(&dn);
 -
 -      f2fs_put_dnode(&dn);
 -      if (err)
 -              goto err;
 -
 +      err = f2fs_reserve_block(&dn, index);
        f2fs_unlock_op(sbi);
  
 +      if (err) {
 +              f2fs_put_page(page, 1);
 +              return err;
 +      }
 +inline_data:
        if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
                return 0;
  
        if (dn.data_blkaddr == NEW_ADDR) {
                zero_user_segment(page, 0, PAGE_CACHE_SIZE);
        } else {
 -              err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
 +              if (f2fs_has_inline_data(inode))
 +                      err = f2fs_read_inline_data(inode, page);
 +              else
 +                      err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
 +                                                      READ_SYNC);
                if (err)
                        return err;
                lock_page(page);
 -              if (!PageUptodate(page)) {
 +              if (unlikely(!PageUptodate(page))) {
                        f2fs_put_page(page, 1);
                        return -EIO;
                }
 -              if (page->mapping != mapping) {
 +              if (unlikely(page->mapping != mapping)) {
                        f2fs_put_page(page, 1);
                        goto repeat;
                }
@@@ -977,6 -688,11 +970,6 @@@ out
        SetPageUptodate(page);
        clear_cold_data(page);
        return 0;
 -
 -err:
 -      f2fs_unlock_op(sbi);
 -      f2fs_put_page(page, 1);
 -      return err;
  }
  
  static int f2fs_write_end(struct file *file,
                update_inode_page(inode);
        }
  
 -      unlock_page(page);
 -      page_cache_release(page);
 +      f2fs_put_page(page, 1);
        return copied;
  }
  
 +static int check_direct_IO(struct inode *inode, int rw,
 +              const struct iovec *iov, loff_t offset, unsigned long nr_segs)
 +{
 +      unsigned blocksize_mask = inode->i_sb->s_blocksize - 1;
 +      int i;
 +
 +      if (rw == READ)
 +              return 0;
 +
 +      if (offset & blocksize_mask)
 +              return -EINVAL;
 +
 +      for (i = 0; i < nr_segs; i++)
 +              if (iov[i].iov_len & blocksize_mask)
 +                      return -EINVAL;
 +      return 0;
 +}
 +
  static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
                const struct iovec *iov, loff_t offset, unsigned long nr_segs)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
  
 -      if (rw == WRITE)
 +      /* Let buffer I/O handle the inline data case. */
 +      if (f2fs_has_inline_data(inode))
 +              return 0;
 +
 +      if (check_direct_IO(inode, rw, iov, offset, nr_segs))
                return 0;
  
 -      /* Needs synchronization with the cleaner */
        return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
 -                                                get_data_block_ro);
 +                                                      get_data_block);
  }
  
  static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
@@@ -1060,8 -756,6 +1053,8 @@@ static int f2fs_set_data_page_dirty(str
        trace_f2fs_set_page_dirty(page, DATA);
  
        SetPageUptodate(page);
 +      mark_inode_dirty(inode);
 +
        if (!PageDirty(page)) {
                __set_page_dirty_nobuffers(page);
                set_dirty_dir_page(inode, page);
  
  static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
  {
 -      return generic_block_bmap(mapping, block, get_data_block_ro);
 +      return generic_block_bmap(mapping, block, get_data_block);
  }
  
  const struct address_space_operations f2fs_dblock_aops = {
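
  get_data_block() above follows the usual get_block contract used by mpage_readpage(s) and blockdev_direct_IO(): map the buffer_head to a device block with map_bh() and report how many bytes were mapped through b_size. A minimal sketch, assuming a hypothetical block-lookup helper:

	#include <linux/buffer_head.h>

	static int sketch_get_block(struct inode *inode, sector_t iblock,
				    struct buffer_head *bh, int create)
	{
		sector_t blk = sketch_lookup_block(inode, iblock);	/* hypothetical */

		if (!blk)
			return 0;				/* hole: leave bh unmapped */

		map_bh(bh, inode->i_sb, blk);
		bh->b_size = inode->i_sb->s_blocksize;		/* one block mapped */
		return 0;
	}
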
diff --combined fs/gfs2/lops.c
index 58f06400b7b8dcece9597b51b05ebf9ebc092396,985da945f0b57cc1b77465cb12bf53b624806955..76693793ceddfe7f936c360a6c3494d1882a849a
@@@ -83,7 -83,6 +83,7 @@@ static void maybe_release_space(struct 
               bd->bd_bh->b_data + bi->bi_offset, bi->bi_len);
        clear_bit(GBF_FULL, &bi->bi_flags);
        rgd->rd_free_clone = rgd->rd_free;
 +      rgd->rd_extfail_pt = rgd->rd_free;
  }
  
  /**
@@@ -273,7 -272,7 +273,7 @@@ static struct bio *gfs2_log_alloc_bio(s
                nrvecs = max(nrvecs/2, 1U);
        }
  
-       bio->bi_sector = blkno * (sb->s_blocksize >> 9);
+       bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9);
        bio->bi_bdev = sb->s_bdev;
        bio->bi_end_io = gfs2_end_log_write;
        bio->bi_private = sdp;
@@@ -589,12 -588,8 +589,12 @@@ static int buf_lo_scan_elements(struct 
  static void gfs2_meta_sync(struct gfs2_glock *gl)
  {
        struct address_space *mapping = gfs2_glock2aspace(gl);
 +      struct gfs2_sbd *sdp = gl->gl_sbd;
        int error;
  
 +      if (mapping == NULL)
 +              mapping = &sdp->sd_aspace;
 +
        filemap_fdatawrite(mapping);
        error = filemap_fdatawait(mapping);
  
diff --combined fs/gfs2/ops_fstype.c
index 1e712b566d76a74435b4d2faa5417956815cec78,16194da91652becfca5019a296f097206358e1d2..c6872d09561a2d53c8e57374eb700f4fb578ae78
@@@ -36,7 -36,6 +36,7 @@@
  #include "log.h"
  #include "quota.h"
  #include "dir.h"
 +#include "meta_io.h"
  #include "trace_gfs2.h"
  
  #define DO 0
@@@ -63,7 -62,6 +63,7 @@@ static void gfs2_tune_init(struct gfs2_
  static struct gfs2_sbd *init_sbd(struct super_block *sb)
  {
        struct gfs2_sbd *sdp;
 +      struct address_space *mapping;
  
        sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL);
        if (!sdp)
        init_waitqueue_head(&sdp->sd_quota_wait);
        INIT_LIST_HEAD(&sdp->sd_trunc_list);
        spin_lock_init(&sdp->sd_trunc_lock);
 +      spin_lock_init(&sdp->sd_bitmap_lock);
 +
 +      mapping = &sdp->sd_aspace;
 +
 +      address_space_init_once(mapping);
 +      mapping->a_ops = &gfs2_meta_aops;
 +      mapping->host = sb->s_bdev->bd_inode;
 +      mapping->flags = 0;
 +      mapping_set_gfp_mask(mapping, GFP_NOFS);
 +      mapping->private_data = NULL;
 +      mapping->backing_dev_info = sb->s_bdi;
 +      mapping->writeback_index = 0;
  
        spin_lock_init(&sdp->sd_log_lock);
        atomic_set(&sdp->sd_log_pinned, 0);
@@@ -231,14 -217,14 +231,14 @@@ static int gfs2_read_super(struct gfs2_
  
        page = alloc_page(GFP_NOFS);
        if (unlikely(!page))
 -              return -ENOBUFS;
 +              return -ENOMEM;
  
        ClearPageUptodate(page);
        ClearPageDirty(page);
        lock_page(page);
  
        bio = bio_alloc(GFP_NOFS, 1);
-       bio->bi_sector = sector * (sb->s_blocksize >> 9);
+       bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9);
        bio->bi_bdev = sb->s_bdev;
        bio_add_page(bio, page, PAGE_SIZE, 0);
  
@@@ -970,6 -956,40 +970,6 @@@ fail
        return error;
  }
  
 -static int init_threads(struct gfs2_sbd *sdp, int undo)
 -{
 -      struct task_struct *p;
 -      int error = 0;
 -
 -      if (undo)
 -              goto fail_quotad;
 -
 -      p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
 -      if (IS_ERR(p)) {
 -              error = PTR_ERR(p);
 -              fs_err(sdp, "can't start logd thread: %d\n", error);
 -              return error;
 -      }
 -      sdp->sd_logd_process = p;
 -
 -      p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
 -      if (IS_ERR(p)) {
 -              error = PTR_ERR(p);
 -              fs_err(sdp, "can't start quotad thread: %d\n", error);
 -              goto fail;
 -      }
 -      sdp->sd_quotad_process = p;
 -
 -      return 0;
 -
 -
 -fail_quotad:
 -      kthread_stop(sdp->sd_quotad_process);
 -fail:
 -      kthread_stop(sdp->sd_logd_process);
 -      return error;
 -}
 -
  static const match_table_t nolock_tokens = {
        { Opt_jid, "jid=%d\n", },
        { Opt_err, NULL },
@@@ -1234,11 -1254,15 +1234,11 @@@ static int fill_super(struct super_bloc
                goto fail_per_node;
        }
  
 -      error = init_threads(sdp, DO);
 -      if (error)
 -              goto fail_per_node;
 -
        if (!(sb->s_flags & MS_RDONLY)) {
                error = gfs2_make_fs_rw(sdp);
                if (error) {
                        fs_err(sdp, "can't make FS RW: %d\n", error);
 -                      goto fail_threads;
 +                      goto fail_per_node;
                }
        }
  
        gfs2_online_uevent(sdp);
        return 0;
  
 -fail_threads:
 -      init_threads(sdp, UNDO);
  fail_per_node:
        init_per_node(sdp, UNDO);
  fail_inodes:
@@@ -1340,18 -1366,8 +1340,18 @@@ static struct dentry *gfs2_mount(struc
        if (IS_ERR(s))
                goto error_bdev;
  
 -      if (s->s_root)
 +      if (s->s_root) {
 +              /*
 +               * s_umount nests inside bd_mutex during
 +               * __invalidate_device().  blkdev_put() acquires
 +               * bd_mutex and can't be called under s_umount.  Drop
 +               * s_umount temporarily.  This is safe as we're
 +               * holding an active reference.
 +               */
 +              up_write(&s->s_umount);
                blkdev_put(bdev, mode);
 +              down_write(&s->s_umount);
 +      }
  
        memset(&args, 0, sizeof(args));
        args.ar_quota = GFS2_QUOTA_DEFAULT;
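
The comment added in gfs2_mount() above documents a lock-ordering constraint: bd_mutex nests outside s_umount, so s_umount is dropped around blkdev_put() while an active reference keeps the superblock alive. A small pthread analogue of that pattern (a userspace sketch only; outer/inner merely stand in for s_umount/bd_mutex, and s_umount is really an rwsem):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;  /* plays s_umount */
static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER;  /* plays bd_mutex */

static void release_device(void)
{
        pthread_mutex_lock(&inner);
        /* ... drop the device reference ... */
        pthread_mutex_unlock(&inner);
}

int main(void)
{
        pthread_mutex_lock(&outer);
        /* inner must never be taken while outer is held, so drop outer,
         * do the release, then retake outer; an elevated reference keeps
         * the shared object alive across the window. */
        pthread_mutex_unlock(&outer);
        release_device();
        pthread_mutex_lock(&outer);
        /* ... continue with outer-protected work ... */
        pthread_mutex_unlock(&outer);
        printf("done\n");
        return 0;
}
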
diff --combined fs/xfs/xfs_aops.c
index a26739451b535cf02a8016c423583f76a26bac72,1b19b9cd692ad8ff4d73259f7353b1e6a86477ad..db2cfb067d0b1ea88f8b64875ceb174d3ae582d2
@@@ -407,7 -407,7 +407,7 @@@ xfs_alloc_ioend_bio
        struct bio              *bio = bio_alloc(GFP_NOIO, nvecs);
  
        ASSERT(bio->bi_private == NULL);
-       bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+       bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
        bio->bi_bdev = bh->b_bdev;
        return bio;
  }
@@@ -1217,7 -1217,7 +1217,7 @@@ __xfs_get_blocks
                lockmode = XFS_ILOCK_EXCL;
                xfs_ilock(ip, lockmode);
        } else {
 -              lockmode = xfs_ilock_map_shared(ip);
 +              lockmode = xfs_ilock_data_map_shared(ip);
        }
  
        ASSERT(offset <= mp->m_super->s_maxbytes);
diff --combined fs/xfs/xfs_buf.c
index 51757113a822abc57334bbc25f0251671fdd3266,2a941ab623cb1b32498e9aadfcddda63084df1f8..9c061ef2b0d973c913a1baaee4a43bc27523b244
@@@ -445,8 -445,8 +445,8 @@@ _xfs_buf_find
        numbytes = BBTOB(numblks);
  
        /* Check for IOs smaller than the sector size / not sector aligned */
 -      ASSERT(!(numbytes < (1 << btp->bt_sshift)));
 -      ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));
 +      ASSERT(!(numbytes < btp->bt_meta_sectorsize));
 +      ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
  
        /*
         * Corrupted block numbers can get through to here, unfortunately, so we
@@@ -1240,7 -1240,7 +1240,7 @@@ next_chunk
  
        bio = bio_alloc(GFP_NOIO, nr_pages);
        bio->bi_bdev = bp->b_target->bt_bdev;
-       bio->bi_sector = sector;
+       bio->bi_iter.bi_sector = sector;
        bio->bi_end_io = xfs_buf_bio_end_io;
        bio->bi_private = bp;
  
                total_nr_pages--;
        }
  
-       if (likely(bio->bi_size)) {
+       if (likely(bio->bi_iter.bi_size)) {
                if (xfs_buf_is_vmapped(bp)) {
                        flush_kernel_vmap_range(bp->b_addr,
                                                xfs_buf_vmap_len(bp));
@@@ -1593,15 -1593,16 +1593,15 @@@ xfs_free_buftarg
        kmem_free(btp);
  }
  
 -STATIC int
 -xfs_setsize_buftarg_flags(
 +int
 +xfs_setsize_buftarg(
        xfs_buftarg_t           *btp,
        unsigned int            blocksize,
 -      unsigned int            sectorsize,
 -      int                     verbose)
 +      unsigned int            sectorsize)
  {
 -      btp->bt_bsize = blocksize;
 -      btp->bt_sshift = ffs(sectorsize) - 1;
 -      btp->bt_smask = sectorsize - 1;
 +      /* Set up metadata sector size info */
 +      btp->bt_meta_sectorsize = sectorsize;
 +      btp->bt_meta_sectormask = sectorsize - 1;
  
        if (set_blocksize(btp->bt_bdev, sectorsize)) {
                char name[BDEVNAME_SIZE];
                return EINVAL;
        }
  
 +      /* Set up device logical sector size mask */
 +      btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
 +      btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
 +
        return 0;
  }
  
  /*
 - *    When allocating the initial buffer target we have not yet
 - *    read in the superblock, so don't know what sized sectors
 - *    are being used at this early stage.  Play safe.
 + * When allocating the initial buffer target we have not yet
 + * read in the superblock, so don't know what sized sectors
 + * are being used at this early stage.  Play safe.
   */
  STATIC int
  xfs_setsize_buftarg_early(
        xfs_buftarg_t           *btp,
        struct block_device     *bdev)
  {
 -      return xfs_setsize_buftarg_flags(btp,
 -                      PAGE_SIZE, bdev_logical_block_size(bdev), 0);
 -}
 -
 -int
 -xfs_setsize_buftarg(
 -      xfs_buftarg_t           *btp,
 -      unsigned int            blocksize,
 -      unsigned int            sectorsize)
 -{
 -      return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
 +      return xfs_setsize_buftarg(btp, PAGE_SIZE,
 +                                 bdev_logical_block_size(bdev));
  }
  
  xfs_buftarg_t *
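
xfs_setsize_buftarg() above drops the old bt_sshift/bt_smask pair in favour of storing the sector size and its mask directly. For a power-of-two sector size the two encodings express the same alignment test; this standalone check (not from the patch) demonstrates the equivalence for 512-byte sectors:

#include <stdio.h>
#include <strings.h>    /* ffs() */

int main(void)
{
        unsigned int sectorsize = 512;
        unsigned int sshift = ffs(sectorsize) - 1;       /* old style: 9      */
        unsigned int smask  = sectorsize - 1;            /* new style: 0x1ff  */
        unsigned long long off = 4608;                   /* 9 sectors, aligned */

        printf("shift test: %d, mask test: %d\n",
               (off >> sshift << sshift) == off,
               (off & smask) == 0);
        return 0;
}
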
diff --combined include/linux/ceph/messenger.h
index 20ee8b63a96848ad1bc63fb29ce97c853502d700,091fdb600d55bbde721ef15d06cbb5494e322ebc..d21f2dba07314c48dce2414c4be23d2191180c81
@@@ -1,6 -1,7 +1,7 @@@
  #ifndef __FS_CEPH_MESSENGER_H
  #define __FS_CEPH_MESSENGER_H
  
+ #include <linux/blk_types.h>
  #include <linux/kref.h>
  #include <linux/mutex.h>
  #include <linux/net.h>
@@@ -60,8 -61,8 +61,8 @@@ struct ceph_messenger 
        u32 global_seq;
        spinlock_t global_seq_lock;
  
 -      u32 supported_features;
 -      u32 required_features;
 +      u64 supported_features;
 +      u64 required_features;
  };
  
  enum ceph_msg_data_type {
@@@ -119,8 -120,7 +120,7 @@@ struct ceph_msg_data_cursor 
  #ifdef CONFIG_BLOCK
                struct {                                /* bio */
                        struct bio      *bio;           /* bio from list */
-                       unsigned int    vector_index;   /* vector from bio */
-                       unsigned int    vector_offset;  /* bytes from vector */
+                       struct bvec_iter bvec_iter;
                };
  #endif /* CONFIG_BLOCK */
                struct {                                /* pages */
@@@ -154,9 -154,10 +154,9 @@@ struct ceph_msg 
        struct list_head list_head;     /* links for connection lists */
  
        struct kref kref;
 -      bool front_is_vmalloc;
        bool more_to_follow;
        bool needs_out_seq;
 -      int front_max;
 +      int front_alloc_len;
        unsigned long ack_stamp;        /* tx: when we were acked */
  
        struct ceph_msgpool *pool;
@@@ -191,7 -192,7 +191,7 @@@ struct ceph_connection 
  
        struct ceph_entity_name peer_name; /* peer name */
  
 -      unsigned peer_features;
 +      u64 peer_features;
        u32 connect_seq;      /* identify the most recent connection
                                 attempt for this connection, client */
        u32 peer_global_seq;  /* peer's global seq for this connection */
@@@ -255,8 -256,8 +255,8 @@@ extern void ceph_msgr_flush(void)
  
  extern void ceph_messenger_init(struct ceph_messenger *msgr,
                        struct ceph_entity_addr *myaddr,
 -                      u32 supported_features,
 -                      u32 required_features,
 +                      u64 supported_features,
 +                      u64 required_features,
                        bool nocrc);
  
  extern void ceph_con_init(struct ceph_connection *con, void *private,
diff --combined include/trace/events/f2fs.h
index 3b9f28dfc8492160940d28e58acf1dc9dc6e5081,bd3ee4fbe7a7fce24a0c8022ea59359dd79892b5..67f38faac589ad52ac5850e5af602799753b8d29
                { META,         "META" },                               \
                { META_FLUSH,   "META_FLUSH" })
  
 -#define show_bio_type(type)                                           \
 -      __print_symbolic(type,                                          \
 -              { READ,         "READ" },                               \
 -              { READA,        "READAHEAD" },                          \
 -              { READ_SYNC,    "READ_SYNC" },                          \
 -              { WRITE,        "WRITE" },                              \
 -              { WRITE_SYNC,   "WRITE_SYNC" },                         \
 -              { WRITE_FLUSH,  "WRITE_FLUSH" },                        \
 -              { WRITE_FUA,    "WRITE_FUA" })
 +#define F2FS_BIO_MASK(t)      (t & (READA | WRITE_FLUSH_FUA))
 +#define F2FS_BIO_EXTRA_MASK(t)        (t & (REQ_META | REQ_PRIO))
 +
 +#define show_bio_type(type)   show_bio_base(type), show_bio_extra(type)
 +
 +#define show_bio_base(type)                                           \
 +      __print_symbolic(F2FS_BIO_MASK(type),                           \
 +              { READ,                 "READ" },                       \
 +              { READA,                "READAHEAD" },                  \
 +              { READ_SYNC,            "READ_SYNC" },                  \
 +              { WRITE,                "WRITE" },                      \
 +              { WRITE_SYNC,           "WRITE_SYNC" },                 \
 +              { WRITE_FLUSH,          "WRITE_FLUSH" },                \
 +              { WRITE_FUA,            "WRITE_FUA" },                  \
 +              { WRITE_FLUSH_FUA,      "WRITE_FLUSH_FUA" })
 +
 +#define show_bio_extra(type)                                          \
 +      __print_symbolic(F2FS_BIO_EXTRA_MASK(type),                     \
 +              { REQ_META,             "(M)" },                        \
 +              { REQ_PRIO,             "(P)" },                        \
 +              { REQ_META | REQ_PRIO,  "(MP)" },                       \
 +              { 0, " \b" })
  
  #define show_data_type(type)                                          \
        __print_symbolic(type,                                          \
@@@ -434,7 -421,7 +434,7 @@@ TRACE_EVENT(f2fs_truncate_partial_nodes
                __entry->err)
  );
  
 -TRACE_EVENT_CONDITION(f2fs_readpage,
 +TRACE_EVENT_CONDITION(f2fs_submit_page_bio,
  
        TP_PROTO(struct page *page, sector_t blkaddr, int type),
  
        ),
  
        TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, "
 -              "blkaddr = 0x%llx, bio_type = %s",
 +              "blkaddr = 0x%llx, bio_type = %s%s",
                show_dev_ino(__entry),
                (unsigned long)__entry->index,
                (unsigned long long)__entry->blkaddr,
@@@ -611,54 -598,36 +611,54 @@@ TRACE_EVENT(f2fs_reserve_new_block
                __entry->ofs_in_node)
  );
  
 -TRACE_EVENT(f2fs_do_submit_bio,
 +DECLARE_EVENT_CLASS(f2fs__submit_bio,
  
 -      TP_PROTO(struct super_block *sb, int btype, bool sync, struct bio *bio),
 +      TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio),
  
 -      TP_ARGS(sb, btype, sync, bio),
 +      TP_ARGS(sb, rw, type, bio),
  
        TP_STRUCT__entry(
                __field(dev_t,  dev)
 -              __field(int,    btype)
 -              __field(bool,   sync)
 +              __field(int,    rw)
 +              __field(int,    type)
                __field(sector_t,       sector)
                __field(unsigned int,   size)
        ),
  
        TP_fast_assign(
                __entry->dev            = sb->s_dev;
 -              __entry->btype          = btype;
 -              __entry->sync           = sync;
 +              __entry->rw             = rw;
 +              __entry->type           = type;
-               __entry->sector         = bio->bi_sector;
-               __entry->size           = bio->bi_size;
+               __entry->sector         = bio->bi_iter.bi_sector;
+               __entry->size           = bio->bi_iter.bi_size;
        ),
  
 -      TP_printk("dev = (%d,%d), type = %s, io = %s, sector = %lld, size = %u",
 +      TP_printk("dev = (%d,%d), %s%s, %s, sector = %lld, size = %u",
                show_dev(__entry),
 -              show_block_type(__entry->btype),
 -              __entry->sync ? "sync" : "no sync",
 +              show_bio_type(__entry->rw),
 +              show_block_type(__entry->type),
                (unsigned long long)__entry->sector,
                __entry->size)
  );
  
 +DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_write_bio,
 +
 +      TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio),
 +
 +      TP_ARGS(sb, rw, type, bio),
 +
 +      TP_CONDITION(bio)
 +);
 +
 +DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_read_bio,
 +
 +      TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio),
 +
 +      TP_ARGS(sb, rw, type, bio),
 +
 +      TP_CONDITION(bio)
 +);
 +
  DECLARE_EVENT_CLASS(f2fs__page,
  
        TP_PROTO(struct page *page, int type),
@@@ -705,16 -674,15 +705,16 @@@ DEFINE_EVENT(f2fs__page, f2fs_vm_page_m
        TP_ARGS(page, type)
  );
  
 -TRACE_EVENT(f2fs_submit_write_page,
 +TRACE_EVENT(f2fs_submit_page_mbio,
  
 -      TP_PROTO(struct page *page, block_t blk_addr, int type),
 +      TP_PROTO(struct page *page, int rw, int type, block_t blk_addr),
  
 -      TP_ARGS(page, blk_addr, type),
 +      TP_ARGS(page, rw, type, blk_addr),
  
        TP_STRUCT__entry(
                __field(dev_t,  dev)
                __field(ino_t,  ino)
 +              __field(int, rw)
                __field(int, type)
                __field(pgoff_t, index)
                __field(block_t, block)
        TP_fast_assign(
                __entry->dev    = page->mapping->host->i_sb->s_dev;
                __entry->ino    = page->mapping->host->i_ino;
 +              __entry->rw     = rw;
                __entry->type   = type;
                __entry->index  = page->index;
                __entry->block  = blk_addr;
        ),
  
 -      TP_printk("dev = (%d,%d), ino = %lu, %s, index = %lu, blkaddr = 0x%llx",
 +      TP_printk("dev = (%d,%d), ino = %lu, %s%s, %s, index = %lu, blkaddr = 0x%llx",
                show_dev_ino(__entry),
 +              show_bio_type(__entry->rw),
                show_block_type(__entry->type),
                (unsigned long)__entry->index,
                (unsigned long long)__entry->block)
@@@ -761,29 -727,6 +761,29 @@@ TRACE_EVENT(f2fs_write_checkpoint
                __entry->msg)
  );
  
 +TRACE_EVENT(f2fs_issue_discard,
 +
 +      TP_PROTO(struct super_block *sb, block_t blkstart, block_t blklen),
 +
 +      TP_ARGS(sb, blkstart, blklen),
 +
 +      TP_STRUCT__entry(
 +              __field(dev_t,  dev)
 +              __field(block_t, blkstart)
 +              __field(block_t, blklen)
 +      ),
 +
 +      TP_fast_assign(
 +              __entry->dev    = sb->s_dev;
 +              __entry->blkstart = blkstart;
 +              __entry->blklen = blklen;
 +      ),
 +
 +      TP_printk("dev = (%d,%d), blkstart = 0x%llx, blklen = 0x%llx",
 +              show_dev(__entry),
 +              (unsigned long long)__entry->blkstart,
 +              (unsigned long long)__entry->blklen)
 +);
  #endif /* _TRACE_F2FS_H */
  
   /* This part must be outside protection */
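
The reworked show_bio_type() above splits the request flags into a base name plus an (M)/(P) suffix via two masks. A toy decoder along the same lines, producing output such as "WRITE_SYNC(MP)" (the flag values below are illustrative only; the real READ/WRITE/REQ_* bits are defined in include/linux/blk_types.h and fs.h):

#include <stdio.h>

#define T_WRITE  0x01   /* illustrative values, not the kernel's bits */
#define T_SYNC   0x02
#define T_META   0x10
#define T_PRIO   0x20

static const char *base_name(unsigned int t)
{
        if ((t & (T_WRITE | T_SYNC)) == (T_WRITE | T_SYNC))
                return "WRITE_SYNC";
        return (t & T_WRITE) ? "WRITE" : "READ";
}

static const char *extra_name(unsigned int t)
{
        switch (t & (T_META | T_PRIO)) {
        case T_META | T_PRIO:   return "(MP)";
        case T_META:            return "(M)";
        case T_PRIO:            return "(P)";
        default:                return "";
        }
}

int main(void)
{
        unsigned int rw = T_WRITE | T_SYNC | T_META | T_PRIO;

        printf("bio_type = %s%s\n", base_name(rw), extra_name(rw));
        return 0;
}
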
diff --combined mm/page_io.c
index 7247be6114ac894523d8273743a4f168ceab3afa,f14eded987fac8276e3e3da5ff11d260a8ba44cf..7c59ef681381bb7afeef2cf5207d269e9a95c1f8
@@@ -31,13 -31,13 +31,13 @@@ static struct bio *get_swap_bio(gfp_t g
  
        bio = bio_alloc(gfp_flags, 1);
        if (bio) {
-               bio->bi_sector = map_swap_page(page, &bio->bi_bdev);
-               bio->bi_sector <<= PAGE_SHIFT - 9;
+               bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
+               bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
                bio->bi_io_vec[0].bv_page = page;
                bio->bi_io_vec[0].bv_len = PAGE_SIZE;
                bio->bi_io_vec[0].bv_offset = 0;
                bio->bi_vcnt = 1;
-               bio->bi_size = PAGE_SIZE;
+               bio->bi_iter.bi_size = PAGE_SIZE;
                bio->bi_end_io = end_io;
        }
        return bio;
@@@ -62,7 -62,7 +62,7 @@@ void end_swap_bio_write(struct bio *bio
                printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
                                imajor(bio->bi_bdev->bd_inode),
                                iminor(bio->bi_bdev->bd_inode),
-                               (unsigned long long)bio->bi_sector);
+                               (unsigned long long)bio->bi_iter.bi_sector);
                ClearPageReclaim(page);
        }
        end_page_writeback(page);
@@@ -80,7 -80,7 +80,7 @@@ void end_swap_bio_read(struct bio *bio
                printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
                                imajor(bio->bi_bdev->bd_inode),
                                iminor(bio->bi_bdev->bd_inode),
-                               (unsigned long long)bio->bi_sector);
+                               (unsigned long long)bio->bi_iter.bi_sector);
                goto out;
        }
  
@@@ -320,8 -320,8 +320,8 @@@ int swap_readpage(struct page *page
        int ret = 0;
        struct swap_info_struct *sis = page_swap_info(page);
  
 -      VM_BUG_ON(!PageLocked(page));
 -      VM_BUG_ON(PageUptodate(page));
 +      VM_BUG_ON_PAGE(!PageLocked(page), page);
 +      VM_BUG_ON_PAGE(PageUptodate(page), page);
        if (frontswap_load(page) == 0) {
                SetPageUptodate(page);
                unlock_page(page);
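
get_swap_bio() above computes bi_iter.bi_sector by shifting the swap offset left by PAGE_SHIFT - 9, i.e. one page-sized slot spans 2^(PAGE_SHIFT-9) 512-byte sectors. A standalone sketch of that arithmetic (assuming 4096-byte pages, so PAGE_SHIFT == 12 and each slot covers 8 sectors):

#include <stdio.h>

int main(void)
{
        unsigned int page_shift = 12;            /* 4096-byte pages assumed */
        unsigned long long swap_offset = 42;     /* page-sized slot on the device */
        unsigned long long sector = swap_offset << (page_shift - 9);

        printf("swap slot %llu -> sector %llu\n", swap_offset, sector);
        return 0;
}
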
diff --combined net/ceph/messenger.c
index 2ed1304d22a7dfed5c8bc9f86d5f0f5cb1b91742,18c039b95c22459f8d669ee2598c411c1b615141..0e478a0f4204b72ed19ae49c349d632cda009e02
@@@ -15,7 -15,6 +15,7 @@@
  #include <linux/dns_resolver.h>
  #include <net/tcp.h>
  
 +#include <linux/ceph/ceph_features.h>
  #include <linux/ceph/libceph.h>
  #include <linux/ceph/messenger.h>
  #include <linux/ceph/decode.h>
@@@ -778,13 -777,12 +778,12 @@@ static void ceph_msg_data_bio_cursor_in
  
        bio = data->bio;
        BUG_ON(!bio);
-       BUG_ON(!bio->bi_vcnt);
  
        cursor->resid = min(length, data->bio_length);
        cursor->bio = bio;
-       cursor->vector_index = 0;
-       cursor->vector_offset = 0;
-       cursor->last_piece = length <= bio->bi_io_vec[0].bv_len;
+       cursor->bvec_iter = bio->bi_iter;
+       cursor->last_piece =
+               cursor->resid <= bio_iter_len(bio, cursor->bvec_iter);
  }
  
  static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
  {
        struct ceph_msg_data *data = cursor->data;
        struct bio *bio;
-       struct bio_vec *bio_vec;
-       unsigned int index;
+       struct bio_vec bio_vec;
  
        BUG_ON(data->type != CEPH_MSG_DATA_BIO);
  
        bio = cursor->bio;
        BUG_ON(!bio);
  
-       index = cursor->vector_index;
-       BUG_ON(index >= (unsigned int) bio->bi_vcnt);
+       bio_vec = bio_iter_iovec(bio, cursor->bvec_iter);
  
-       bio_vec = &bio->bi_io_vec[index];
-       BUG_ON(cursor->vector_offset >= bio_vec->bv_len);
-       *page_offset = (size_t) (bio_vec->bv_offset + cursor->vector_offset);
+       *page_offset = (size_t) bio_vec.bv_offset;
        BUG_ON(*page_offset >= PAGE_SIZE);
        if (cursor->last_piece) /* pagelist offset is always 0 */
                *length = cursor->resid;
        else
-               *length = (size_t) (bio_vec->bv_len - cursor->vector_offset);
+               *length = (size_t) bio_vec.bv_len;
        BUG_ON(*length > cursor->resid);
        BUG_ON(*page_offset + *length > PAGE_SIZE);
  
-       return bio_vec->bv_page;
+       return bio_vec.bv_page;
  }
  
  static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
                                        size_t bytes)
  {
        struct bio *bio;
-       struct bio_vec *bio_vec;
-       unsigned int index;
+       struct bio_vec bio_vec;
  
        BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO);
  
        bio = cursor->bio;
        BUG_ON(!bio);
  
-       index = cursor->vector_index;
-       BUG_ON(index >= (unsigned int) bio->bi_vcnt);
-       bio_vec = &bio->bi_io_vec[index];
+       bio_vec = bio_iter_iovec(bio, cursor->bvec_iter);
  
        /* Advance the cursor offset */
  
        BUG_ON(cursor->resid < bytes);
        cursor->resid -= bytes;
-       cursor->vector_offset += bytes;
-       if (cursor->vector_offset < bio_vec->bv_len)
+       bio_advance_iter(bio, &cursor->bvec_iter, bytes);
+       if (bytes < bio_vec.bv_len)
                return false;   /* more bytes to process in this segment */
-       BUG_ON(cursor->vector_offset != bio_vec->bv_len);
  
        /* Move on to the next segment, and possibly the next bio */
  
-       if (++index == (unsigned int) bio->bi_vcnt) {
+       if (!cursor->bvec_iter.bi_size) {
                bio = bio->bi_next;
-               index = 0;
+               cursor->bvec_iter = bio->bi_iter;
        }
        cursor->bio = bio;
-       cursor->vector_index = index;
-       cursor->vector_offset = 0;
  
        if (!cursor->last_piece) {
                BUG_ON(!cursor->resid);
                BUG_ON(!bio);
                /* A short read is OK, so use <= rather than == */
-               if (cursor->resid <= bio->bi_io_vec[index].bv_len)
+               if (cursor->resid <= bio_iter_len(bio, cursor->bvec_iter))
                        cursor->last_piece = true;
        }
  
@@@ -1866,9 -1856,7 +1857,9 @@@ int ceph_parse_ips(const char *c, cons
                                port = (port * 10) + (*p - '0');
                                p++;
                        }
 -                      if (port > 65535 || port == 0)
 +                      if (port == 0)
 +                              port = CEPH_MON_PORT;
 +                      else if (port > 65535)
                                goto bad;
                } else {
                        port = CEPH_MON_PORT;
@@@ -1948,8 -1936,7 +1939,8 @@@ static int process_connect(struct ceph_
  {
        u64 sup_feat = con->msgr->supported_features;
        u64 req_feat = con->msgr->required_features;
 -      u64 server_feat = le64_to_cpu(con->in_reply.features);
 +      u64 server_feat = ceph_sanitize_features(
 +                              le64_to_cpu(con->in_reply.features));
        int ret;
  
        dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@@ -2857,8 -2844,8 +2848,8 @@@ static void con_fault(struct ceph_conne
   */
  void ceph_messenger_init(struct ceph_messenger *msgr,
                        struct ceph_entity_addr *myaddr,
 -                      u32 supported_features,
 -                      u32 required_features,
 +                      u64 supported_features,
 +                      u64 required_features,
                        bool nocrc)
  {
        msgr->supported_features = supported_features;
@@@ -3130,8 -3117,15 +3121,8 @@@ struct ceph_msg *ceph_msg_new(int type
        INIT_LIST_HEAD(&m->data);
  
        /* front */
 -      m->front_max = front_len;
        if (front_len) {
 -              if (front_len > PAGE_CACHE_SIZE) {
 -                      m->front.iov_base = __vmalloc(front_len, flags,
 -                                                    PAGE_KERNEL);
 -                      m->front_is_vmalloc = true;
 -              } else {
 -                      m->front.iov_base = kmalloc(front_len, flags);
 -              }
 +              m->front.iov_base = ceph_kvmalloc(front_len, flags);
                if (m->front.iov_base == NULL) {
                        dout("ceph_msg_new can't allocate %d bytes\n",
                             front_len);
        } else {
                m->front.iov_base = NULL;
        }
 -      m->front.iov_len = front_len;
 +      m->front_alloc_len = m->front.iov_len = front_len;
  
        dout("ceph_msg_new %p front %d\n", m, front_len);
        return m;
@@@ -3253,7 -3247,10 +3244,7 @@@ static int ceph_con_in_msg_alloc(struc
  void ceph_msg_kfree(struct ceph_msg *m)
  {
        dout("msg_kfree %p\n", m);
 -      if (m->front_is_vmalloc)
 -              vfree(m->front.iov_base);
 -      else
 -              kfree(m->front.iov_base);
 +      ceph_kvfree(m->front.iov_base);
        kmem_cache_free(ceph_msg_cache, m);
  }
  
@@@ -3295,8 -3292,8 +3286,8 @@@ EXPORT_SYMBOL(ceph_msg_last_put)
  
  void ceph_msg_dump(struct ceph_msg *msg)
  {
 -      pr_debug("msg_dump %p (front_max %d length %zd)\n", msg,
 -               msg->front_max, msg->data_length);
 +      pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg,
 +               msg->front_alloc_len, msg->data_length);
        print_hex_dump(KERN_DEBUG, "header: ",
                       DUMP_PREFIX_OFFSET, 16, 1,
                       &msg->hdr, sizeof(msg->hdr), true);
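
In ceph_msg_data_bio_next()/ceph_msg_data_bio_advance() above, the hand-rolled (vector_index, vector_offset) bookkeeping is replaced by a bvec_iter that is simply advanced by a byte count. A userspace analogue of that cursor style (not ceph code; the segment lengths are arbitrary, and the caller must not advance past the final segment):

#include <stdio.h>
#include <stddef.h>

struct seg { size_t len; };

struct iter {
        size_t idx;     /* current segment */
        size_t done;    /* bytes consumed within it */
};

static void advance(struct iter *it, const struct seg *segs, size_t bytes)
{
        it->done += bytes;
        while (it->done >= segs[it->idx].len) {
                it->done -= segs[it->idx].len;
                it->idx++;
        }
}

int main(void)
{
        struct seg segs[] = { {100}, {50}, {200} };
        struct iter it = {0, 0};

        advance(&it, segs, 120);        /* lands 20 bytes into segment 1 */
        printf("segment %zu, offset %zu\n", it.idx, it.done);
        return 0;
}
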