Merge branch 'for-3.14/core' of git://git.kernel.dk/linux-block
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 30 Jan 2014 19:19:05 +0000 (11:19 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 30 Jan 2014 19:19:05 +0000 (11:19 -0800)
Pull core block IO changes from Jens Axboe:
 "The major piece in here is the immutable bio_ve series from Kent, the
  rest is fairly minor.  It was supposed to go in last round, but
  various issues pushed it to this release instead.  The pull request
  contains:

   - Various smaller blk-mq fixes from different folks.  Nothing major
     here, just minor fixes and cleanups.

   - Fix for a memory leak in the error path in the block ioctl code
     from Christian Engelmayer.

   - Header export fix from CaiZhiyong.

   - Finally the immutable biovec changes from Kent Overstreet.  This
     enables some nice future work on making arbitrarily sized bios
     possible, and splitting more efficient.  Related fixes to immutable
     bio_vecs:

        - dm-cache immutable fixup from Mike Snitzer.
        - btrfs immutable fixup from Muthu Kumar.

  - bio-integrity fix from Nic Bellinger, which is also going to stable"

* 'for-3.14/core' of git://git.kernel.dk/linux-block: (44 commits)
  xtensa: fixup simdisk driver to work with immutable bio_vecs
  block/blk-mq-cpu.c: use hotcpu_notifier()
  blk-mq: for_each_* macro correctness
  block: Fix memory leak in rw_copy_check_uvector() handling
  bio-integrity: Fix bio_integrity_verify segment start bug
  block: remove unrelated header files and export symbol
  blk-mq: uses page->list incorrectly
  blk-mq: use __smp_call_function_single directly
  btrfs: fix missing increment of bi_remaining
  Revert "block: Warn and free bio if bi_end_io is not set"
  block: Warn and free bio if bi_end_io is not set
  blk-mq: fix initializing request's start time
  block: blk-mq: don't export blk_mq_free_queue()
  block: blk-mq: make blk_sync_queue support mq
  block: blk-mq: support draining mq queue
  dm cache: increment bi_remaining when bi_end_io is restored
  block: fixup for generic bio chaining
  block: Really silence spurious compiler warnings
  block: Silence spurious compiler warnings
  block: Kill bio_pair_split()
  ...
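
For readers unfamiliar with the immutable biovec work described above, the recurring pattern in the hunks below is a switch from index-based segment iteration (a struct bio_vec pointer plus an int index, with bio->bi_sector and bio->bi_size mutated in place) to iteration over an immutable vector through a struct bvec_iter, with position and size read from bio->bi_iter. The following sketch only illustrates that new idiom; the helper name example_bio_bytes is hypothetical and is not part of this series.

#include <linux/bio.h>

/*
 * Illustrative sketch only (hypothetical helper, not from this series):
 * walk a bio's segments with the immutable-biovec API.  The bio_vec is
 * taken by value and the struct bvec_iter carries the position, so the
 * bio itself is never modified while iterating.
 */
static unsigned int example_bio_bytes(struct bio *bio)
{
        struct bio_vec bv;
        struct bvec_iter iter;
        unsigned int bytes = 0;

        bio_for_each_segment(bv, bio, iter)
                bytes += bv.bv_len;

        /* The sum of segment lengths matches bio->bi_iter.bi_size. */
        return bytes;
}

The sector position is likewise read from bio->bi_iter.bi_sector rather than bio->bi_sector, which is why the hunks below rename those field accesses throughout.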

28 files changed:
block/blk-throttle.c
drivers/block/rbd.c
drivers/block/xen-blkfront.c
drivers/md/bcache/request.c
drivers/md/dm-bufio.c
drivers/md/dm-cache-policy-mq.c
drivers/md/dm-cache-target.c
drivers/md/dm-delay.c
drivers/md/dm-snap.c
drivers/md/dm-thin.c
drivers/md/dm.c
drivers/md/md.c
drivers/md/raid1.c
drivers/md/raid10.c
drivers/md/raid5.c
drivers/s390/block/xpram.c
drivers/scsi/sd.c
drivers/staging/lustre/lustre/llite/lloop.c
fs/btrfs/inode.c
fs/f2fs/data.c
fs/gfs2/lops.c
fs/gfs2/ops_fstype.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_buf.c
include/linux/ceph/messenger.h
include/trace/events/f2fs.h
mm/page_io.c
net/ceph/messenger.c

diff --combined block/blk-throttle.c
index a760857e6b62609dde239ad74aebe2b5ac2ebaac,20f82003777511798659a467666c3a8f6d4b8cbb..1474c3ab7e72cb85698ffe8bb3687df66729281b
@@@ -877,14 -877,14 +877,14 @@@ static bool tg_with_in_bps_limit(struc
        do_div(tmp, HZ);
        bytes_allowed = tmp;
  
-       if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
+       if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
                if (wait)
                        *wait = 0;
                return 1;
        }
  
        /* Calc approx time to dispatch */
-       extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
+       extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
        jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
  
        if (!jiffy_wait)
@@@ -987,7 -987,7 +987,7 @@@ static void throtl_charge_bio(struct th
        bool rw = bio_data_dir(bio);
  
        /* Charge the bio to the group */
-       tg->bytes_disp[rw] += bio->bi_size;
+       tg->bytes_disp[rw] += bio->bi_iter.bi_size;
        tg->io_disp[rw]++;
  
        /*
         */
        if (!(bio->bi_rw & REQ_THROTTLED)) {
                bio->bi_rw |= REQ_THROTTLED;
-               throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size,
-                                            bio->bi_rw);
+               throtl_update_dispatch_stats(tg_to_blkg(tg),
+                                            bio->bi_iter.bi_size, bio->bi_rw);
        }
  }
  
@@@ -1303,10 -1303,13 +1303,10 @@@ static u64 tg_prfill_cpu_rwstat(struct 
        return __blkg_prfill_rwstat(sf, pd, &rwstat);
  }
  
 -static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css,
 -                             struct cftype *cft, struct seq_file *sf)
 +static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
  {
 -      struct blkcg *blkcg = css_to_blkcg(css);
 -
 -      blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
 -                        cft->private, true);
 +      blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
 +                        &blkcg_policy_throtl, seq_cft(sf)->private, true);
        return 0;
  }
  
@@@ -1332,17 -1335,19 +1332,17 @@@ static u64 tg_prfill_conf_uint(struct s
        return __blkg_prfill_u64(sf, pd, v);
  }
  
 -static int tg_print_conf_u64(struct cgroup_subsys_state *css,
 -                           struct cftype *cft, struct seq_file *sf)
 +static int tg_print_conf_u64(struct seq_file *sf, void *v)
  {
 -      blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64,
 -                        &blkcg_policy_throtl, cft->private, false);
 +      blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
 +                        &blkcg_policy_throtl, seq_cft(sf)->private, false);
        return 0;
  }
  
 -static int tg_print_conf_uint(struct cgroup_subsys_state *css,
 -                            struct cftype *cft, struct seq_file *sf)
 +static int tg_print_conf_uint(struct seq_file *sf, void *v)
  {
 -      blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint,
 -                        &blkcg_policy_throtl, cft->private, false);
 +      blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
 +                        &blkcg_policy_throtl, seq_cft(sf)->private, false);
        return 0;
  }
  
@@@ -1423,40 -1428,40 +1423,40 @@@ static struct cftype throtl_files[] = 
        {
                .name = "throttle.read_bps_device",
                .private = offsetof(struct throtl_grp, bps[READ]),
 -              .read_seq_string = tg_print_conf_u64,
 +              .seq_show = tg_print_conf_u64,
                .write_string = tg_set_conf_u64,
                .max_write_len = 256,
        },
        {
                .name = "throttle.write_bps_device",
                .private = offsetof(struct throtl_grp, bps[WRITE]),
 -              .read_seq_string = tg_print_conf_u64,
 +              .seq_show = tg_print_conf_u64,
                .write_string = tg_set_conf_u64,
                .max_write_len = 256,
        },
        {
                .name = "throttle.read_iops_device",
                .private = offsetof(struct throtl_grp, iops[READ]),
 -              .read_seq_string = tg_print_conf_uint,
 +              .seq_show = tg_print_conf_uint,
                .write_string = tg_set_conf_uint,
                .max_write_len = 256,
        },
        {
                .name = "throttle.write_iops_device",
                .private = offsetof(struct throtl_grp, iops[WRITE]),
 -              .read_seq_string = tg_print_conf_uint,
 +              .seq_show = tg_print_conf_uint,
                .write_string = tg_set_conf_uint,
                .max_write_len = 256,
        },
        {
                .name = "throttle.io_service_bytes",
                .private = offsetof(struct tg_stats_cpu, service_bytes),
 -              .read_seq_string = tg_print_cpu_rwstat,
 +              .seq_show = tg_print_cpu_rwstat,
        },
        {
                .name = "throttle.io_serviced",
                .private = offsetof(struct tg_stats_cpu, serviced),
 -              .read_seq_string = tg_print_cpu_rwstat,
 +              .seq_show = tg_print_cpu_rwstat,
        },
        { }     /* terminate */
  };
@@@ -1503,7 -1508,7 +1503,7 @@@ bool blk_throtl_bio(struct request_queu
        if (tg) {
                if (!tg->has_rules[rw]) {
                        throtl_update_dispatch_stats(tg_to_blkg(tg),
-                                                    bio->bi_size, bio->bi_rw);
+                                       bio->bi_iter.bi_size, bio->bi_rw);
                        goto out_unlock_rcu;
                }
        }
        /* out-of-limit, queue to @tg */
        throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
                   rw == READ ? 'R' : 'W',
-                  tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
+                  tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw],
                   tg->io_disp[rw], tg->iops[rw],
                   sq->nr_queued[READ], sq->nr_queued[WRITE]);
  
diff --combined drivers/block/rbd.c
index 16cab6635163797da9414a27cb8634356d5cd999,3624368b910dd30fe841cb42e2f13a8ec5109bc8..b365e0dfccb66f7c256a9d07d7fd976fba17ae95
@@@ -41,7 -41,6 +41,7 @@@
  #include <linux/fs.h>
  #include <linux/blkdev.h>
  #include <linux/slab.h>
 +#include <linux/idr.h>
  
  #include "rbd_types.h"
  
@@@ -90,9 -89,9 +90,9 @@@ static int atomic_dec_return_safe(atomi
  }
  
  #define RBD_DRV_NAME "rbd"
 -#define RBD_DRV_NAME_LONG "rbd (rados block device)"
  
 -#define RBD_MINORS_PER_MAJOR  256             /* max minors per blkdev */
 +#define RBD_MINORS_PER_MAJOR          256
 +#define RBD_SINGLE_MAJOR_PART_SHIFT   4
  
  #define RBD_SNAP_DEV_NAME_PREFIX      "snap_"
  #define RBD_MAX_SNAP_NAME_LEN \
@@@ -324,7 -323,6 +324,7 @@@ struct rbd_device 
        int                     dev_id;         /* blkdev unique id */
  
        int                     major;          /* blkdev assigned major */
 +      int                     minor;
        struct gendisk          *disk;          /* blkdev's gendisk and rq */
  
        u32                     image_format;   /* Either 1 or 2 */
@@@ -388,17 -386,6 +388,17 @@@ static struct kmem_cache *rbd_img_reque
  static struct kmem_cache      *rbd_obj_request_cache;
  static struct kmem_cache      *rbd_segment_name_cache;
  
 +static int rbd_major;
 +static DEFINE_IDA(rbd_dev_id_ida);
 +
 +/*
 + * Default to false for now, as single-major requires >= 0.75 version of
 + * userspace rbd utility.
 + */
 +static bool single_major = false;
 +module_param(single_major, bool, S_IRUGO);
 +MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
 +
  static int rbd_img_request_submit(struct rbd_img_request *img_request);
  
  static void rbd_dev_device_release(struct device *dev);
@@@ -407,52 -394,18 +407,52 @@@ static ssize_t rbd_add(struct bus_type 
                       size_t count);
  static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
                          size_t count);
 +static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
 +                                  size_t count);
 +static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
 +                                     size_t count);
  static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
  static void rbd_spec_put(struct rbd_spec *spec);
  
 +static int rbd_dev_id_to_minor(int dev_id)
 +{
 +      return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
 +}
 +
 +static int minor_to_rbd_dev_id(int minor)
 +{
 +      return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
 +}
 +
  static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
  static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
 +static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
 +static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
  
  static struct attribute *rbd_bus_attrs[] = {
        &bus_attr_add.attr,
        &bus_attr_remove.attr,
 +      &bus_attr_add_single_major.attr,
 +      &bus_attr_remove_single_major.attr,
        NULL,
  };
 -ATTRIBUTE_GROUPS(rbd_bus);
 +
 +static umode_t rbd_bus_is_visible(struct kobject *kobj,
 +                                struct attribute *attr, int index)
 +{
 +      if (!single_major &&
 +          (attr == &bus_attr_add_single_major.attr ||
 +           attr == &bus_attr_remove_single_major.attr))
 +              return 0;
 +
 +      return attr->mode;
 +}
 +
 +static const struct attribute_group rbd_bus_group = {
 +      .attrs = rbd_bus_attrs,
 +      .is_visible = rbd_bus_is_visible,
 +};
 +__ATTRIBUTE_GROUPS(rbd_bus);
  
  static struct bus_type rbd_bus_type = {
        .name           = "rbd",
@@@ -1088,9 -1041,9 +1088,9 @@@ static const char *rbd_segment_name(str
        name_format = "%s.%012llx";
        if (rbd_dev->image_format == 2)
                name_format = "%s.%016llx";
 -      ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, name_format,
 +      ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
                        rbd_dev->header.object_prefix, segment);
 -      if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
 +      if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
                pr_err("error formatting segment name for #%llu (%d)\n",
                        segment, ret);
                kfree(name);
@@@ -1156,23 -1109,23 +1156,23 @@@ static void bio_chain_put(struct bio *c
   */
  static void zero_bio_chain(struct bio *chain, int start_ofs)
  {
-       struct bio_vec *bv;
+       struct bio_vec bv;
+       struct bvec_iter iter;
        unsigned long flags;
        void *buf;
-       int i;
        int pos = 0;
  
        while (chain) {
-               bio_for_each_segment(bv, chain, i) {
-                       if (pos + bv->bv_len > start_ofs) {
+               bio_for_each_segment(bv, chain, iter) {
+                       if (pos + bv.bv_len > start_ofs) {
                                int remainder = max(start_ofs - pos, 0);
-                               buf = bvec_kmap_irq(bv, &flags);
+                               buf = bvec_kmap_irq(&bv, &flags);
                                memset(buf + remainder, 0,
-                                      bv->bv_len - remainder);
-                               flush_dcache_page(bv->bv_page);
+                                      bv.bv_len - remainder);
+                               flush_dcache_page(bv.bv_page);
                                bvec_kunmap_irq(buf, &flags);
                        }
-                       pos += bv->bv_len;
+                       pos += bv.bv_len;
                }
  
                chain = chain->bi_next;
@@@ -1220,74 -1173,14 +1220,14 @@@ static struct bio *bio_clone_range(stru
                                        unsigned int len,
                                        gfp_t gfpmask)
  {
-       struct bio_vec *bv;
-       unsigned int resid;
-       unsigned short idx;
-       unsigned int voff;
-       unsigned short end_idx;
-       unsigned short vcnt;
        struct bio *bio;
  
-       /* Handle the easy case for the caller */
-       if (!offset && len == bio_src->bi_size)
-               return bio_clone(bio_src, gfpmask);
-       if (WARN_ON_ONCE(!len))
-               return NULL;
-       if (WARN_ON_ONCE(len > bio_src->bi_size))
-               return NULL;
-       if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
-               return NULL;
-       /* Find first affected segment... */
-       resid = offset;
-       bio_for_each_segment(bv, bio_src, idx) {
-               if (resid < bv->bv_len)
-                       break;
-               resid -= bv->bv_len;
-       }
-       voff = resid;
-       /* ...and the last affected segment */
-       resid += len;
-       __bio_for_each_segment(bv, bio_src, end_idx, idx) {
-               if (resid <= bv->bv_len)
-                       break;
-               resid -= bv->bv_len;
-       }
-       vcnt = end_idx - idx + 1;
-       /* Build the clone */
-       bio = bio_alloc(gfpmask, (unsigned int) vcnt);
+       bio = bio_clone(bio_src, gfpmask);
        if (!bio)
                return NULL;    /* ENOMEM */
  
-       bio->bi_bdev = bio_src->bi_bdev;
-       bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
-       bio->bi_rw = bio_src->bi_rw;
-       bio->bi_flags |= 1 << BIO_CLONED;
-       /*
-        * Copy over our part of the bio_vec, then update the first
-        * and last (or only) entries.
-        */
-       memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
-                       vcnt * sizeof (struct bio_vec));
-       bio->bi_io_vec[0].bv_offset += voff;
-       if (vcnt > 1) {
-               bio->bi_io_vec[0].bv_len -= voff;
-               bio->bi_io_vec[vcnt - 1].bv_len = resid;
-       } else {
-               bio->bi_io_vec[0].bv_len = len;
-       }
-       bio->bi_vcnt = vcnt;
-       bio->bi_size = len;
-       bio->bi_idx = 0;
+       bio_advance(bio, offset);
+       bio->bi_iter.bi_size = len;
  
        return bio;
  }
@@@ -1318,7 -1211,7 +1258,7 @@@ static struct bio *bio_chain_clone_rang
  
        /* Build up a chain of clone bios up to the limit */
  
-       if (!bi || off >= bi->bi_size || !len)
+       if (!bi || off >= bi->bi_iter.bi_size || !len)
                return NULL;            /* Nothing to clone */
  
        end = &chain;
                        rbd_warn(NULL, "bio_chain exhausted with %u left", len);
                        goto out_err;   /* EINVAL; ran out of bio's */
                }
-               bi_size = min_t(unsigned int, bi->bi_size - off, len);
+               bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
                bio = bio_clone_range(bi, off, bi_size, gfpmask);
                if (!bio)
                        goto out_err;   /* ENOMEM */
                end = &bio->bi_next;
  
                off += bi_size;
-               if (off == bi->bi_size) {
+               if (off == bi->bi_iter.bi_size) {
                        bi = bi->bi_next;
                        off = 0;
                }
@@@ -1808,8 -1701,11 +1748,8 @@@ static struct ceph_osd_request *rbd_osd
        osd_req->r_callback = rbd_osd_req_callback;
        osd_req->r_priv = obj_request;
  
 -      osd_req->r_oid_len = strlen(obj_request->object_name);
 -      rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
 -      memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
 -
 -      osd_req->r_file_layout = rbd_dev->layout;       /* struct */
 +      osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
 +      ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
  
        return osd_req;
  }
@@@ -1846,8 -1742,11 +1786,8 @@@ rbd_osd_req_create_copyup(struct rbd_ob
        osd_req->r_callback = rbd_osd_req_callback;
        osd_req->r_priv = obj_request;
  
 -      osd_req->r_oid_len = strlen(obj_request->object_name);
 -      rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
 -      memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
 -
 -      osd_req->r_file_layout = rbd_dev->layout;       /* struct */
 +      osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
 +      ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
  
        return osd_req;
  }
@@@ -2227,7 -2126,8 +2167,8 @@@ static int rbd_img_request_fill(struct 
  
        if (type == OBJ_REQUEST_BIO) {
                bio_list = data_desc;
-               rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
+               rbd_assert(img_offset ==
+                          bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
        } else {
                rbd_assert(type == OBJ_REQUEST_PAGES);
                pages = data_desc;
@@@ -2907,7 -2807,7 +2848,7 @@@ static void rbd_watch_cb(u64 ver, u64 n
   * Request sync osd watch/unwatch.  The value of "start" determines
   * whether a watch request is being initiated or torn down.
   */
 -static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
 +static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
  {
        struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        struct rbd_obj_request *obj_request;
@@@ -2982,22 -2882,6 +2923,22 @@@ out_cancel
        return ret;
  }
  
 +static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
 +{
 +      return __rbd_dev_header_watch_sync(rbd_dev, true);
 +}
 +
 +static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
 +{
 +      int ret;
 +
 +      ret = __rbd_dev_header_watch_sync(rbd_dev, false);
 +      if (ret) {
 +              rbd_warn(rbd_dev, "unable to tear down watch request: %d\n",
 +                       ret);
 +      }
 +}
 +
  /*
   * Synchronous osd object method call.  Returns the number of bytes
   * returned in the outbound buffer, or a negative error code.
@@@ -3445,18 -3329,14 +3386,18 @@@ static int rbd_init_disk(struct rbd_dev
        u64 segment_size;
  
        /* create gendisk info */
 -      disk = alloc_disk(RBD_MINORS_PER_MAJOR);
 +      disk = alloc_disk(single_major ?
 +                        (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
 +                        RBD_MINORS_PER_MAJOR);
        if (!disk)
                return -ENOMEM;
  
        snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
                 rbd_dev->dev_id);
        disk->major = rbd_dev->major;
 -      disk->first_minor = 0;
 +      disk->first_minor = rbd_dev->minor;
 +      if (single_major)
 +              disk->flags |= GENHD_FL_EXT_DEVT;
        disk->fops = &rbd_bd_ops;
        disk->private_data = rbd_dev;
  
@@@ -3528,14 -3408,7 +3469,14 @@@ static ssize_t rbd_major_show(struct de
                return sprintf(buf, "%d\n", rbd_dev->major);
  
        return sprintf(buf, "(none)\n");
 +}
  
 +static ssize_t rbd_minor_show(struct device *dev,
 +                            struct device_attribute *attr, char *buf)
 +{
 +      struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 +
 +      return sprintf(buf, "%d\n", rbd_dev->minor);
  }
  
  static ssize_t rbd_client_id_show(struct device *dev,
@@@ -3657,7 -3530,6 +3598,7 @@@ static ssize_t rbd_image_refresh(struc
  static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
  static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
  static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
 +static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
  static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
  static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
  static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
@@@ -3671,7 -3543,6 +3612,7 @@@ static struct attribute *rbd_attrs[] = 
        &dev_attr_size.attr,
        &dev_attr_features.attr,
        &dev_attr_major.attr,
 +      &dev_attr_minor.attr,
        &dev_attr_client_id.attr,
        &dev_attr_pool.attr,
        &dev_attr_pool_id.attr,
@@@ -4442,29 -4313,21 +4383,29 @@@ static void rbd_bus_del_dev(struct rbd_
        device_unregister(&rbd_dev->dev);
  }
  
 -static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
 -
  /*
   * Get a unique rbd identifier for the given new rbd_dev, and add
 - * the rbd_dev to the global list.  The minimum rbd id is 1.
 + * the rbd_dev to the global list.
   */
 -static void rbd_dev_id_get(struct rbd_device *rbd_dev)
 +static int rbd_dev_id_get(struct rbd_device *rbd_dev)
  {
 -      rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
 +      int new_dev_id;
 +
 +      new_dev_id = ida_simple_get(&rbd_dev_id_ida,
 +                                  0, minor_to_rbd_dev_id(1 << MINORBITS),
 +                                  GFP_KERNEL);
 +      if (new_dev_id < 0)
 +              return new_dev_id;
 +
 +      rbd_dev->dev_id = new_dev_id;
  
        spin_lock(&rbd_dev_list_lock);
        list_add_tail(&rbd_dev->node, &rbd_dev_list);
        spin_unlock(&rbd_dev_list_lock);
 -      dout("rbd_dev %p given dev id %llu\n", rbd_dev,
 -              (unsigned long long) rbd_dev->dev_id);
 +
 +      dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id);
 +
 +      return 0;
  }
  
  /*
   */
  static void rbd_dev_id_put(struct rbd_device *rbd_dev)
  {
 -      struct list_head *tmp;
 -      int rbd_id = rbd_dev->dev_id;
 -      int max_id;
 -
 -      rbd_assert(rbd_id > 0);
 -
 -      dout("rbd_dev %p released dev id %llu\n", rbd_dev,
 -              (unsigned long long) rbd_dev->dev_id);
        spin_lock(&rbd_dev_list_lock);
        list_del_init(&rbd_dev->node);
 -
 -      /*
 -       * If the id being "put" is not the current maximum, there
 -       * is nothing special we need to do.
 -       */
 -      if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
 -              spin_unlock(&rbd_dev_list_lock);
 -              return;
 -      }
 -
 -      /*
 -       * We need to update the current maximum id.  Search the
 -       * list to find out what it is.  We're more likely to find
 -       * the maximum at the end, so search the list backward.
 -       */
 -      max_id = 0;
 -      list_for_each_prev(tmp, &rbd_dev_list) {
 -              struct rbd_device *rbd_dev;
 -
 -              rbd_dev = list_entry(tmp, struct rbd_device, node);
 -              if (rbd_dev->dev_id > max_id)
 -                      max_id = rbd_dev->dev_id;
 -      }
        spin_unlock(&rbd_dev_list_lock);
  
 -      /*
 -       * The max id could have been updated by rbd_dev_id_get(), in
 -       * which case it now accurately reflects the new maximum.
 -       * Be careful not to overwrite the maximum value in that
 -       * case.
 -       */
 -      atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
 -      dout("  max dev id has been reset\n");
 +      ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
 +
 +      dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id);
  }
  
  /*
@@@ -4902,29 -4801,20 +4843,29 @@@ static int rbd_dev_device_setup(struct 
  {
        int ret;
  
 -      /* generate unique id: find highest unique id, add one */
 -      rbd_dev_id_get(rbd_dev);
 +      /* Get an id and fill in device name. */
 +
 +      ret = rbd_dev_id_get(rbd_dev);
 +      if (ret)
 +              return ret;
  
 -      /* Fill in the device name, now that we have its id. */
        BUILD_BUG_ON(DEV_NAME_LEN
                        < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
        sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
  
 -      /* Get our block major device number. */
 +      /* Record our major and minor device numbers. */
  
 -      ret = register_blkdev(0, rbd_dev->name);
 -      if (ret < 0)
 -              goto err_out_id;
 -      rbd_dev->major = ret;
 +      if (!single_major) {
 +              ret = register_blkdev(0, rbd_dev->name);
 +              if (ret < 0)
 +                      goto err_out_id;
 +
 +              rbd_dev->major = ret;
 +              rbd_dev->minor = 0;
 +      } else {
 +              rbd_dev->major = rbd_major;
 +              rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
 +      }
  
        /* Set up the blkdev mapping. */
  
@@@ -4956,8 -4846,7 +4897,8 @@@ err_out_mapping
  err_out_disk:
        rbd_free_disk(rbd_dev);
  err_out_blkdev:
 -      unregister_blkdev(rbd_dev->major, rbd_dev->name);
 +      if (!single_major)
 +              unregister_blkdev(rbd_dev->major, rbd_dev->name);
  err_out_id:
        rbd_dev_id_put(rbd_dev);
        rbd_dev_mapping_clear(rbd_dev);
@@@ -5013,6 -4902,7 +4954,6 @@@ static void rbd_dev_image_release(struc
  static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
  {
        int ret;
 -      int tmp;
  
        /*
         * Get the id from the image id object.  Unless there's an
                goto err_out_format;
  
        if (mapping) {
 -              ret = rbd_dev_header_watch_sync(rbd_dev, true);
 +              ret = rbd_dev_header_watch_sync(rbd_dev);
                if (ret)
                        goto out_header_name;
        }
  err_out_probe:
        rbd_dev_unprobe(rbd_dev);
  err_out_watch:
 -      if (mapping) {
 -              tmp = rbd_dev_header_watch_sync(rbd_dev, false);
 -              if (tmp)
 -                      rbd_warn(rbd_dev, "unable to tear down "
 -                                      "watch request (%d)\n", tmp);
 -      }
 +      if (mapping)
 +              rbd_dev_header_unwatch_sync(rbd_dev);
  out_header_name:
        kfree(rbd_dev->header_name);
        rbd_dev->header_name = NULL;
@@@ -5073,9 -4967,9 +5014,9 @@@ err_out_format
        return ret;
  }
  
 -static ssize_t rbd_add(struct bus_type *bus,
 -                     const char *buf,
 -                     size_t count)
 +static ssize_t do_rbd_add(struct bus_type *bus,
 +                        const char *buf,
 +                        size_t count)
  {
        struct rbd_device *rbd_dev = NULL;
        struct ceph_options *ceph_opts = NULL;
  
        rc = rbd_dev_device_setup(rbd_dev);
        if (rc) {
 +              /*
 +               * rbd_dev_header_unwatch_sync() can't be moved into
 +               * rbd_dev_image_release() without refactoring, see
 +               * commit 1f3ef78861ac.
 +               */
 +              rbd_dev_header_unwatch_sync(rbd_dev);
                rbd_dev_image_release(rbd_dev);
                goto err_out_module;
        }
@@@ -5163,23 -5051,6 +5104,23 @@@ err_out_module
        return (ssize_t)rc;
  }
  
 +static ssize_t rbd_add(struct bus_type *bus,
 +                     const char *buf,
 +                     size_t count)
 +{
 +      if (single_major)
 +              return -EINVAL;
 +
 +      return do_rbd_add(bus, buf, count);
 +}
 +
 +static ssize_t rbd_add_single_major(struct bus_type *bus,
 +                                  const char *buf,
 +                                  size_t count)
 +{
 +      return do_rbd_add(bus, buf, count);
 +}
 +
  static void rbd_dev_device_release(struct device *dev)
  {
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
        rbd_free_disk(rbd_dev);
        clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
        rbd_dev_mapping_clear(rbd_dev);
 -      unregister_blkdev(rbd_dev->major, rbd_dev->name);
 -      rbd_dev->major = 0;
 +      if (!single_major)
 +              unregister_blkdev(rbd_dev->major, rbd_dev->name);
        rbd_dev_id_put(rbd_dev);
        rbd_dev_mapping_clear(rbd_dev);
  }
@@@ -5219,9 -5090,9 +5160,9 @@@ static void rbd_dev_remove_parent(struc
        }
  }
  
 -static ssize_t rbd_remove(struct bus_type *bus,
 -                        const char *buf,
 -                        size_t count)
 +static ssize_t do_rbd_remove(struct bus_type *bus,
 +                           const char *buf,
 +                           size_t count)
  {
        struct rbd_device *rbd_dev = NULL;
        struct list_head *tmp;
        if (ret < 0 || already)
                return ret;
  
 -      ret = rbd_dev_header_watch_sync(rbd_dev, false);
 -      if (ret)
 -              rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
 -
 +      rbd_dev_header_unwatch_sync(rbd_dev);
        /*
         * flush remaining watch callbacks - these must be complete
         * before the osd_client is shutdown
         */
        dout("%s: flushing notifies", __func__);
        ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
 +
        /*
         * Don't free anything from rbd_dev->disk until after all
         * notifies are completely processed. Otherwise
        return count;
  }
  
 +static ssize_t rbd_remove(struct bus_type *bus,
 +                        const char *buf,
 +                        size_t count)
 +{
 +      if (single_major)
 +              return -EINVAL;
 +
 +      return do_rbd_remove(bus, buf, count);
 +}
 +
 +static ssize_t rbd_remove_single_major(struct bus_type *bus,
 +                                     const char *buf,
 +                                     size_t count)
 +{
 +      return do_rbd_remove(bus, buf, count);
 +}
 +
  /*
   * create control files in sysfs
   * /sys/bus/rbd/...
@@@ -5344,7 -5200,7 +5285,7 @@@ static int rbd_slab_init(void
  
        rbd_assert(!rbd_segment_name_cache);
        rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
 -                                      MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL);
 +                                      CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
        if (rbd_segment_name_cache)
                return 0;
  out_err:
@@@ -5380,45 -5236,24 +5321,45 @@@ static int __init rbd_init(void
  
        if (!libceph_compatible(NULL)) {
                rbd_warn(NULL, "libceph incompatibility (quitting)");
 -
                return -EINVAL;
        }
 +
        rc = rbd_slab_init();
        if (rc)
                return rc;
 +
 +      if (single_major) {
 +              rbd_major = register_blkdev(0, RBD_DRV_NAME);
 +              if (rbd_major < 0) {
 +                      rc = rbd_major;
 +                      goto err_out_slab;
 +              }
 +      }
 +
        rc = rbd_sysfs_init();
        if (rc)
 -              rbd_slab_exit();
 +              goto err_out_blkdev;
 +
 +      if (single_major)
 +              pr_info("loaded (major %d)\n", rbd_major);
        else
 -              pr_info("loaded " RBD_DRV_NAME_LONG "\n");
 +              pr_info("loaded\n");
 +
 +      return 0;
  
 +err_out_blkdev:
 +      if (single_major)
 +              unregister_blkdev(rbd_major, RBD_DRV_NAME);
 +err_out_slab:
 +      rbd_slab_exit();
        return rc;
  }
  
  static void __exit rbd_exit(void)
  {
        rbd_sysfs_cleanup();
 +      if (single_major)
 +              unregister_blkdev(rbd_major, RBD_DRV_NAME);
        rbd_slab_exit();
  }
  
@@@ -5428,8 -5263,9 +5369,8 @@@ module_exit(rbd_exit)
  MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
  MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
  MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
 -MODULE_DESCRIPTION("rados block device");
 -
  /* following authorship retained from original osdblk.c */
  MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
  
 +MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
  MODULE_LICENSE("GPL");
diff --combined drivers/block/xen-blkfront.c
index f9c43f91f03e5de68bff030b663f094e56fc1f9f,26ad7923e3319c802a4ab252001ecdf8785b8d60..8dcfb54f160302e0e1d91c232387f758b2f8e0f6
@@@ -1356,7 -1356,7 +1356,7 @@@ static int blkfront_probe(struct xenbus
                char *type;
                int len;
                /* no unplug has been done: do not hook devices != xen vbds */
 -              if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) {
 +              if (xen_has_pv_and_legacy_disk_devices()) {
                        int major;
  
                        if (!VDEV_IS_EXTENDED(vdevice))
@@@ -1547,7 -1547,7 +1547,7 @@@ static int blkif_recover(struct blkfron
                        for (i = 0; i < pending; i++) {
                                offset = (i * segs * PAGE_SIZE) >> 9;
                                size = min((unsigned int)(segs * PAGE_SIZE) >> 9,
-                                          (unsigned int)(bio->bi_size >> 9) - offset);
+                                          (unsigned int)bio_sectors(bio) - offset);
                                cloned_bio = bio_clone(bio, GFP_NOIO);
                                BUG_ON(cloned_bio == NULL);
                                bio_trim(cloned_bio, offset, size);
@@@ -2079,7 -2079,7 +2079,7 @@@ static int __init xlblk_init(void
        if (!xen_domain())
                return -ENODEV;
  
 -      if (xen_hvm_domain() && !xen_platform_pci_unplug)
 +      if (!xen_has_pv_disk_devices())
                return -ENODEV;
  
        if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
diff --combined drivers/md/bcache/request.c
index 61bcfc21d2a0f4972b581a689fd1c3c929f7bd38,5878cdb3952948d78e029f4ed0a343d7a7807cab..c906571997d7a4ab256188f05f4a8c11ea5928f8
@@@ -163,6 -163,7 +163,6 @@@ static struct cgroup_subsys_state *bcac
  static void bcachecg_destroy(struct cgroup *cgroup)
  {
        struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
 -      free_css_id(&bcache_subsys, &cg->css);
        kfree(cg);
  }
  
@@@ -197,14 -198,14 +197,14 @@@ static bool verify(struct cached_dev *d
  
  static void bio_csum(struct bio *bio, struct bkey *k)
  {
-       struct bio_vec *bv;
+       struct bio_vec bv;
+       struct bvec_iter iter;
        uint64_t csum = 0;
-       int i;
  
-       bio_for_each_segment(bv, bio, i) {
-               void *d = kmap(bv->bv_page) + bv->bv_offset;
-               csum = bch_crc64_update(csum, d, bv->bv_len);
-               kunmap(bv->bv_page);
+       bio_for_each_segment(bv, bio, iter) {
+               void *d = kmap(bv.bv_page) + bv.bv_offset;
+               csum = bch_crc64_update(csum, d, bv.bv_len);
+               kunmap(bv.bv_page);
        }
  
        k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
@@@ -260,7 -261,7 +260,7 @@@ static void bch_data_invalidate(struct 
        struct bio *bio = op->bio;
  
        pr_debug("invalidating %i sectors from %llu",
-                bio_sectors(bio), (uint64_t) bio->bi_sector);
+                bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector);
  
        while (bio_sectors(bio)) {
                unsigned sectors = min(bio_sectors(bio),
                if (bch_keylist_realloc(&op->insert_keys, 0, op->c))
                        goto out;
  
-               bio->bi_sector  += sectors;
-               bio->bi_size    -= sectors << 9;
+               bio->bi_iter.bi_sector  += sectors;
+               bio->bi_iter.bi_size    -= sectors << 9;
  
                bch_keylist_add(&op->insert_keys,
-                               &KEY(op->inode, bio->bi_sector, sectors));
+                               &KEY(op->inode, bio->bi_iter.bi_sector, sectors));
        }
  
        op->insert_data_done = true;
@@@ -363,14 -364,14 +363,14 @@@ static void bch_data_insert_start(struc
                k = op->insert_keys.top;
                bkey_init(k);
                SET_KEY_INODE(k, op->inode);
-               SET_KEY_OFFSET(k, bio->bi_sector);
+               SET_KEY_OFFSET(k, bio->bi_iter.bi_sector);
  
                if (!bch_alloc_sectors(op->c, k, bio_sectors(bio),
                                       op->write_point, op->write_prio,
                                       op->writeback))
                        goto err;
  
-               n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
+               n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);
  
                n->bi_end_io    = bch_data_insert_endio;
                n->bi_private   = cl;
@@@ -521,7 -522,7 +521,7 @@@ static bool check_should_bypass(struct 
             (bio->bi_rw & REQ_WRITE)))
                goto skip;
  
-       if (bio->bi_sector & (c->sb.block_size - 1) ||
+       if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
            bio_sectors(bio) & (c->sb.block_size - 1)) {
                pr_debug("skipping unaligned io");
                goto skip;
  
        spin_lock(&dc->io_lock);
  
-       hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
-               if (i->last == bio->bi_sector &&
+       hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
+               if (i->last == bio->bi_iter.bi_sector &&
                    time_before(jiffies, i->jiffies))
                        goto found;
  
        add_sequential(task);
        i->sequential = 0;
  found:
-       if (i->sequential + bio->bi_size > i->sequential)
-               i->sequential   += bio->bi_size;
+       if (i->sequential + bio->bi_iter.bi_size > i->sequential)
+               i->sequential   += bio->bi_iter.bi_size;
  
        i->last                  = bio_end_sector(bio);
        i->jiffies               = jiffies + msecs_to_jiffies(5000);
@@@ -605,7 -606,6 +605,6 @@@ struct search 
        unsigned                insert_bio_sectors;
  
        unsigned                recoverable:1;
-       unsigned                unaligned_bvec:1;
        unsigned                write:1;
        unsigned                read_dirty_data:1;
  
@@@ -649,15 -649,15 +648,15 @@@ static int cache_lookup_fn(struct btree
        struct bkey *bio_key;
        unsigned ptr;
  
-       if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_sector, 0)) <= 0)
+       if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0)
                return MAP_CONTINUE;
  
        if (KEY_INODE(k) != s->iop.inode ||
-           KEY_START(k) > bio->bi_sector) {
+           KEY_START(k) > bio->bi_iter.bi_sector) {
                unsigned bio_sectors = bio_sectors(bio);
                unsigned sectors = KEY_INODE(k) == s->iop.inode
                        ? min_t(uint64_t, INT_MAX,
-                               KEY_START(k) - bio->bi_sector)
+                               KEY_START(k) - bio->bi_iter.bi_sector)
                        : INT_MAX;
  
                int ret = s->d->cache_miss(b, s, bio, sectors);
        if (KEY_DIRTY(k))
                s->read_dirty_data = true;
  
-       n = bch_bio_split(bio, min_t(uint64_t, INT_MAX,
-                                    KEY_OFFSET(k) - bio->bi_sector),
-                         GFP_NOIO, s->d->bio_split);
+       n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
+                                     KEY_OFFSET(k) - bio->bi_iter.bi_sector),
+                          GFP_NOIO, s->d->bio_split);
  
        bio_key = &container_of(n, struct bbio, bio)->key;
        bch_bkey_copy_single_ptr(bio_key, k, ptr);
  
-       bch_cut_front(&KEY(s->iop.inode, n->bi_sector, 0), bio_key);
+       bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key);
        bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);
  
        n->bi_end_io    = bch_cache_read_endio;
@@@ -713,7 -713,7 +712,7 @@@ static void cache_lookup(struct closur
        struct bio *bio = &s->bio.bio;
  
        int ret = bch_btree_map_keys(&s->op, s->iop.c,
-                                    &KEY(s->iop.inode, bio->bi_sector, 0),
+                                    &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
                                     cache_lookup_fn, MAP_END_KEY);
        if (ret == -EAGAIN)
                continue_at(cl, cache_lookup, bcache_wq);
@@@ -758,10 -758,12 +757,12 @@@ static void bio_complete(struct search 
  static void do_bio_hook(struct search *s)
  {
        struct bio *bio = &s->bio.bio;
-       memcpy(bio, s->orig_bio, sizeof(struct bio));
  
+       bio_init(bio);
+       __bio_clone_fast(bio, s->orig_bio);
        bio->bi_end_io          = request_endio;
        bio->bi_private         = &s->cl;
        atomic_set(&bio->bi_cnt, 3);
  }
  
@@@ -773,9 -775,6 +774,6 @@@ static void search_free(struct closure 
        if (s->iop.bio)
                bio_put(s->iop.bio);
  
-       if (s->unaligned_bvec)
-               mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec);
        closure_debug_destroy(cl);
        mempool_free(s, s->d->c->search);
  }
  static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
  {
        struct search *s;
-       struct bio_vec *bv;
  
        s = mempool_alloc(d->c->search, GFP_NOIO);
        memset(s, 0, offsetof(struct search, iop.insert_keys));
        s->start_time           = jiffies;
        do_bio_hook(s);
  
-       if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) {
-               bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO);
-               memcpy(bv, bio_iovec(bio),
-                      sizeof(struct bio_vec) * bio_segments(bio));
-               s->bio.bio.bi_io_vec    = bv;
-               s->unaligned_bvec       = 1;
-       }
        return s;
  }
  
@@@ -849,26 -838,13 +837,13 @@@ static void cached_dev_read_error(struc
  {
        struct search *s = container_of(cl, struct search, cl);
        struct bio *bio = &s->bio.bio;
-       struct bio_vec *bv;
-       int i;
  
        if (s->recoverable) {
                /* Retry from the backing device: */
                trace_bcache_read_retry(s->orig_bio);
  
                s->iop.error = 0;
-               bv = s->bio.bio.bi_io_vec;
                do_bio_hook(s);
-               s->bio.bio.bi_io_vec = bv;
-               if (!s->unaligned_bvec)
-                       bio_for_each_segment(bv, s->orig_bio, i)
-                               bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
-               else
-                       memcpy(s->bio.bio.bi_io_vec,
-                              bio_iovec(s->orig_bio),
-                              sizeof(struct bio_vec) *
-                              bio_segments(s->orig_bio));
  
                /* XXX: invalidate cache */
  
@@@ -893,9 -869,9 +868,9 @@@ static void cached_dev_read_done(struc
  
        if (s->iop.bio) {
                bio_reset(s->iop.bio);
-               s->iop.bio->bi_sector = s->cache_miss->bi_sector;
+               s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector;
                s->iop.bio->bi_bdev = s->cache_miss->bi_bdev;
-               s->iop.bio->bi_size = s->insert_bio_sectors << 9;
+               s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
                bch_bio_map(s->iop.bio, NULL);
  
                bio_copy_data(s->cache_miss, s->iop.bio);
                s->cache_miss = NULL;
        }
  
-       if (verify(dc, &s->bio.bio) && s->recoverable &&
-           !s->unaligned_bvec && !s->read_dirty_data)
+       if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data)
                bch_data_verify(dc, s->orig_bio);
  
        bio_complete(s);
@@@ -945,7 -920,7 +919,7 @@@ static int cached_dev_cache_miss(struc
        struct bio *miss, *cache_bio;
  
        if (s->cache_miss || s->iop.bypass) {
-               miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+               miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
                ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
                goto out_submit;
        }
        s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada);
  
        s->iop.replace_key = KEY(s->iop.inode,
-                                bio->bi_sector + s->insert_bio_sectors,
+                                bio->bi_iter.bi_sector + s->insert_bio_sectors,
                                 s->insert_bio_sectors);
  
        ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key);
  
        s->iop.replace = true;
  
-       miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+       miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
  
        /* btree_search_recurse()'s btree iterator is no good anymore */
        ret = miss == bio ? MAP_DONE : -EINTR;
        if (!cache_bio)
                goto out_submit;
  
-       cache_bio->bi_sector    = miss->bi_sector;
-       cache_bio->bi_bdev      = miss->bi_bdev;
-       cache_bio->bi_size      = s->insert_bio_sectors << 9;
+       cache_bio->bi_iter.bi_sector    = miss->bi_iter.bi_sector;
+       cache_bio->bi_bdev              = miss->bi_bdev;
+       cache_bio->bi_iter.bi_size      = s->insert_bio_sectors << 9;
  
        cache_bio->bi_end_io    = request_endio;
        cache_bio->bi_private   = &s->cl;
@@@ -1031,7 -1006,7 +1005,7 @@@ static void cached_dev_write(struct cac
  {
        struct closure *cl = &s->cl;
        struct bio *bio = &s->bio.bio;
-       struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0);
+       struct bkey start = KEY(dc->disk.id, bio->bi_iter.bi_sector, 0);
        struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0);
  
        bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end);
                        closure_bio_submit(flush, cl, s->d);
                }
        } else {
-               s->iop.bio = bio_clone_bioset(bio, GFP_NOIO,
-                                             dc->disk.bio_split);
+               s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split);
  
                closure_bio_submit(bio, cl, s->d);
        }
@@@ -1126,13 -1100,13 +1099,13 @@@ static void cached_dev_make_request(str
        part_stat_unlock();
  
        bio->bi_bdev = dc->bdev;
-       bio->bi_sector += dc->sb.data_offset;
+       bio->bi_iter.bi_sector += dc->sb.data_offset;
  
        if (cached_dev_get(dc)) {
                s = search_alloc(bio, d);
                trace_bcache_request_start(s->d, bio);
  
-               if (!bio->bi_size) {
+               if (!bio->bi_iter.bi_size) {
                        /*
                         * can't call bch_journal_meta from under
                         * generic_make_request
@@@ -1204,24 -1178,24 +1177,24 @@@ void bch_cached_dev_request_init(struc
  static int flash_dev_cache_miss(struct btree *b, struct search *s,
                                struct bio *bio, unsigned sectors)
  {
-       struct bio_vec *bv;
-       int i;
+       struct bio_vec bv;
+       struct bvec_iter iter;
  
        /* Zero fill bio */
  
-       bio_for_each_segment(bv, bio, i) {
-               unsigned j = min(bv->bv_len >> 9, sectors);
+       bio_for_each_segment(bv, bio, iter) {
+               unsigned j = min(bv.bv_len >> 9, sectors);
  
-               void *p = kmap(bv->bv_page);
-               memset(p + bv->bv_offset, 0, j << 9);
-               kunmap(bv->bv_page);
+               void *p = kmap(bv.bv_page);
+               memset(p + bv.bv_offset, 0, j << 9);
+               kunmap(bv.bv_page);
  
                sectors -= j;
        }
  
-       bio_advance(bio, min(sectors << 9, bio->bi_size));
+       bio_advance(bio, min(sectors << 9, bio->bi_iter.bi_size));
  
-       if (!bio->bi_size)
+       if (!bio->bi_iter.bi_size)
                return MAP_DONE;
  
        return MAP_CONTINUE;
@@@ -1255,7 -1229,7 +1228,7 @@@ static void flash_dev_make_request(stru
  
        trace_bcache_request_start(s->d, bio);
  
-       if (!bio->bi_size) {
+       if (!bio->bi_iter.bi_size) {
                /*
                 * can't call bch_journal_meta from under
                 * generic_make_request
                                      bcache_wq);
        } else if (rw) {
                bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
-                                       &KEY(d->id, bio->bi_sector, 0),
+                                       &KEY(d->id, bio->bi_iter.bi_sector, 0),
                                        &KEY(d->id, bio_end_sector(bio), 0));
  
                s->iop.bypass           = (bio->bi_rw & REQ_DISCARD) != 0;
diff --combined drivers/md/dm-bufio.c
index 9ed42125514b38d560464e4dd3d741038db06858,a1b58a65d8ed849ecef2217e1b93f2a5d3e42ba1..66c5d130c8c24c4f3101ce78296460da4487f38b
@@@ -104,8 -104,6 +104,8 @@@ struct dm_bufio_client 
        struct list_head reserved_buffers;
        unsigned need_reserved_buffers;
  
 +      unsigned minimum_buffers;
 +
        struct hlist_head *cache_hash;
        wait_queue_head_t free_buffer_wait;
  
@@@ -540,7 -538,7 +540,7 @@@ static void use_inline_bio(struct dm_bu
        bio_init(&b->bio);
        b->bio.bi_io_vec = b->bio_vec;
        b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
-       b->bio.bi_sector = block << b->c->sectors_per_block_bits;
+       b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
        b->bio.bi_bdev = b->c->bdev;
        b->bio.bi_end_io = end_io;
  
@@@ -863,8 -861,8 +863,8 @@@ static void __get_memory_limit(struct d
        buffers = dm_bufio_cache_size_per_client >>
                  (c->sectors_per_block_bits + SECTOR_SHIFT);
  
 -      if (buffers < DM_BUFIO_MIN_BUFFERS)
 -              buffers = DM_BUFIO_MIN_BUFFERS;
 +      if (buffers < c->minimum_buffers)
 +              buffers = c->minimum_buffers;
  
        *limit_buffers = buffers;
        *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
@@@ -1352,34 -1350,6 +1352,34 @@@ retry
  }
  EXPORT_SYMBOL_GPL(dm_bufio_release_move);
  
 +/*
 + * Free the given buffer.
 + *
 + * This is just a hint, if the buffer is in use or dirty, this function
 + * does nothing.
 + */
 +void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
 +{
 +      struct dm_buffer *b;
 +
 +      dm_bufio_lock(c);
 +
 +      b = __find(c, block);
 +      if (b && likely(!b->hold_count) && likely(!b->state)) {
 +              __unlink_buffer(b);
 +              __free_buffer_wake(b);
 +      }
 +
 +      dm_bufio_unlock(c);
 +}
 +EXPORT_SYMBOL(dm_bufio_forget);
 +
 +void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
 +{
 +      c->minimum_buffers = n;
 +}
 +EXPORT_SYMBOL(dm_bufio_set_minimum_buffers);
 +
  unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
  {
        return c->block_size;
@@@ -1576,8 -1546,6 +1576,8 @@@ struct dm_bufio_client *dm_bufio_client
        INIT_LIST_HEAD(&c->reserved_buffers);
        c->need_reserved_buffers = reserved_buffers;
  
 +      c->minimum_buffers = DM_BUFIO_MIN_BUFFERS;
 +
        init_waitqueue_head(&c->free_buffer_wait);
        c->async_write_error = 0;
  
diff --combined drivers/md/dm-cache-policy-mq.c
index 930e8c3d73e985b1e75769a9894f13ffd32d756a,d13a16865d03ddc4ec418618d3d8fbdb4909ddc8..1e018e986610a57ef9f82a818aa1f70a8c364e30
@@@ -72,7 -72,7 +72,7 @@@ static enum io_pattern iot_pattern(stru
  
  static void iot_update_stats(struct io_tracker *t, struct bio *bio)
  {
-       if (bio->bi_sector == from_oblock(t->last_end_oblock) + 1)
+       if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1)
                t->nr_seq_samples++;
        else {
                /*
@@@ -87,7 -87,7 +87,7 @@@
                t->nr_rand_samples++;
        }
  
-       t->last_end_oblock = to_oblock(bio->bi_sector + bio_sectors(bio) - 1);
+       t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1);
  }
  
  static void iot_check_for_pattern_switch(struct io_tracker *t)
@@@ -287,8 -287,9 +287,8 @@@ static struct entry *alloc_entry(struc
  static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
  {
        struct entry *e = ep->entries + from_cblock(cblock);
 -      list_del(&e->list);
  
 -      INIT_LIST_HEAD(&e->list);
 +      list_del_init(&e->list);
        INIT_HLIST_NODE(&e->hlist);
        ep->nr_allocated++;
  
@@@ -390,10 -391,6 +390,10 @@@ struct mq_policy 
         */
        unsigned promote_threshold;
  
 +      unsigned discard_promote_adjustment;
 +      unsigned read_promote_adjustment;
 +      unsigned write_promote_adjustment;
 +
        /*
         * The hash table allows us to quickly find an entry by origin
         * block.  Both pre_cache and cache entries are in here.
        struct hlist_head *table;
  };
  
 +#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1
 +#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4
 +#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8
 +
  /*----------------------------------------------------------------*/
  
  /*
@@@ -649,21 -642,25 +649,21 @@@ static int demote_cblock(struct mq_poli
   * We bias towards reads, since they can be demoted at no cost if they
   * haven't been dirtied.
   */
 -#define DISCARDED_PROMOTE_THRESHOLD 1
 -#define READ_PROMOTE_THRESHOLD 4
 -#define WRITE_PROMOTE_THRESHOLD 8
 -
  static unsigned adjusted_promote_threshold(struct mq_policy *mq,
                                           bool discarded_oblock, int data_dir)
  {
        if (data_dir == READ)
 -              return mq->promote_threshold + READ_PROMOTE_THRESHOLD;
 +              return mq->promote_threshold + mq->read_promote_adjustment;
  
        if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
                /*
                 * We don't need to do any copying at all, so give this a
                 * very low threshold.
                 */
 -              return DISCARDED_PROMOTE_THRESHOLD;
 +              return mq->discard_promote_adjustment;
        }
  
 -      return mq->promote_threshold + WRITE_PROMOTE_THRESHOLD;
 +      return mq->promote_threshold + mq->write_promote_adjustment;
  }
  
  static bool should_promote(struct mq_policy *mq, struct entry *e,
@@@ -812,7 -809,7 +812,7 @@@ static int no_entry_found(struct mq_pol
                          bool can_migrate, bool discarded_oblock,
                          int data_dir, struct policy_result *result)
  {
 -      if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) == 1) {
 +      if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
                if (can_migrate)
                        insert_in_cache(mq, oblock, result);
                else
@@@ -1138,28 -1135,20 +1138,28 @@@ static int mq_set_config_value(struct d
                               const char *key, const char *value)
  {
        struct mq_policy *mq = to_mq_policy(p);
 -      enum io_pattern pattern;
        unsigned long tmp;
  
 -      if (!strcasecmp(key, "random_threshold"))
 -              pattern = PATTERN_RANDOM;
 -      else if (!strcasecmp(key, "sequential_threshold"))
 -              pattern = PATTERN_SEQUENTIAL;
 -      else
 -              return -EINVAL;
 -
        if (kstrtoul(value, 10, &tmp))
                return -EINVAL;
  
 -      mq->tracker.thresholds[pattern] = tmp;
 +      if (!strcasecmp(key, "random_threshold")) {
 +              mq->tracker.thresholds[PATTERN_RANDOM] = tmp;
 +
 +      } else if (!strcasecmp(key, "sequential_threshold")) {
 +              mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp;
 +
 +      } else if (!strcasecmp(key, "discard_promote_adjustment"))
 +              mq->discard_promote_adjustment = tmp;
 +
 +      else if (!strcasecmp(key, "read_promote_adjustment"))
 +              mq->read_promote_adjustment = tmp;
 +
 +      else if (!strcasecmp(key, "write_promote_adjustment"))
 +              mq->write_promote_adjustment = tmp;
 +
 +      else
 +              return -EINVAL;
  
        return 0;
  }
@@@ -1169,16 -1158,9 +1169,16 @@@ static int mq_emit_config_values(struc
        ssize_t sz = 0;
        struct mq_policy *mq = to_mq_policy(p);
  
 -      DMEMIT("4 random_threshold %u sequential_threshold %u",
 +      DMEMIT("10 random_threshold %u "
 +             "sequential_threshold %u "
 +             "discard_promote_adjustment %u "
 +             "read_promote_adjustment %u "
 +             "write_promote_adjustment %u",
               mq->tracker.thresholds[PATTERN_RANDOM],
 -             mq->tracker.thresholds[PATTERN_SEQUENTIAL]);
 +             mq->tracker.thresholds[PATTERN_SEQUENTIAL],
 +             mq->discard_promote_adjustment,
 +             mq->read_promote_adjustment,
 +             mq->write_promote_adjustment);
  
        return 0;
  }
@@@ -1231,9 -1213,6 +1231,9 @@@ static struct dm_cache_policy *mq_creat
        mq->hit_count = 0;
        mq->generation = 0;
        mq->promote_threshold = 0;
 +      mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT;
 +      mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT;
 +      mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT;
        mutex_init(&mq->lock);
        spin_lock_init(&mq->tick_lock);
  
@@@ -1265,7 -1244,7 +1265,7 @@@ bad_pre_cache_init
  
  static struct dm_cache_policy_type mq_policy_type = {
        .name = "mq",
 -      .version = {1, 1, 0},
 +      .version = {1, 2, 0},
        .hint_size = 4,
        .owner = THIS_MODULE,
        .create = mq_create
  
  static struct dm_cache_policy_type default_policy_type = {
        .name = "default",
 -      .version = {1, 1, 0},
 +      .version = {1, 2, 0},
        .hint_size = 4,
        .owner = THIS_MODULE,
 -      .create = mq_create
 +      .create = mq_create,
 +      .real = &mq_policy_type
  };
  
  static int __init mq_init(void)
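The recurring change across these hunks is the immutable biovec conversion: the cursor state that used to live directly on struct bio (bi_sector, bi_size, bi_idx) now lives in bio->bi_iter, and open-coded arithmetic gives way to the bio_sectors()/bio_end_sector() helpers, as in the iot_update_stats() hunk above. A minimal sketch of the accessor change, assuming only the 3.14 struct bio layout and not tied to any one driver:

#include <linux/bio.h>

/* 3.14+: the advancing position lives in bio->bi_iter. */
static sector_t sketch_start_sector(struct bio *bio)
{
        return bio->bi_iter.bi_sector;          /* was bio->bi_sector */
}

static unsigned sketch_bytes_remaining(struct bio *bio)
{
        return bio->bi_iter.bi_size;            /* was bio->bi_size */
}

static sector_t sketch_end_sector(struct bio *bio)
{
        /* bio_end_sector() == bi_iter.bi_sector + bio_sectors(bio) */
        return bio_end_sector(bio);
}
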
index 09334c275c79e91c7bf4fd41e18e641b2196073a,99f91628a33aa6b6969d2b87b61a861e85d16de6..ffd472e015caa918facaed4f65a621c0f61e58a9
@@@ -85,6 -85,12 +85,12 @@@ static void dm_unhook_bio(struct dm_hoo
  {
        bio->bi_end_io = h->bi_end_io;
        bio->bi_private = h->bi_private;
+       /*
+        * Must bump bi_remaining to allow bio to complete with
+        * restored bi_end_io.
+        */
+       atomic_inc(&bio->bi_remaining);
  }
  
  /*----------------------------------------------------------------*/
@@@ -664,15 -670,17 +670,17 @@@ static void remap_to_origin(struct cach
  static void remap_to_cache(struct cache *cache, struct bio *bio,
                           dm_cblock_t cblock)
  {
-       sector_t bi_sector = bio->bi_sector;
+       sector_t bi_sector = bio->bi_iter.bi_sector;
  
        bio->bi_bdev = cache->cache_dev->bdev;
        if (!block_size_is_power_of_two(cache))
-               bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
-                               sector_div(bi_sector, cache->sectors_per_block);
+               bio->bi_iter.bi_sector =
+                       (from_cblock(cblock) * cache->sectors_per_block) +
+                       sector_div(bi_sector, cache->sectors_per_block);
        else
-               bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
-                               (bi_sector & (cache->sectors_per_block - 1));
+               bio->bi_iter.bi_sector =
+                       (from_cblock(cblock) << cache->sectors_per_block_shift) |
+                       (bi_sector & (cache->sectors_per_block - 1));
  }
  
  static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
@@@ -712,7 -720,7 +720,7 @@@ static void remap_to_cache_dirty(struc
  
  static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
  {
-       sector_t block_nr = bio->bi_sector;
+       sector_t block_nr = bio->bi_iter.bi_sector;
  
        if (!block_size_is_power_of_two(cache))
                (void) sector_div(block_nr, cache->sectors_per_block);
@@@ -1027,7 -1035,7 +1035,7 @@@ static void issue_overwrite(struct dm_c
  static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
  {
        return (bio_data_dir(bio) == WRITE) &&
-               (bio->bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
+               (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
  }
  
  static void avoid_copy(struct dm_cache_migration *mg)
@@@ -1252,7 -1260,7 +1260,7 @@@ static void process_flush_bio(struct ca
        size_t pb_data_size = get_per_bio_data_size(cache);
        struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
  
-       BUG_ON(bio->bi_size);
+       BUG_ON(bio->bi_iter.bi_size);
        if (!pb->req_nr)
                remap_to_origin(cache, bio);
        else
   */
  static void process_discard_bio(struct cache *cache, struct bio *bio)
  {
-       dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
+       dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector,
                                                  cache->discard_block_size);
-       dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
+       dm_block_t end_block = bio_end_sector(bio);
        dm_block_t b;
  
        end_block = block_div(end_block, cache->discard_block_size);
@@@ -2826,13 -2834,12 +2834,13 @@@ static void cache_resume(struct dm_targ
  /*
   * Status format:
   *
 - * <#used metadata blocks>/<#total metadata blocks>
 + * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
 + * <cache block size> <#used cache blocks>/<#total cache blocks>
   * <#read hits> <#read misses> <#write hits> <#write misses>
 - * <#demotions> <#promotions> <#blocks in cache> <#dirty>
 + * <#demotions> <#promotions> <#dirty>
   * <#features> <features>*
   * <#core args> <core args>
 - * <#policy args> <policy args>*
 + * <policy name> <#policy args> <policy args>*
   */
  static void cache_status(struct dm_target *ti, status_type_t type,
                         unsigned status_flags, char *result, unsigned maxlen)
  
                residency = policy_residency(cache->policy);
  
 -              DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
 +              DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %llu ",
 +                     (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
                       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
                       (unsigned long long)nr_blocks_metadata,
 +                     cache->sectors_per_block,
 +                     (unsigned long long) from_cblock(residency),
 +                     (unsigned long long) from_cblock(cache->cache_size),
                       (unsigned) atomic_read(&cache->stats.read_hit),
                       (unsigned) atomic_read(&cache->stats.read_miss),
                       (unsigned) atomic_read(&cache->stats.write_hit),
                       (unsigned) atomic_read(&cache->stats.write_miss),
                       (unsigned) atomic_read(&cache->stats.demotion),
                       (unsigned) atomic_read(&cache->stats.promotion),
 -                     (unsigned long long) from_cblock(residency),
 -                     cache->nr_dirty);
 +                     (unsigned long long) from_cblock(cache->nr_dirty));
  
                if (writethrough_mode(&cache->features))
                        DMEMIT("1 writethrough ");
                }
  
                DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
 +
 +              DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
                if (sz < maxlen) {
                        r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
                        if (r)
@@@ -3135,7 -3137,7 +3143,7 @@@ static void cache_io_hints(struct dm_ta
  
  static struct target_type cache_target = {
        .name = "cache",
 -      .version = {1, 2, 0},
 +      .version = {1, 3, 0},
        .module = THIS_MODULE,
        .ctr = cache_ctr,
        .dtr = cache_dtr,
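dm-cache (and dm-thin and dm-snap below) temporarily hooks a bio's bi_end_io/bi_private and restores them later; with the new bio chaining, bio_endio() decrements bi_remaining, so the restore path must bump bi_remaining back up or the original completion handler never runs, which is exactly what the dm_unhook_bio() hunk adds. A minimal sketch of that save/override/restore pattern; my_hook_info, my_hook_bio and my_unhook_bio are illustrative names, not the driver's helpers:

#include <linux/bio.h>

struct my_hook_info {
        bio_end_io_t *bi_end_io;
        void *bi_private;
};

static void my_hook_bio(struct my_hook_info *h, struct bio *bio,
                        bio_end_io_t *endio, void *private)
{
        h->bi_end_io = bio->bi_end_io;
        h->bi_private = bio->bi_private;
        bio->bi_end_io = endio;
        bio->bi_private = private;
}

static void my_unhook_bio(struct my_hook_info *h, struct bio *bio)
{
        bio->bi_end_io = h->bi_end_io;
        bio->bi_private = h->bi_private;
        /*
         * The hooked endio already consumed one bi_remaining reference
         * when it completed; take it back so the restored bi_end_io can
         * still be called.
         */
        atomic_inc(&bio->bi_remaining);
}
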
diff --combined drivers/md/dm-delay.c
index a8a511c053a5d5fda6574933e616719256768d31,fc8482a65dd27d0083e1c0f226c619c4cc3142f8..42c3a27a14cc3a906b5f892a6206de348b6b58ee
@@@ -24,6 -24,7 +24,6 @@@ struct delay_c 
        struct work_struct flush_expired_bios;
        struct list_head delayed_bios;
        atomic_t may_delay;
 -      mempool_t *delayed_pool;
  
        struct dm_dev *dev_read;
        sector_t start_read;
  struct dm_delay_info {
        struct delay_c *context;
        struct list_head list;
 -      struct bio *bio;
        unsigned long expires;
  };
  
  static DEFINE_MUTEX(delayed_bios_lock);
  
 -static struct kmem_cache *delayed_cache;
 -
  static void handle_delayed_timer(unsigned long data)
  {
        struct delay_c *dc = (struct delay_c *)data;
@@@ -83,14 -87,13 +83,14 @@@ static struct bio *flush_delayed_bios(s
        mutex_lock(&delayed_bios_lock);
        list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
                if (flush_all || time_after_eq(jiffies, delayed->expires)) {
 +                      struct bio *bio = dm_bio_from_per_bio_data(delayed,
 +                                              sizeof(struct dm_delay_info));
                        list_del(&delayed->list);
 -                      bio_list_add(&flush_bios, delayed->bio);
 -                      if ((bio_data_dir(delayed->bio) == WRITE))
 +                      bio_list_add(&flush_bios, bio);
 +                      if ((bio_data_dir(bio) == WRITE))
                                delayed->context->writes--;
                        else
                                delayed->context->reads--;
 -                      mempool_free(delayed, dc->delayed_pool);
                        continue;
                }
  
@@@ -182,6 -185,12 +182,6 @@@ static int delay_ctr(struct dm_target *
        }
  
  out:
 -      dc->delayed_pool = mempool_create_slab_pool(128, delayed_cache);
 -      if (!dc->delayed_pool) {
 -              DMERR("Couldn't create delayed bio pool.");
 -              goto bad_dev_write;
 -      }
 -
        dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
        if (!dc->kdelayd_wq) {
                DMERR("Couldn't start kdelayd");
  
        ti->num_flush_bios = 1;
        ti->num_discard_bios = 1;
 +      ti->per_bio_data_size = sizeof(struct dm_delay_info);
        ti->private = dc;
        return 0;
  
  bad_queue:
 -      mempool_destroy(dc->delayed_pool);
 -bad_dev_write:
        if (dc->dev_write)
                dm_put_device(ti, dc->dev_write);
  bad_dev_read:
@@@ -222,6 -232,7 +222,6 @@@ static void delay_dtr(struct dm_target 
        if (dc->dev_write)
                dm_put_device(ti, dc->dev_write);
  
 -      mempool_destroy(dc->delayed_pool);
        kfree(dc);
  }
  
@@@ -233,9 -244,10 +233,9 @@@ static int delay_bio(struct delay_c *dc
        if (!delay || !atomic_read(&dc->may_delay))
                return 1;
  
 -      delayed = mempool_alloc(dc->delayed_pool, GFP_NOIO);
 +      delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
  
        delayed->context = dc;
 -      delayed->bio = bio;
        delayed->expires = expires = jiffies + (delay * HZ / 1000);
  
        mutex_lock(&delayed_bios_lock);
@@@ -277,14 -289,15 +277,15 @@@ static int delay_map(struct dm_target *
        if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) {
                bio->bi_bdev = dc->dev_write->bdev;
                if (bio_sectors(bio))
-                       bio->bi_sector = dc->start_write +
-                                        dm_target_offset(ti, bio->bi_sector);
+                       bio->bi_iter.bi_sector = dc->start_write +
+                               dm_target_offset(ti, bio->bi_iter.bi_sector);
  
                return delay_bio(dc, dc->write_delay, bio);
        }
  
        bio->bi_bdev = dc->dev_read->bdev;
-       bio->bi_sector = dc->start_read + dm_target_offset(ti, bio->bi_sector);
+       bio->bi_iter.bi_sector = dc->start_read +
+               dm_target_offset(ti, bio->bi_iter.bi_sector);
  
        return delay_bio(dc, dc->read_delay, bio);
  }
@@@ -344,7 -357,13 +345,7 @@@ static struct target_type delay_target 
  
  static int __init dm_delay_init(void)
  {
 -      int r = -ENOMEM;
 -
 -      delayed_cache = KMEM_CACHE(dm_delay_info, 0);
 -      if (!delayed_cache) {
 -              DMERR("Couldn't create delayed bio cache.");
 -              goto bad_memcache;
 -      }
 +      int r;
  
        r = dm_register_target(&delay_target);
        if (r < 0) {
        return 0;
  
  bad_register:
 -      kmem_cache_destroy(delayed_cache);
 -bad_memcache:
        return r;
  }
  
  static void __exit dm_delay_exit(void)
  {
        dm_unregister_target(&delay_target);
 -      kmem_cache_destroy(delayed_cache);
  }
  
  /* Module hooks */
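dm-delay no longer needs its own kmem_cache and mempool because struct dm_delay_info now rides in the per-bio data that device-mapper allocates with every cloned bio: the target declares the size in its constructor and converts between payload and bio with dm_per_bio_data()/dm_bio_from_per_bio_data(). A minimal sketch of that pattern for a hypothetical target (my_info, my_ctr and my_map are illustrative names):

#include <linux/device-mapper.h>
#include <linux/list.h>
#include <linux/jiffies.h>

struct my_info {
        struct list_head list;
        unsigned long expires;
};

static int my_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
        /* ... parse arguments, acquire devices ... */
        ti->per_bio_data_size = sizeof(struct my_info);  /* DM allocates this per bio */
        return 0;
}

static int my_map(struct dm_target *ti, struct bio *bio)
{
        struct my_info *info = dm_per_bio_data(bio, sizeof(struct my_info));

        INIT_LIST_HEAD(&info->list);
        info->expires = jiffies + HZ;

        /* Later, the bio can be recovered from the payload alone: */
        /* bio = dm_bio_from_per_bio_data(info, sizeof(struct my_info)); */

        return DM_MAPIO_REMAPPED;
}
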
diff --combined drivers/md/dm-snap.c
index 717718558bd9908469b23bbb9b3cd0223ac243f3,01b6a11813f29e5ab32e3af144714ab28dc26089..ebddef5237e4b28e6254e486b3267dbccca9864e
@@@ -610,12 -610,12 +610,12 @@@ static struct dm_exception *dm_lookup_e
        return NULL;
  }
  
 -static struct dm_exception *alloc_completed_exception(void)
 +static struct dm_exception *alloc_completed_exception(gfp_t gfp)
  {
        struct dm_exception *e;
  
 -      e = kmem_cache_alloc(exception_cache, GFP_NOIO);
 -      if (!e)
 +      e = kmem_cache_alloc(exception_cache, gfp);
 +      if (!e && gfp == GFP_NOIO)
                e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
  
        return e;
@@@ -697,7 -697,7 +697,7 @@@ static int dm_add_exception(void *conte
        struct dm_snapshot *s = context;
        struct dm_exception *e;
  
 -      e = alloc_completed_exception();
 +      e = alloc_completed_exception(GFP_KERNEL);
        if (!e)
                return -ENOMEM;
  
@@@ -1405,7 -1405,7 +1405,7 @@@ static void pending_complete(struct dm_
                goto out;
        }
  
 -      e = alloc_completed_exception();
 +      e = alloc_completed_exception(GFP_NOIO);
        if (!e) {
                down_write(&s->lock);
                __invalidate_snapshot(s, -ENOMEM);
@@@ -1438,6 -1438,7 +1438,7 @@@ out
        if (full_bio) {
                full_bio->bi_end_io = pe->full_bio_end_io;
                full_bio->bi_private = pe->full_bio_private;
+               atomic_inc(&full_bio->bi_remaining);
        }
        free_pending_exception(pe);
  
@@@ -1619,11 -1620,10 +1620,10 @@@ static void remap_exception(struct dm_s
                            struct bio *bio, chunk_t chunk)
  {
        bio->bi_bdev = s->cow->bdev;
-       bio->bi_sector = chunk_to_sector(s->store,
-                                        dm_chunk_number(e->new_chunk) +
-                                        (chunk - e->old_chunk)) +
-                                        (bio->bi_sector &
-                                         s->store->chunk_mask);
+       bio->bi_iter.bi_sector =
+               chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) +
+                               (chunk - e->old_chunk)) +
+               (bio->bi_iter.bi_sector & s->store->chunk_mask);
  }
  
  static int snapshot_map(struct dm_target *ti, struct bio *bio)
                return DM_MAPIO_REMAPPED;
        }
  
-       chunk = sector_to_chunk(s->store, bio->bi_sector);
+       chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
  
        /* Full snapshots are not usable */
        /* To get here the table must be live so s->active is always set. */
                r = DM_MAPIO_SUBMITTED;
  
                if (!pe->started &&
-                   bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) {
+                   bio->bi_iter.bi_size ==
+                   (s->store->chunk_size << SECTOR_SHIFT)) {
                        pe->started = 1;
                        up_write(&s->lock);
                        start_full_bio(pe, bio);
@@@ -1758,7 -1759,7 +1759,7 @@@ static int snapshot_merge_map(struct dm
                return DM_MAPIO_REMAPPED;
        }
  
-       chunk = sector_to_chunk(s->store, bio->bi_sector);
+       chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
  
        down_write(&s->lock);
  
@@@ -2095,7 -2096,7 +2096,7 @@@ static int do_origin(struct dm_dev *ori
        down_read(&_origins_lock);
        o = __lookup_origin(origin->bdev);
        if (o)
-               r = __origin_write(&o->snapshots, bio->bi_sector, bio);
+               r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
        up_read(&_origins_lock);
  
        return r;
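alloc_completed_exception() now takes the gfp flags from its caller, so dm_add_exception() (table-load/process context) can use GFP_KERNEL while pending_complete() on the I/O path keeps GFP_NOIO with a GFP_ATOMIC fallback. A minimal standalone sketch of that allocation pattern; exception_cache here stands in for the driver's slab cache:

#include <linux/slab.h>
#include <linux/gfp.h>

static struct kmem_cache *exception_cache;      /* assumed created at module init */

static void *alloc_exception(gfp_t gfp)
{
        void *e = kmem_cache_alloc(exception_cache, gfp);

        /*
         * Only GFP_NOIO callers (the I/O path) dip into the atomic
         * reserves as a fallback; GFP_KERNEL callers can simply sleep
         * until memory is available.
         */
        if (!e && gfp == GFP_NOIO)
                e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);

        return e;
}
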
diff --combined drivers/md/dm-thin.c
index 726228b33a012f9994fc2f8843b25a0ca46ef966,357eb272dbd9d3337bf5d1a13db443261eb2b24b..faaf944597ab7669b90f3ecb85152fbcd16cbe33
@@@ -144,7 -144,6 +144,7 @@@ struct pool_features 
        bool zero_new_blocks:1;
        bool discard_enabled:1;
        bool discard_passdown:1;
 +      bool error_if_no_space:1;
  };
  
  struct thin_c;
@@@ -164,7 -163,8 +164,7 @@@ struct pool 
        int sectors_per_block_shift;
  
        struct pool_features pf;
 -      unsigned low_water_triggered:1; /* A dm event has been sent */
 -      unsigned no_free_space:1;       /* A -ENOSPC warning has been issued */
 +      bool low_water_triggered:1;     /* A dm event has been sent */
  
        struct dm_bio_prison *prison;
        struct dm_kcopyd_client *copier;
  };
  
  static enum pool_mode get_pool_mode(struct pool *pool);
 -static void set_pool_mode(struct pool *pool, enum pool_mode mode);
 +static void out_of_data_space(struct pool *pool);
 +static void metadata_operation_failed(struct pool *pool, const char *op, int r);
  
  /*
   * Target context for a pool.
@@@ -414,7 -413,7 +414,7 @@@ static bool block_size_is_power_of_two(
  static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
  {
        struct pool *pool = tc->pool;
-       sector_t block_nr = bio->bi_sector;
+       sector_t block_nr = bio->bi_iter.bi_sector;
  
        if (block_size_is_power_of_two(pool))
                block_nr >>= pool->sectors_per_block_shift;
  static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
  {
        struct pool *pool = tc->pool;
-       sector_t bi_sector = bio->bi_sector;
+       sector_t bi_sector = bio->bi_iter.bi_sector;
  
        bio->bi_bdev = tc->pool_dev->bdev;
        if (block_size_is_power_of_two(pool))
-               bio->bi_sector = (block << pool->sectors_per_block_shift) |
-                               (bi_sector & (pool->sectors_per_block - 1));
+               bio->bi_iter.bi_sector =
+                       (block << pool->sectors_per_block_shift) |
+                       (bi_sector & (pool->sectors_per_block - 1));
        else
-               bio->bi_sector = (block * pool->sectors_per_block) +
+               bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
                                 sector_div(bi_sector, pool->sectors_per_block);
  }
  
@@@ -510,16 -510,15 +511,16 @@@ static void remap_and_issue(struct thin
  struct dm_thin_new_mapping {
        struct list_head list;
  
 -      unsigned quiesced:1;
 -      unsigned prepared:1;
 -      unsigned pass_discard:1;
 +      bool quiesced:1;
 +      bool prepared:1;
 +      bool pass_discard:1;
 +      bool definitely_not_shared:1;
  
 +      int err;
        struct thin_c *tc;
        dm_block_t virt_block;
        dm_block_t data_block;
        struct dm_bio_prison_cell *cell, *cell2;
 -      int err;
  
        /*
         * If the bio covers the whole area of a block then we can avoid
@@@ -536,7 -535,7 +537,7 @@@ static void __maybe_add_mapping(struct 
        struct pool *pool = m->tc->pool;
  
        if (m->quiesced && m->prepared) {
 -              list_add(&m->list, &pool->prepared_mappings);
 +              list_add_tail(&m->list, &pool->prepared_mappings);
                wake_worker(pool);
        }
  }
@@@ -550,7 -549,7 +551,7 @@@ static void copy_complete(int read_err
        m->err = read_err || write_err ? -EIO : 0;
  
        spin_lock_irqsave(&pool->lock, flags);
 -      m->prepared = 1;
 +      m->prepared = true;
        __maybe_add_mapping(m);
        spin_unlock_irqrestore(&pool->lock, flags);
  }
@@@ -565,7 -564,7 +566,7 @@@ static void overwrite_endio(struct bio 
        m->err = err;
  
        spin_lock_irqsave(&pool->lock, flags);
 -      m->prepared = 1;
 +      m->prepared = true;
        __maybe_add_mapping(m);
        spin_unlock_irqrestore(&pool->lock, flags);
  }
@@@ -612,8 -611,10 +613,10 @@@ static void cell_defer_no_holder(struc
  
  static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
  {
-       if (m->bio)
+       if (m->bio) {
                m->bio->bi_end_io = m->saved_bi_end_io;
+               atomic_inc(&m->bio->bi_remaining);
+       }
        cell_error(m->tc->pool, m->cell);
        list_del(&m->list);
        mempool_free(m, m->tc->pool->mapping_pool);
@@@ -627,8 -628,10 +630,10 @@@ static void process_prepared_mapping(st
        int r;
  
        bio = m->bio;
-       if (bio)
+       if (bio) {
                bio->bi_end_io = m->saved_bi_end_io;
+               atomic_inc(&bio->bi_remaining);
+       }
  
        if (m->err) {
                cell_error(pool, m->cell);
         */
        r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
        if (r) {
 -              DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d",
 -                          dm_device_name(pool->pool_md), r);
 -              set_pool_mode(pool, PM_READ_ONLY);
 +              metadata_operation_failed(pool, "dm_thin_insert_block", r);
                cell_error(pool, m->cell);
                goto out;
        }
@@@ -683,15 -688,7 +688,15 @@@ static void process_prepared_discard_pa
        cell_defer_no_holder(tc, m->cell2);
  
        if (m->pass_discard)
 -              remap_and_issue(tc, m->bio, m->data_block);
 +              if (m->definitely_not_shared)
 +                      remap_and_issue(tc, m->bio, m->data_block);
 +              else {
 +                      bool used = false;
 +                      if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
 +                              bio_endio(m->bio, 0);
 +                      else
 +                              remap_and_issue(tc, m->bio, m->data_block);
 +              }
        else
                bio_endio(m->bio, 0);
  
@@@ -731,7 -728,8 +736,8 @@@ static void process_prepared(struct poo
   */
  static int io_overlaps_block(struct pool *pool, struct bio *bio)
  {
-       return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
+       return bio->bi_iter.bi_size ==
+               (pool->sectors_per_block << SECTOR_SHIFT);
  }
  
  static int io_overwrites_block(struct pool *pool, struct bio *bio)
@@@ -759,17 -757,13 +765,17 @@@ static int ensure_next_mapping(struct p
  
  static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
  {
 -      struct dm_thin_new_mapping *r = pool->next_mapping;
 +      struct dm_thin_new_mapping *m = pool->next_mapping;
  
        BUG_ON(!pool->next_mapping);
  
 +      memset(m, 0, sizeof(struct dm_thin_new_mapping));
 +      INIT_LIST_HEAD(&m->list);
 +      m->bio = NULL;
 +
        pool->next_mapping = NULL;
  
 -      return r;
 +      return m;
  }
  
  static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
        struct pool *pool = tc->pool;
        struct dm_thin_new_mapping *m = get_next_mapping(pool);
  
 -      INIT_LIST_HEAD(&m->list);
 -      m->quiesced = 0;
 -      m->prepared = 0;
        m->tc = tc;
        m->virt_block = virt_block;
        m->data_block = data_dest;
        m->cell = cell;
 -      m->err = 0;
 -      m->bio = NULL;
  
        if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
 -              m->quiesced = 1;
 +              m->quiesced = true;
  
        /*
         * IO to pool_dev remaps to the pool target's data_dev.
@@@ -847,12 -846,15 +853,12 @@@ static void schedule_zero(struct thin_
        struct pool *pool = tc->pool;
        struct dm_thin_new_mapping *m = get_next_mapping(pool);
  
 -      INIT_LIST_HEAD(&m->list);
 -      m->quiesced = 1;
 -      m->prepared = 0;
 +      m->quiesced = true;
 +      m->prepared = false;
        m->tc = tc;
        m->virt_block = virt_block;
        m->data_block = data_block;
        m->cell = cell;
 -      m->err = 0;
 -      m->bio = NULL;
  
        /*
         * If the whole block of data is being overwritten or we are not
@@@ -899,42 -901,41 +905,42 @@@ static int commit(struct pool *pool
                return -EINVAL;
  
        r = dm_pool_commit_metadata(pool->pmd);
 -      if (r) {
 -              DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d",
 -                          dm_device_name(pool->pool_md), r);
 -              set_pool_mode(pool, PM_READ_ONLY);
 -      }
 +      if (r)
 +              metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
  
        return r;
  }
  
 -static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 +static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
  {
 -      int r;
 -      dm_block_t free_blocks;
        unsigned long flags;
 -      struct pool *pool = tc->pool;
 -
 -      /*
 -       * Once no_free_space is set we must not allow allocation to succeed.
 -       * Otherwise it is difficult to explain, debug, test and support.
 -       */
 -      if (pool->no_free_space)
 -              return -ENOSPC;
 -
 -      r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
 -      if (r)
 -              return r;
  
        if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
                DMWARN("%s: reached low water mark for data device: sending event.",
                       dm_device_name(pool->pool_md));
                spin_lock_irqsave(&pool->lock, flags);
 -              pool->low_water_triggered = 1;
 +              pool->low_water_triggered = true;
                spin_unlock_irqrestore(&pool->lock, flags);
                dm_table_event(pool->ti->table);
        }
 +}
 +
 +static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 +{
 +      int r;
 +      dm_block_t free_blocks;
 +      struct pool *pool = tc->pool;
 +
 +      if (get_pool_mode(pool) != PM_WRITE)
 +              return -EINVAL;
 +
 +      r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
 +      if (r) {
 +              metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
 +              return r;
 +      }
 +
 +      check_low_water_mark(pool, free_blocks);
  
        if (!free_blocks) {
                /*
                        return r;
  
                r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
 -              if (r)
 +              if (r) {
 +                      metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
                        return r;
 +              }
  
 -              /*
 -               * If we still have no space we set a flag to avoid
 -               * doing all this checking and return -ENOSPC.  This
 -               * flag serves as a latch that disallows allocations from
 -               * this pool until the admin takes action (e.g. resize or
 -               * table reload).
 -               */
                if (!free_blocks) {
 -                      DMWARN("%s: no free data space available.",
 -                             dm_device_name(pool->pool_md));
 -                      spin_lock_irqsave(&pool->lock, flags);
 -                      pool->no_free_space = 1;
 -                      spin_unlock_irqrestore(&pool->lock, flags);
 +                      out_of_data_space(pool);
                        return -ENOSPC;
                }
        }
  
        r = dm_pool_alloc_data_block(pool->pmd, result);
        if (r) {
 -              if (r == -ENOSPC &&
 -                  !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
 -                  !free_blocks) {
 -                      DMWARN("%s: no free metadata space available.",
 -                             dm_device_name(pool->pool_md));
 -                      set_pool_mode(pool, PM_READ_ONLY);
 -              }
 +              metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
                return r;
        }
  
@@@ -982,21 -998,7 +988,21 @@@ static void retry_on_resume(struct bio 
        spin_unlock_irqrestore(&pool->lock, flags);
  }
  
 -static void no_space(struct pool *pool, struct dm_bio_prison_cell *cell)
 +static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
 +{
 +      /*
 +       * When pool is read-only, no cell locking is needed because
 +       * nothing is changing.
 +       */
 +      WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY);
 +
 +      if (pool->pf.error_if_no_space)
 +              bio_io_error(bio);
 +      else
 +              retry_on_resume(bio);
 +}
 +
 +static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
  {
        struct bio *bio;
        struct bio_list bios;
        cell_release(pool, cell, &bios);
  
        while ((bio = bio_list_pop(&bios)))
 -              retry_on_resume(bio);
 +              handle_unserviceable_bio(pool, bio);
  }
  
  static void process_discard(struct thin_c *tc, struct bio *bio)
                         */
                        m = get_next_mapping(pool);
                        m->tc = tc;
 -                      m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
 +                      m->pass_discard = pool->pf.discard_passdown;
 +                      m->definitely_not_shared = !lookup_result.shared;
                        m->virt_block = block;
                        m->data_block = lookup_result.block;
                        m->cell = cell;
                        m->cell2 = cell2;
 -                      m->err = 0;
                        m->bio = bio;
  
                        if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
                                spin_lock_irqsave(&pool->lock, flags);
 -                              list_add(&m->list, &pool->prepared_discards);
 +                              list_add_tail(&m->list, &pool->prepared_discards);
                                spin_unlock_irqrestore(&pool->lock, flags);
                                wake_worker(pool);
                        }
@@@ -1109,12 -1111,13 +1115,12 @@@ static void break_sharing(struct thin_
                break;
  
        case -ENOSPC:
 -              no_space(pool, cell);
 +              retry_bios_on_resume(pool, cell);
                break;
  
        default:
                DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
                            __func__, r);
 -              set_pool_mode(pool, PM_READ_ONLY);
                cell_error(pool, cell);
                break;
        }
@@@ -1136,7 -1139,7 +1142,7 @@@ static void process_shared_bio(struct t
        if (bio_detain(pool, &key, bio, &cell))
                return;
  
-       if (bio_data_dir(bio) == WRITE && bio->bi_size)
+       if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size)
                break_sharing(tc, bio, block, &key, lookup_result, cell);
        else {
                struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@@ -1159,7 -1162,7 +1165,7 @@@ static void provision_block(struct thin
        /*
         * Remap empty bios (flushes) immediately, without provisioning.
         */
-       if (!bio->bi_size) {
+       if (!bio->bi_iter.bi_size) {
                inc_all_io_entry(pool, bio);
                cell_defer_no_holder(tc, cell);
  
                break;
  
        case -ENOSPC:
 -              no_space(pool, cell);
 +              retry_bios_on_resume(pool, cell);
                break;
  
        default:
                DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
                            __func__, r);
 -              set_pool_mode(pool, PM_READ_ONLY);
                cell_error(pool, cell);
                break;
        }
@@@ -1258,8 -1262,8 +1264,8 @@@ static void process_bio_read_only(struc
        r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
        switch (r) {
        case 0:
-               if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
+               if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size)
 -                      bio_io_error(bio);
 +                      handle_unserviceable_bio(tc->pool, bio);
                else {
                        inc_all_io_entry(tc->pool, bio);
                        remap_and_issue(tc, bio, lookup_result.block);
  
        case -ENODATA:
                if (rw != READ) {
 -                      bio_io_error(bio);
 +                      handle_unserviceable_bio(tc->pool, bio);
                        break;
                }
  
@@@ -1392,16 -1396,16 +1398,16 @@@ static enum pool_mode get_pool_mode(str
        return pool->pf.mode;
  }
  
 -static void set_pool_mode(struct pool *pool, enum pool_mode mode)
 +static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
  {
        int r;
 +      enum pool_mode old_mode = pool->pf.mode;
  
 -      pool->pf.mode = mode;
 -
 -      switch (mode) {
 +      switch (new_mode) {
        case PM_FAIL:
 -              DMERR("%s: switching pool to failure mode",
 -                    dm_device_name(pool->pool_md));
 +              if (old_mode != new_mode)
 +                      DMERR("%s: switching pool to failure mode",
 +                            dm_device_name(pool->pool_md));
                dm_pool_metadata_read_only(pool->pmd);
                pool->process_bio = process_bio_fail;
                pool->process_discard = process_bio_fail;
                break;
  
        case PM_READ_ONLY:
 -              DMERR("%s: switching pool to read-only mode",
 -                    dm_device_name(pool->pool_md));
 +              if (old_mode != new_mode)
 +                      DMERR("%s: switching pool to read-only mode",
 +                            dm_device_name(pool->pool_md));
                r = dm_pool_abort_metadata(pool->pmd);
                if (r) {
                        DMERR("%s: aborting transaction failed",
                              dm_device_name(pool->pool_md));
 -                      set_pool_mode(pool, PM_FAIL);
 +                      new_mode = PM_FAIL;
 +                      set_pool_mode(pool, new_mode);
                } else {
                        dm_pool_metadata_read_only(pool->pmd);
                        pool->process_bio = process_bio_read_only;
                break;
  
        case PM_WRITE:
 +              if (old_mode != new_mode)
 +                      DMINFO("%s: switching pool to write mode",
 +                             dm_device_name(pool->pool_md));
                dm_pool_metadata_read_write(pool->pmd);
                pool->process_bio = process_bio;
                pool->process_discard = process_discard;
                pool->process_prepared_discard = process_prepared_discard;
                break;
        }
 +
 +      pool->pf.mode = new_mode;
 +}
 +
 +/*
 + * Rather than calling set_pool_mode directly, use these which describe the
 + * reason for mode degradation.
 + */
 +static void out_of_data_space(struct pool *pool)
 +{
 +      DMERR_LIMIT("%s: no free data space available.",
 +                  dm_device_name(pool->pool_md));
 +      set_pool_mode(pool, PM_READ_ONLY);
 +}
 +
 +static void metadata_operation_failed(struct pool *pool, const char *op, int r)
 +{
 +      dm_block_t free_blocks;
 +
 +      DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
 +                  dm_device_name(pool->pool_md), op, r);
 +
 +      if (r == -ENOSPC &&
 +          !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
 +          !free_blocks)
 +              DMERR_LIMIT("%s: no free metadata space available.",
 +                          dm_device_name(pool->pool_md));
 +
 +      set_pool_mode(pool, PM_READ_ONLY);
  }
  
  /*----------------------------------------------------------------*/
@@@ -1574,9 -1544,9 +1580,9 @@@ static int thin_bio_map(struct dm_targe
                if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
                        /*
                         * This block isn't provisioned, and we have no way
 -                       * of doing so.  Just error it.
 +                       * of doing so.
                         */
 -                      bio_io_error(bio);
 +                      handle_unserviceable_bio(tc->pool, bio);
                        return DM_MAPIO_SUBMITTED;
                }
                /* fall through */
@@@ -1683,17 -1653,6 +1689,17 @@@ static int bind_control_target(struct p
        enum pool_mode old_mode = pool->pf.mode;
        enum pool_mode new_mode = pt->adjusted_pf.mode;
  
 +      /*
 +       * Don't change the pool's mode until set_pool_mode() below.
 +       * Otherwise the pool's process_* function pointers may
 +       * not match the desired pool mode.
 +       */
 +      pt->adjusted_pf.mode = old_mode;
 +
 +      pool->ti = ti;
 +      pool->pf = pt->adjusted_pf;
 +      pool->low_water_blocks = pt->low_water_blocks;
 +
        /*
         * If we were in PM_FAIL mode, rollback of metadata failed.  We're
         * not going to recover without a thin_repair.  So we never let the
        if (old_mode == PM_FAIL)
                new_mode = old_mode;
  
 -      pool->ti = ti;
 -      pool->low_water_blocks = pt->low_water_blocks;
 -      pool->pf = pt->adjusted_pf;
 -
        set_pool_mode(pool, new_mode);
  
        return 0;
@@@ -1725,7 -1688,6 +1731,7 @@@ static void pool_features_init(struct p
        pf->zero_new_blocks = true;
        pf->discard_enabled = true;
        pf->discard_passdown = true;
 +      pf->error_if_no_space = false;
  }
  
  static void __pool_destroy(struct pool *pool)
@@@ -1816,7 -1778,8 +1822,7 @@@ static struct pool *pool_create(struct 
        bio_list_init(&pool->deferred_flush_bios);
        INIT_LIST_HEAD(&pool->prepared_mappings);
        INIT_LIST_HEAD(&pool->prepared_discards);
 -      pool->low_water_triggered = 0;
 -      pool->no_free_space = 0;
 +      pool->low_water_triggered = false;
        bio_list_init(&pool->retry_on_resume_list);
  
        pool->shared_read_ds = dm_deferred_set_create();
@@@ -1941,7 -1904,7 +1947,7 @@@ static int parse_pool_features(struct d
        const char *arg_name;
  
        static struct dm_arg _args[] = {
 -              {0, 3, "Invalid number of pool feature arguments"},
 +              {0, 4, "Invalid number of pool feature arguments"},
        };
  
        /*
                else if (!strcasecmp(arg_name, "read_only"))
                        pf->mode = PM_READ_ONLY;
  
 +              else if (!strcasecmp(arg_name, "error_if_no_space"))
 +                      pf->error_if_no_space = true;
 +
                else {
                        ti->error = "Unrecognised pool feature requested";
                        r = -EINVAL;
@@@ -2043,8 -2003,6 +2049,8 @@@ static dm_block_t calc_metadata_thresho
   *         skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
   *         ignore_discard: disable discard
   *         no_discard_passdown: don't pass discards down to the data device
 + *         read_only: Don't allow any changes to be made to the pool metadata.
 + *         error_if_no_space: error IOs, instead of queueing, if no space.
   */
  static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
  {
@@@ -2240,13 -2198,11 +2246,13 @@@ static int maybe_resize_data_dev(struc
                return -EINVAL;
  
        } else if (data_size > sb_data_size) {
 +              if (sb_data_size)
 +                      DMINFO("%s: growing the data device from %llu to %llu blocks",
 +                             dm_device_name(pool->pool_md),
 +                             sb_data_size, (unsigned long long)data_size);
                r = dm_pool_resize_data_dev(pool->pmd, data_size);
                if (r) {
 -                      DMERR("%s: failed to resize data device",
 -                            dm_device_name(pool->pool_md));
 -                      set_pool_mode(pool, PM_READ_ONLY);
 +                      metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
                        return r;
                }
  
@@@ -2281,12 -2237,10 +2287,12 @@@ static int maybe_resize_metadata_dev(st
                return -EINVAL;
  
        } else if (metadata_dev_size > sb_metadata_dev_size) {
 +              DMINFO("%s: growing the metadata device from %llu to %llu blocks",
 +                     dm_device_name(pool->pool_md),
 +                     sb_metadata_dev_size, metadata_dev_size);
                r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
                if (r) {
 -                      DMERR("%s: failed to resize metadata device",
 -                            dm_device_name(pool->pool_md));
 +                      metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
                        return r;
                }
  
@@@ -2342,7 -2296,8 +2348,7 @@@ static void pool_resume(struct dm_targe
        unsigned long flags;
  
        spin_lock_irqsave(&pool->lock, flags);
 -      pool->low_water_triggered = 0;
 -      pool->no_free_space = 0;
 +      pool->low_water_triggered = false;
        __requeue_bios(pool);
        spin_unlock_irqrestore(&pool->lock, flags);
  
@@@ -2561,8 -2516,7 +2567,8 @@@ static void emit_flags(struct pool_feat
                       unsigned sz, unsigned maxlen)
  {
        unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
 -              !pf->discard_passdown + (pf->mode == PM_READ_ONLY);
 +              !pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
 +              pf->error_if_no_space;
        DMEMIT("%u ", count);
  
        if (!pf->zero_new_blocks)
  
        if (pf->mode == PM_READ_ONLY)
                DMEMIT("read_only ");
 +
 +      if (pf->error_if_no_space)
 +              DMEMIT("error_if_no_space ");
  }
  
  /*
@@@ -2673,16 -2624,11 +2679,16 @@@ static void pool_status(struct dm_targe
                        DMEMIT("rw ");
  
                if (!pool->pf.discard_enabled)
 -                      DMEMIT("ignore_discard");
 +                      DMEMIT("ignore_discard ");
                else if (pool->pf.discard_passdown)
 -                      DMEMIT("discard_passdown");
 +                      DMEMIT("discard_passdown ");
 +              else
 +                      DMEMIT("no_discard_passdown ");
 +
 +              if (pool->pf.error_if_no_space)
 +                      DMEMIT("error_if_no_space ");
                else
 -                      DMEMIT("no_discard_passdown");
 +                      DMEMIT("queue_if_no_space ");
  
                break;
  
@@@ -2781,7 -2727,7 +2787,7 @@@ static struct target_type pool_target 
        .name = "thin-pool",
        .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
                    DM_TARGET_IMMUTABLE,
 -      .version = {1, 9, 0},
 +      .version = {1, 10, 0},
        .module = THIS_MODULE,
        .ctr = pool_ctr,
        .dtr = pool_dtr,
@@@ -2939,7 -2885,7 +2945,7 @@@ out_unlock
  
  static int thin_map(struct dm_target *ti, struct bio *bio)
  {
-       bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
+       bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
  
        return thin_bio_map(ti, bio);
  }
@@@ -2959,7 -2905,7 +2965,7 @@@ static int thin_endio(struct dm_target 
                spin_lock_irqsave(&pool->lock, flags);
                list_for_each_entry_safe(m, tmp, &work, list) {
                        list_del(&m->list);
 -                      m->quiesced = 1;
 +                      m->quiesced = true;
                        __maybe_add_mapping(m);
                }
                spin_unlock_irqrestore(&pool->lock, flags);
                if (!list_empty(&work)) {
                        spin_lock_irqsave(&pool->lock, flags);
                        list_for_each_entry_safe(m, tmp, &work, list)
 -                              list_add(&m->list, &pool->prepared_discards);
 +                              list_add_tail(&m->list, &pool->prepared_discards);
                        spin_unlock_irqrestore(&pool->lock, flags);
                        wake_worker(pool);
                }
@@@ -3068,7 -3014,7 +3074,7 @@@ static int thin_iterate_devices(struct 
  
  static struct target_type thin_target = {
        .name = "thin",
 -      .version = {1, 9, 0},
 +      .version = {1, 10, 0},
        .module = THIS_MODULE,
        .ctr = thin_ctr,
        .dtr = thin_dtr,
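Two themes run through the thin-pool hunks: metadata failures now funnel through metadata_operation_failed(), which logs the failing operation once and drops the pool to read-only, and the new error_if_no_space pool feature chooses whether a bio that cannot be serviced is failed immediately or parked until the pool is resized and resumed. A self-contained sketch of that second decision, with stand-in types (no_space_policy mirrors pool->pf in the hunk above):

#include <linux/bio.h>

struct no_space_policy {
        bool error_if_no_space;
};

/*
 * Default behaviour queues the bio so it can be retried after the admin
 * grows the data device and resumes the pool; error_if_no_space fails it
 * straight away with -EIO instead.
 */
static void sketch_handle_unserviceable(struct no_space_policy *pf,
                                        struct bio *bio,
                                        struct bio_list *retry_on_resume)
{
        if (pf->error_if_no_space)
                bio_io_error(bio);
        else
                bio_list_add(retry_on_resume, bio);
}
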
diff --combined drivers/md/dm.c
index b49c7628424171f0622ed4446e5c4111b00ba418,44a2fa6814ce97cbd05d3e3e34c65672c6dbbebe..8c53b09b9a2c5a3050b22f4fba82af5563f1d59a
@@@ -200,8 -200,8 +200,8 @@@ struct mapped_device 
        /* forced geometry settings */
        struct hd_geometry geometry;
  
 -      /* sysfs handle */
 -      struct kobject kobj;
 +      /* kobject and completion */
 +      struct dm_kobject_holder kobj_holder;
  
        /* zero-length flush that will be cloned and submitted to targets */
        struct bio flush_bio;
@@@ -575,7 -575,7 +575,7 @@@ static void start_io_acct(struct dm_io 
                atomic_inc_return(&md->pending[rw]));
  
        if (unlikely(dm_stats_used(&md->stats)))
-               dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
+               dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
                                    bio_sectors(bio), false, 0, &io->stats_aux);
  }
  
@@@ -593,7 -593,7 +593,7 @@@ static void end_io_acct(struct dm_io *i
        part_stat_unlock();
  
        if (unlikely(dm_stats_used(&md->stats)))
-               dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
+               dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
                                    bio_sectors(bio), true, duration, &io->stats_aux);
  
        /*
@@@ -742,7 -742,7 +742,7 @@@ static void dec_pending(struct dm_io *i
                if (io_error == DM_ENDIO_REQUEUE)
                        return;
  
-               if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
+               if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) {
                        /*
                         * Preflush done for flush with data, reissue
                         * without REQ_FLUSH.
@@@ -797,7 -797,7 +797,7 @@@ static void end_clone_bio(struct bio *c
        struct dm_rq_clone_bio_info *info = clone->bi_private;
        struct dm_rq_target_io *tio = info->tio;
        struct bio *bio = info->orig;
-       unsigned int nr_bytes = info->orig->bi_size;
+       unsigned int nr_bytes = info->orig->bi_iter.bi_size;
  
        bio_put(clone);
  
@@@ -1128,7 -1128,7 +1128,7 @@@ static void __map_bio(struct dm_target_
         * this io.
         */
        atomic_inc(&tio->io->io_count);
-       sector = clone->bi_sector;
+       sector = clone->bi_iter.bi_sector;
        r = ti->type->map(ti, clone);
        if (r == DM_MAPIO_REMAPPED) {
                /* the bio has been remapped so dispatch it */
@@@ -1155,76 -1155,32 +1155,32 @@@ struct clone_info 
        struct dm_io *io;
        sector_t sector;
        sector_t sector_count;
-       unsigned short idx;
  };
  
  static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len)
  {
-       bio->bi_sector = sector;
-       bio->bi_size = to_bytes(len);
- }
- static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count)
- {
-       bio->bi_idx = idx;
-       bio->bi_vcnt = idx + bv_count;
-       bio->bi_flags &= ~(1 << BIO_SEG_VALID);
- }
- static void clone_bio_integrity(struct bio *bio, struct bio *clone,
-                               unsigned short idx, unsigned len, unsigned offset,
-                               unsigned trim)
- {
-       if (!bio_integrity(bio))
-               return;
-       bio_integrity_clone(clone, bio, GFP_NOIO);
-       if (trim)
-               bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len);
- }
- /*
-  * Creates a little bio that just does part of a bvec.
-  */
- static void clone_split_bio(struct dm_target_io *tio, struct bio *bio,
-                           sector_t sector, unsigned short idx,
-                           unsigned offset, unsigned len)
- {
-       struct bio *clone = &tio->clone;
-       struct bio_vec *bv = bio->bi_io_vec + idx;
-       *clone->bi_io_vec = *bv;
-       bio_setup_sector(clone, sector, len);
-       clone->bi_bdev = bio->bi_bdev;
-       clone->bi_rw = bio->bi_rw;
-       clone->bi_vcnt = 1;
-       clone->bi_io_vec->bv_offset = offset;
-       clone->bi_io_vec->bv_len = clone->bi_size;
-       clone->bi_flags |= 1 << BIO_CLONED;
-       clone_bio_integrity(bio, clone, idx, len, offset, 1);
+       bio->bi_iter.bi_sector = sector;
+       bio->bi_iter.bi_size = to_bytes(len);
  }
  
  /*
   * Creates a bio that consists of range of complete bvecs.
   */
  static void clone_bio(struct dm_target_io *tio, struct bio *bio,
-                     sector_t sector, unsigned short idx,
-                     unsigned short bv_count, unsigned len)
+                     sector_t sector, unsigned len)
  {
        struct bio *clone = &tio->clone;
-       unsigned trim = 0;
  
-       __bio_clone(clone, bio);
-       bio_setup_sector(clone, sector, len);
-       bio_setup_bv(clone, idx, bv_count);
+       __bio_clone_fast(clone, bio);
+       if (bio_integrity(bio))
+               bio_integrity_clone(clone, bio, GFP_NOIO);
  
-       if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
-               trim = 1;
-       clone_bio_integrity(bio, clone, idx, len, 0, trim);
+       bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
+       clone->bi_iter.bi_size = to_bytes(len);
+       if (bio_integrity(bio))
+               bio_integrity_trim(clone, 0, len);
  }
  
  static struct dm_target_io *alloc_tio(struct clone_info *ci,
@@@ -1257,7 -1213,7 +1213,7 @@@ static void __clone_and_map_simple_bio(
         * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
         * and discard, so no need for concern about wasted bvec allocations.
         */
-        __bio_clone(clone, ci->bio);
+        __bio_clone_fast(clone, ci->bio);
        if (len)
                bio_setup_sector(clone, ci->sector, len);
  
@@@ -1286,10 -1242,7 +1242,7 @@@ static int __send_empty_flush(struct cl
  }
  
  static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
-                                    sector_t sector, int nr_iovecs,
-                                    unsigned short idx, unsigned short bv_count,
-                                    unsigned offset, unsigned len,
-                                    unsigned split_bvec)
+                                    sector_t sector, unsigned len)
  {
        struct bio *bio = ci->bio;
        struct dm_target_io *tio;
                num_target_bios = ti->num_write_bios(ti, bio);
  
        for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
-               tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr);
-               if (split_bvec)
-                       clone_split_bio(tio, bio, sector, idx, offset, len);
-               else
-                       clone_bio(tio, bio, sector, idx, bv_count, len);
+               tio = alloc_tio(ci, ti, 0, target_bio_nr);
+               clone_bio(tio, bio, sector, len);
                __map_bio(tio);
        }
  }
@@@ -1378,60 -1328,6 +1328,6 @@@ static int __send_write_same(struct clo
        return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
  }
  
- /*
-  * Find maximum number of sectors / bvecs we can process with a single bio.
-  */
- static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx)
- {
-       struct bio *bio = ci->bio;
-       sector_t bv_len, total_len = 0;
-       for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) {
-               bv_len = to_sector(bio->bi_io_vec[*idx].bv_len);
-               if (bv_len > max)
-                       break;
-               max -= bv_len;
-               total_len += bv_len;
-       }
-       return total_len;
- }
- static int __split_bvec_across_targets(struct clone_info *ci,
-                                      struct dm_target *ti, sector_t max)
- {
-       struct bio *bio = ci->bio;
-       struct bio_vec *bv = bio->bi_io_vec + ci->idx;
-       sector_t remaining = to_sector(bv->bv_len);
-       unsigned offset = 0;
-       sector_t len;
-       do {
-               if (offset) {
-                       ti = dm_table_find_target(ci->map, ci->sector);
-                       if (!dm_target_is_valid(ti))
-                               return -EIO;
-                       max = max_io_len(ci->sector, ti);
-               }
-               len = min(remaining, max);
-               __clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0,
-                                        bv->bv_offset + offset, len, 1);
-               ci->sector += len;
-               ci->sector_count -= len;
-               offset += to_bytes(len);
-       } while (remaining -= len);
-       ci->idx++;
-       return 0;
- }
  /*
   * Select the correct strategy for processing a non-flush bio.
   */
@@@ -1439,8 -1335,7 +1335,7 @@@ static int __split_and_process_non_flus
  {
        struct bio *bio = ci->bio;
        struct dm_target *ti;
-       sector_t len, max;
-       int idx;
+       unsigned len;
  
        if (unlikely(bio->bi_rw & REQ_DISCARD))
                return __send_discard(ci);
        if (!dm_target_is_valid(ti))
                return -EIO;
  
-       max = max_io_len(ci->sector, ti);
-       /*
-        * Optimise for the simple case where we can do all of
-        * the remaining io with a single clone.
-        */
-       if (ci->sector_count <= max) {
-               __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
-                                        ci->idx, bio->bi_vcnt - ci->idx, 0,
-                                        ci->sector_count, 0);
-               ci->sector_count = 0;
-               return 0;
-       }
+       len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
  
-       /*
-        * There are some bvecs that don't span targets.
-        * Do as many of these as possible.
-        */
-       if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
-               len = __len_within_target(ci, max, &idx);
+       __clone_and_map_data_bio(ci, ti, ci->sector, len);
  
-               __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
-                                        ci->idx, idx - ci->idx, 0, len, 0);
+       ci->sector += len;
+       ci->sector_count -= len;
  
-               ci->sector += len;
-               ci->sector_count -= len;
-               ci->idx = idx;
-               return 0;
-       }
-       /*
-        * Handle a bvec that must be split between two or more targets.
-        */
-       return __split_bvec_across_targets(ci, ti, max);
+       return 0;
  }
  
  /*
@@@ -1510,8 -1378,7 +1378,7 @@@ static void __split_and_process_bio(str
        ci.io->bio = bio;
        ci.io->md = md;
        spin_lock_init(&ci.io->endio_lock);
-       ci.sector = bio->bi_sector;
-       ci.idx = bio->bi_idx;
+       ci.sector = bio->bi_iter.bi_sector;
  
        start_io_acct(ci.io);
  
@@@ -2041,7 -1908,6 +1908,7 @@@ static struct mapped_device *alloc_dev(
        init_waitqueue_head(&md->wait);
        INIT_WORK(&md->work, dm_wq_work);
        init_waitqueue_head(&md->eventq);
 +      init_completion(&md->kobj_holder.completion);
  
        md->disk->major = _major;
        md->disk->first_minor = minor;
@@@ -2903,14 -2769,20 +2770,14 @@@ struct gendisk *dm_disk(struct mapped_d
  
  struct kobject *dm_kobject(struct mapped_device *md)
  {
 -      return &md->kobj;
 +      return &md->kobj_holder.kobj;
  }
  
 -/*
 - * struct mapped_device should not be exported outside of dm.c
 - * so use this check to verify that kobj is part of md structure
 - */
  struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
  {
        struct mapped_device *md;
  
 -      md = container_of(kobj, struct mapped_device, kobj);
 -      if (&md->kobj != kobj)
 -              return NULL;
 +      md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
  
        if (test_bit(DMF_FREEING, &md->flags) ||
            dm_deleting_md(md))
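
The dm.c hunks above replace the bare kobject embedded in struct mapped_device with a holder that also carries a completion (note the new init_completion(&md->kobj_holder.completion) in alloc_dev), so the release of the kobject can be waited on before the device is torn down. Because dm_kobject() only ever hands out &md->kobj_holder.kobj, dm_get_from_kobject() can rely on container_of() and the old pointer sanity check is dropped. A minimal sketch of that pattern follows; the "owner" structure and helper name are placeholders, not the exact dm.h layout.

    #include <linux/kobject.h>
    #include <linux/completion.h>
    #include <linux/kernel.h>

    /* Sketch only: the shape of the holder used above, not the real dm.h definition. */
    struct dm_kobject_holder {
    	struct kobject kobj;
    	struct completion completion;	/* signalled when the kobject is released */
    };

    /* Hypothetical owner standing in for struct mapped_device. */
    struct owner {
    	struct dm_kobject_holder kobj_holder;
    };

    static struct owner *owner_from_kobj(struct kobject *kobj)
    {
    	/*
    	 * No extra pointer check is needed: the only kobject ever exported
    	 * for the owner is &owner->kobj_holder.kobj.
    	 */
    	return container_of(kobj, struct owner, kobj_holder.kobj);
    }
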
diff --combined drivers/md/md.c
index 40c531359a15af61ad9c3ba70506d1863085dffe,16d84e091e2d199222f3c58203a07f3b696249c9..4ad5cc4e63e8438ca3c32fea1f40f69ec71657fb
@@@ -393,7 -393,7 +393,7 @@@ static void md_submit_flush_data(struc
        struct mddev *mddev = container_of(ws, struct mddev, flush_work);
        struct bio *bio = mddev->flush_bio;
  
-       if (bio->bi_size == 0)
+       if (bio->bi_iter.bi_size == 0)
                /* an empty barrier - all done */
                bio_endio(bio, 0);
        else {
@@@ -754,7 -754,7 +754,7 @@@ void md_super_write(struct mddev *mddev
        struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
  
        bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
-       bio->bi_sector = sector;
+       bio->bi_iter.bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        bio->bi_private = rdev;
        bio->bi_end_io = super_written;
@@@ -782,18 -782,16 +782,16 @@@ int sync_page_io(struct md_rdev *rdev, 
        struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
        int ret;
  
-       rw |= REQ_SYNC;
        bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
                rdev->meta_bdev : rdev->bdev;
        if (metadata_op)
-               bio->bi_sector = sector + rdev->sb_start;
+               bio->bi_iter.bi_sector = sector + rdev->sb_start;
        else if (rdev->mddev->reshape_position != MaxSector &&
                 (rdev->mddev->reshape_backwards ==
                  (sector >= rdev->mddev->reshape_position)))
-               bio->bi_sector = sector + rdev->new_data_offset;
+               bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
        else
-               bio->bi_sector = sector + rdev->data_offset;
+               bio->bi_iter.bi_sector = sector + rdev->data_offset;
        bio_add_page(bio, page, size, 0);
        submit_bio_wait(rw, bio);
  
@@@ -1077,7 -1075,6 +1075,7 @@@ static int super_90_validate(struct mdd
        rdev->raid_disk = -1;
        clear_bit(Faulty, &rdev->flags);
        clear_bit(In_sync, &rdev->flags);
 +      clear_bit(Bitmap_sync, &rdev->flags);
        clear_bit(WriteMostly, &rdev->flags);
  
        if (mddev->raid_disks == 0) {
                 */
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
 +              if (ev1 < mddev->events)
 +                      set_bit(Bitmap_sync, &rdev->flags);
        } else {
                if (ev1 < mddev->events)
                        /* just a hot-add of a new device, leave raid_disk at -1 */
                            desc->raid_disk < mddev->raid_disks */) {
                        set_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = desc->raid_disk;
 +                      rdev->saved_raid_disk = desc->raid_disk;
                } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
                        /* active but not in sync implies recovery up to
                         * reshape position.  We don't know exactly where
@@@ -1567,7 -1561,6 +1565,7 @@@ static int super_1_validate(struct mdde
        rdev->raid_disk = -1;
        clear_bit(Faulty, &rdev->flags);
        clear_bit(In_sync, &rdev->flags);
 +      clear_bit(Bitmap_sync, &rdev->flags);
        clear_bit(WriteMostly, &rdev->flags);
  
        if (mddev->raid_disks == 0) {
                 */
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
 +              if (ev1 < mddev->events)
 +                      set_bit(Bitmap_sync, &rdev->flags);
        } else {
                if (ev1 < mddev->events)
                        /* just a hot-add of a new device, leave raid_disk at -1 */
                        set_bit(Faulty, &rdev->flags);
                        break;
                default:
 +                      rdev->saved_raid_disk = role;
                        if ((le32_to_cpu(sb->feature_map) &
 -                           MD_FEATURE_RECOVERY_OFFSET))
 +                           MD_FEATURE_RECOVERY_OFFSET)) {
                                rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
 -                      else
 +                              if (!(le32_to_cpu(sb->feature_map) &
 +                                    MD_FEATURE_RECOVERY_BITMAP))
 +                                      rdev->saved_raid_disk = -1;
 +                      } else
                                set_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = role;
                        break;
@@@ -1741,9 -1728,6 +1739,9 @@@ static void super_1_sync(struct mddev *
                        cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
                sb->recovery_offset =
                        cpu_to_le64(rdev->recovery_offset);
 +              if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
 +                      sb->feature_map |=
 +                              cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
        }
        if (test_bit(Replacement, &rdev->flags))
                sb->feature_map |=
@@@ -2485,7 -2469,8 +2483,7 @@@ repeat
                if (rdev->sb_loaded != 1)
                        continue; /* no noise on spare devices */
  
 -              if (!test_bit(Faulty, &rdev->flags) &&
 -                  rdev->saved_raid_disk == -1) {
 +              if (!test_bit(Faulty, &rdev->flags)) {
                        md_super_write(mddev,rdev,
                                       rdev->sb_start, rdev->sb_size,
                                       rdev->sb_page);
                                rdev->badblocks.size = 0;
                        }
  
 -              } else if (test_bit(Faulty, &rdev->flags))
 +              } else
                        pr_debug("md: %s (skipping faulty)\n",
                                 bdevname(rdev->bdev, b));
 -              else
 -                      pr_debug("(skipping incremental s/r ");
  
                if (mddev->level == LEVEL_MULTIPATH)
                        /* only need to write one superblock... */
@@@ -2619,8 -2606,6 +2617,8 @@@ state_store(struct md_rdev *rdev, cons
         *  blocked - sets the Blocked flags
         *  -blocked - clears the Blocked and possibly simulates an error
         *  insync - sets Insync providing device isn't active
 +       *  -insync - clear Insync for a device with a slot assigned,
 +       *            so that it gets rebuilt based on bitmap
         *  write_error - sets WriteErrorSeen
         *  -write_error - clears WriteErrorSeen
         */
        } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
                set_bit(In_sync, &rdev->flags);
                err = 0;
 +      } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) {
 +              clear_bit(In_sync, &rdev->flags);
 +              rdev->saved_raid_disk = rdev->raid_disk;
 +              rdev->raid_disk = -1;
 +              err = 0;
        } else if (cmd_match(buf, "write_error")) {
                set_bit(WriteErrorSeen, &rdev->flags);
                err = 0;
@@@ -2806,7 -2786,6 +2804,7 @@@ slot_store(struct md_rdev *rdev, const 
                else
                        rdev->saved_raid_disk = -1;
                clear_bit(In_sync, &rdev->flags);
 +              clear_bit(Bitmap_sync, &rdev->flags);
                err = rdev->mddev->pers->
                        hot_add_disk(rdev->mddev, rdev);
                if (err) {
@@@ -3601,8 -3580,6 +3599,8 @@@ level_store(struct mddev *mddev, const 
        pers->run(mddev);
        set_bit(MD_CHANGE_DEVS, &mddev->flags);
        mddev_resume(mddev);
 +      if (!mddev->thread)
 +              md_update_sb(mddev, 1);
        sysfs_notify(&mddev->kobj, NULL, "level");
        md_new_event(mddev);
        return rv;
@@@ -5781,10 -5758,8 +5779,10 @@@ static int add_new_disk(struct mddev * 
                            info->raid_disk < mddev->raid_disks) {
                                rdev->raid_disk = info->raid_disk;
                                set_bit(In_sync, &rdev->flags);
 +                              clear_bit(Bitmap_sync, &rdev->flags);
                        } else
                                rdev->raid_disk = -1;
 +                      rdev->saved_raid_disk = rdev->raid_disk;
                } else
                        super_types[mddev->major_version].
                                validate_super(mddev, rdev);
                        return -EINVAL;
                }
  
 -              if (test_bit(In_sync, &rdev->flags))
 -                      rdev->saved_raid_disk = rdev->raid_disk;
 -              else
 -                      rdev->saved_raid_disk = -1;
 -
                clear_bit(In_sync, &rdev->flags); /* just to be sure */
                if (info->state & (1<<MD_DISK_WRITEMOSTLY))
                        set_bit(WriteMostly, &rdev->flags);
@@@ -6346,32 -6326,6 +6344,32 @@@ static int md_getgeo(struct block_devic
        return 0;
  }
  
 +static inline bool md_ioctl_valid(unsigned int cmd)
 +{
 +      switch (cmd) {
 +      case ADD_NEW_DISK:
 +      case BLKROSET:
 +      case GET_ARRAY_INFO:
 +      case GET_BITMAP_FILE:
 +      case GET_DISK_INFO:
 +      case HOT_ADD_DISK:
 +      case HOT_REMOVE_DISK:
 +      case PRINT_RAID_DEBUG:
 +      case RAID_AUTORUN:
 +      case RAID_VERSION:
 +      case RESTART_ARRAY_RW:
 +      case RUN_ARRAY:
 +      case SET_ARRAY_INFO:
 +      case SET_BITMAP_FILE:
 +      case SET_DISK_FAULTY:
 +      case STOP_ARRAY:
 +      case STOP_ARRAY_RO:
 +              return true;
 +      default:
 +              return false;
 +      }
 +}
 +
  static int md_ioctl(struct block_device *bdev, fmode_t mode,
                        unsigned int cmd, unsigned long arg)
  {
        struct mddev *mddev = NULL;
        int ro;
  
 +      if (!md_ioctl_valid(cmd))
 +              return -ENOTTY;
 +
        switch (cmd) {
        case RAID_VERSION:
        case GET_ARRAY_INFO:
@@@ -7753,12 -7704,10 +7751,12 @@@ static int remove_and_add_spares(struc
                if (test_bit(Faulty, &rdev->flags))
                        continue;
                if (mddev->ro &&
 -                  rdev->saved_raid_disk < 0)
 +                  ! (rdev->saved_raid_disk >= 0 &&
 +                     !test_bit(Bitmap_sync, &rdev->flags)))
                        continue;
  
 -              rdev->recovery_offset = 0;
 +              if (rdev->saved_raid_disk < 0)
 +                      rdev->recovery_offset = 0;
                if (mddev->pers->
                    hot_add_disk(mddev, rdev) == 0) {
                        if (sysfs_link_rdev(mddev, rdev))
@@@ -7836,12 -7785,9 +7834,12 @@@ void md_check_recovery(struct mddev *md
                         * As we only add devices that are already in-sync,
                         * we can activate the spares immediately.
                         */
 -                      clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
                        remove_and_add_spares(mddev, NULL);
 -                      mddev->pers->spare_active(mddev);
 +                      /* There is no thread, but we need to call
 +                       * ->spare_active and clear saved_raid_disk
 +                       */
 +                      md_reap_sync_thread(mddev);
 +                      clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
                        goto unlock;
                }
  
@@@ -7978,10 -7924,14 +7976,10 @@@ void md_reap_sync_thread(struct mddev *
                mddev->pers->finish_reshape(mddev);
  
        /* If array is no-longer degraded, then any saved_raid_disk
 -       * information must be scrapped.  Also if any device is now
 -       * In_sync we must scrape the saved_raid_disk for that device
 -       * do the superblock for an incrementally recovered device
 -       * written out.
 +       * information must be scrapped.
         */
 -      rdev_for_each(rdev, mddev)
 -              if (!mddev->degraded ||
 -                  test_bit(In_sync, &rdev->flags))
 +      if (!mddev->degraded)
 +              rdev_for_each(rdev, mddev)
                        rdev->saved_raid_disk = -1;
  
        md_update_sb(mddev, 1);
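
Most of the md.c conversion above is mechanical: with the immutable biovec series, a bio's current sector and remaining size live in the embedded iterator bio->bi_iter rather than directly in struct bio (bi_sector becomes bi_iter.bi_sector, bi_size becomes bi_iter.bi_size). A hedged sketch of preparing a one-page write the way md_super_write() does it after this change; the device, page and completion callback are placeholders rather than md code.

    #include <linux/bio.h>
    #include <linux/blkdev.h>

    /* Sketch: queue a single-page write at 'sector' on 'bdev' (placeholders). */
    static void write_one_page(struct block_device *bdev, sector_t sector,
    			   struct page *page, unsigned int len,
    			   bio_end_io_t *done, void *private)
    {
    	struct bio *bio = bio_alloc(GFP_NOIO, 1);

    	bio->bi_bdev = bdev;
    	bio->bi_iter.bi_sector = sector;	/* was bio->bi_sector before 3.14 */
    	bio_add_page(bio, page, len, 0);
    	bio->bi_private = private;
    	bio->bi_end_io = done;

    	submit_bio(WRITE, bio);
    }
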
diff --combined drivers/md/raid1.c
index a49cfcc7a343188a5579350886795ce6fef35c4f,db3b9d7314f1835def74642faaff39af892d1d8c..fd3a2a14b587da5e3bb5046b0017ed7bd46f67a1
@@@ -229,7 -229,7 +229,7 @@@ static void call_bio_endio(struct r1bi
        int done;
        struct r1conf *conf = r1_bio->mddev->private;
        sector_t start_next_window = r1_bio->start_next_window;
-       sector_t bi_sector = bio->bi_sector;
+       sector_t bi_sector = bio->bi_iter.bi_sector;
  
        if (bio->bi_phys_segments) {
                unsigned long flags;
@@@ -265,9 -265,8 +265,8 @@@ static void raid_end_bio_io(struct r1bi
        if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
                pr_debug("raid1: sync end %s on sectors %llu-%llu\n",
                         (bio_data_dir(bio) == WRITE) ? "write" : "read",
-                        (unsigned long long) bio->bi_sector,
-                        (unsigned long long) bio->bi_sector +
-                        bio_sectors(bio) - 1);
+                        (unsigned long long) bio->bi_iter.bi_sector,
+                        (unsigned long long) bio_end_sector(bio) - 1);
  
                call_bio_endio(r1_bio);
        }
@@@ -466,9 -465,8 +465,8 @@@ static void raid1_end_write_request(str
                                struct bio *mbio = r1_bio->master_bio;
                                pr_debug("raid1: behind end write sectors"
                                         " %llu-%llu\n",
-                                        (unsigned long long) mbio->bi_sector,
-                                        (unsigned long long) mbio->bi_sector +
-                                        bio_sectors(mbio) - 1);
+                                        (unsigned long long) mbio->bi_iter.bi_sector,
+                                        (unsigned long long) bio_end_sector(mbio) - 1);
                                call_bio_endio(r1_bio);
                        }
                }
@@@ -875,7 -873,7 +873,7 @@@ static bool need_to_wait_for_sync(struc
                else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
                                >= bio_end_sector(bio)) ||
                         (conf->next_resync + NEXT_NORMALIO_DISTANCE
-                               <= bio->bi_sector))
+                               <= bio->bi_iter.bi_sector))
                        wait = false;
                else
                        wait = true;
@@@ -913,19 -911,20 +911,19 @@@ static sector_t wait_barrier(struct r1c
  
        if (bio && bio_data_dir(bio) == WRITE) {
                if (conf->next_resync + NEXT_NORMALIO_DISTANCE
-                   <= bio->bi_sector) {
+                   <= bio->bi_iter.bi_sector) {
                        if (conf->start_next_window == MaxSector)
                                conf->start_next_window =
                                        conf->next_resync +
                                        NEXT_NORMALIO_DISTANCE;
  
                        if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
-                           <= bio->bi_sector)
+                           <= bio->bi_iter.bi_sector)
                                conf->next_window_requests++;
                        else
                                conf->current_window_requests++;
 -              }
 -              if (bio->bi_iter.bi_sector >= conf->start_next_window)
                        sector = conf->start_next_window;
 +              }
        }
  
        conf->nr_pending++;
@@@ -1027,7 -1026,8 +1025,8 @@@ do_sync_io
                if (bvecs[i].bv_page)
                        put_page(bvecs[i].bv_page);
        kfree(bvecs);
-       pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
+       pr_debug("%dB behind alloc failed, doing sync I/O\n",
+                bio->bi_iter.bi_size);
  }
  
  struct raid1_plug_cb {
@@@ -1107,7 -1107,7 +1106,7 @@@ static void make_request(struct mddev *
  
        if (bio_data_dir(bio) == WRITE &&
            bio_end_sector(bio) > mddev->suspend_lo &&
-           bio->bi_sector < mddev->suspend_hi) {
+           bio->bi_iter.bi_sector < mddev->suspend_hi) {
                /* As the suspend_* range is controlled by
                 * userspace, we want an interruptible
                 * wait.
                        prepare_to_wait(&conf->wait_barrier,
                                        &w, TASK_INTERRUPTIBLE);
                        if (bio_end_sector(bio) <= mddev->suspend_lo ||
-                           bio->bi_sector >= mddev->suspend_hi)
+                           bio->bi_iter.bi_sector >= mddev->suspend_hi)
                                break;
                        schedule();
                }
        r1_bio->sectors = bio_sectors(bio);
        r1_bio->state = 0;
        r1_bio->mddev = mddev;
-       r1_bio->sector = bio->bi_sector;
+       r1_bio->sector = bio->bi_iter.bi_sector;
  
        /* We might need to issue multiple reads to different
         * devices if there are bad blocks around, so we keep
@@@ -1180,12 -1180,13 +1179,13 @@@ read_again
                r1_bio->read_disk = rdisk;
  
                read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-               bio_trim(read_bio, r1_bio->sector - bio->bi_sector,
+               bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
                         max_sectors);
  
                r1_bio->bios[rdisk] = read_bio;
  
-               read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
+               read_bio->bi_iter.bi_sector = r1_bio->sector +
+                       mirror->rdev->data_offset;
                read_bio->bi_bdev = mirror->rdev->bdev;
                read_bio->bi_end_io = raid1_end_read_request;
                read_bio->bi_rw = READ | do_sync;
                         */
  
                        sectors_handled = (r1_bio->sector + max_sectors
-                                          - bio->bi_sector);
+                                          - bio->bi_iter.bi_sector);
                        r1_bio->sectors = max_sectors;
                        spin_lock_irq(&conf->device_lock);
                        if (bio->bi_phys_segments == 0)
                        r1_bio->sectors = bio_sectors(bio) - sectors_handled;
                        r1_bio->state = 0;
                        r1_bio->mddev = mddev;
-                       r1_bio->sector = bio->bi_sector + sectors_handled;
+                       r1_bio->sector = bio->bi_iter.bi_sector +
+                               sectors_handled;
                        goto read_again;
                } else
                        generic_make_request(read_bio);
                        if (r1_bio->bios[j])
                                rdev_dec_pending(conf->mirrors[j].rdev, mddev);
                r1_bio->state = 0;
-               allow_barrier(conf, start_next_window, bio->bi_sector);
+               allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector);
                md_wait_for_blocked_rdev(blocked_rdev, mddev);
                start_next_window = wait_barrier(conf, bio);
                /*
                        bio->bi_phys_segments++;
                spin_unlock_irq(&conf->device_lock);
        }
-       sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector;
+       sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector;
  
        atomic_set(&r1_bio->remaining, 1);
        atomic_set(&r1_bio->behind_remaining, 0);
                        continue;
  
                mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-               bio_trim(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
+               bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors);
  
                if (first_clone) {
                        /* do behind I/O ?
  
                r1_bio->bios[i] = mbio;
  
-               mbio->bi_sector = (r1_bio->sector +
+               mbio->bi_iter.bi_sector = (r1_bio->sector +
                                   conf->mirrors[i].rdev->data_offset);
                mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
                mbio->bi_end_io = raid1_end_write_request;
                r1_bio->sectors = bio_sectors(bio) - sectors_handled;
                r1_bio->state = 0;
                r1_bio->mddev = mddev;
-               r1_bio->sector = bio->bi_sector + sectors_handled;
+               r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
                goto retry_write;
        }
  
@@@ -1958,14 -1960,14 +1959,14 @@@ static int process_checks(struct r1bio 
                /* fixup the bio for reuse */
                bio_reset(b);
                b->bi_vcnt = vcnt;
-               b->bi_size = r1_bio->sectors << 9;
-               b->bi_sector = r1_bio->sector +
+               b->bi_iter.bi_size = r1_bio->sectors << 9;
+               b->bi_iter.bi_sector = r1_bio->sector +
                        conf->mirrors[i].rdev->data_offset;
                b->bi_bdev = conf->mirrors[i].rdev->bdev;
                b->bi_end_io = end_sync_read;
                b->bi_private = r1_bio;
  
-               size = b->bi_size;
+               size = b->bi_iter.bi_size;
                for (j = 0; j < vcnt ; j++) {
                        struct bio_vec *bi;
                        bi = &b->bi_io_vec[j];
@@@ -2220,11 -2222,11 +2221,11 @@@ static int narrow_write_error(struct r1
                }
  
                wbio->bi_rw = WRITE;
-               wbio->bi_sector = r1_bio->sector;
-               wbio->bi_size = r1_bio->sectors << 9;
+               wbio->bi_iter.bi_sector = r1_bio->sector;
+               wbio->bi_iter.bi_size = r1_bio->sectors << 9;
  
                bio_trim(wbio, sector - r1_bio->sector, sectors);
-               wbio->bi_sector += rdev->data_offset;
+               wbio->bi_iter.bi_sector += rdev->data_offset;
                wbio->bi_bdev = rdev->bdev;
                if (submit_bio_wait(WRITE, wbio) == 0)
                        /* failure! */
@@@ -2338,7 -2340,8 +2339,8 @@@ read_more
                }
                r1_bio->read_disk = disk;
                bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
-               bio_trim(bio, r1_bio->sector - bio->bi_sector, max_sectors);
+               bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
+                        max_sectors);
                r1_bio->bios[r1_bio->read_disk] = bio;
                rdev = conf->mirrors[disk].rdev;
                printk_ratelimited(KERN_ERR
                                   mdname(mddev),
                                   (unsigned long long)r1_bio->sector,
                                   bdevname(rdev->bdev, b));
-               bio->bi_sector = r1_bio->sector + rdev->data_offset;
+               bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset;
                bio->bi_bdev = rdev->bdev;
                bio->bi_end_io = raid1_end_read_request;
                bio->bi_rw = READ | do_sync;
                        /* Drat - have to split this up more */
                        struct bio *mbio = r1_bio->master_bio;
                        int sectors_handled = (r1_bio->sector + max_sectors
-                                              - mbio->bi_sector);
+                                              - mbio->bi_iter.bi_sector);
                        r1_bio->sectors = max_sectors;
                        spin_lock_irq(&conf->device_lock);
                        if (mbio->bi_phys_segments == 0)
                        r1_bio->state = 0;
                        set_bit(R1BIO_ReadError, &r1_bio->state);
                        r1_bio->mddev = mddev;
-                       r1_bio->sector = mbio->bi_sector + sectors_handled;
+                       r1_bio->sector = mbio->bi_iter.bi_sector +
+                               sectors_handled;
  
                        goto read_more;
                } else
@@@ -2598,7 -2602,7 +2601,7 @@@ static sector_t sync_request(struct mdd
                }
                if (bio->bi_end_io) {
                        atomic_inc(&rdev->nr_pending);
-                       bio->bi_sector = sector_nr + rdev->data_offset;
+                       bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
                        bio->bi_bdev = rdev->bdev;
                        bio->bi_private = r1_bio;
                }
                                                        continue;
                                                /* remove last page from this bio */
                                                bio->bi_vcnt--;
-                                               bio->bi_size -= len;
+                                               bio->bi_iter.bi_size -= len;
                                                bio->bi_flags &= ~(1<< BIO_SEG_VALID);
                                        }
                                        goto bio_full;
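
raid1.c follows the same renaming, and the open-coded "bi_sector + bio_sectors(bio) - 1" computations in the debug prints are replaced by bio_end_sector(). For reference, a small sketch of expressing a bio's range with the new fields; the suspend_lo/suspend_hi test above uses exactly this shape. The helper name is illustrative only.

    #include <linux/bio.h>
    #include <linux/types.h>

    /*
     * Sketch: does 'bio' overlap the half-open sector range [lo, hi)?
     * bio_end_sector(bio) == bio->bi_iter.bi_sector + bio_sectors(bio).
     */
    static bool bio_overlaps(struct bio *bio, sector_t lo, sector_t hi)
    {
    	return bio_end_sector(bio) > lo && bio->bi_iter.bi_sector < hi;
    }
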
diff --combined drivers/md/raid10.c
index 8d39d63281b9b5441b3ec8e524955356c8690871,6d43d88657aa81982e1f31540878e822ffd44a39..33fc408e5eacef0a1dce55fd5c0d578fc244b663
@@@ -1152,14 -1152,12 +1152,12 @@@ static void raid10_unplug(struct blk_pl
        kfree(plug);
  }
  
- static void make_request(struct mddev *mddev, struct bio * bio)
+ static void __make_request(struct mddev *mddev, struct bio *bio)
  {
        struct r10conf *conf = mddev->private;
        struct r10bio *r10_bio;
        struct bio *read_bio;
        int i;
-       sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
-       int chunk_sects = chunk_mask + 1;
        const int rw = bio_data_dir(bio);
        const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
        const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
        int max_sectors;
        int sectors;
  
-       if (unlikely(bio->bi_rw & REQ_FLUSH)) {
-               md_flush_request(mddev, bio);
-               return;
-       }
-       /* If this request crosses a chunk boundary, we need to
-        * split it.  This will only happen for 1 PAGE (or less) requests.
-        */
-       if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio)
-                    > chunk_sects
-                    && (conf->geo.near_copies < conf->geo.raid_disks
-                        || conf->prev.near_copies < conf->prev.raid_disks))) {
-               struct bio_pair *bp;
-               /* Sanity check -- queue functions should prevent this happening */
-               if (bio_segments(bio) > 1)
-                       goto bad_map;
-               /* This is a one page bio that upper layers
-                * refuse to split for us, so we need to split it.
-                */
-               bp = bio_split(bio,
-                              chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
-               /* Each of these 'make_request' calls will call 'wait_barrier'.
-                * If the first succeeds but the second blocks due to the resync
-                * thread raising the barrier, we will deadlock because the
-                * IO to the underlying device will be queued in generic_make_request
-                * and will never complete, so will never reduce nr_pending.
-                * So increment nr_waiting here so no new raise_barriers will
-                * succeed, and so the second wait_barrier cannot block.
-                */
-               spin_lock_irq(&conf->resync_lock);
-               conf->nr_waiting++;
-               spin_unlock_irq(&conf->resync_lock);
-               make_request(mddev, &bp->bio1);
-               make_request(mddev, &bp->bio2);
-               spin_lock_irq(&conf->resync_lock);
-               conf->nr_waiting--;
-               wake_up(&conf->wait_barrier);
-               spin_unlock_irq(&conf->resync_lock);
-               bio_pair_release(bp);
-               return;
-       bad_map:
-               printk("md/raid10:%s: make_request bug: can't convert block across chunks"
-                      " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
-                      (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
-               bio_io_error(bio);
-               return;
-       }
-       md_write_start(mddev, bio);
-       /*
-        * Register the new request and wait if the reconstruction
-        * thread has put up a bar for new requests.
-        * Continue immediately if no resync is active currently.
-        */
-       wait_barrier(conf);
        sectors = bio_sectors(bio);
        while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
-           bio->bi_sector < conf->reshape_progress &&
-           bio->bi_sector + sectors > conf->reshape_progress) {
+           bio->bi_iter.bi_sector < conf->reshape_progress &&
+           bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
                /* IO spans the reshape position.  Need to wait for
                 * reshape to pass
                 */
                allow_barrier(conf);
                wait_event(conf->wait_barrier,
-                          conf->reshape_progress <= bio->bi_sector ||
-                          conf->reshape_progress >= bio->bi_sector + sectors);
+                          conf->reshape_progress <= bio->bi_iter.bi_sector ||
+                          conf->reshape_progress >= bio->bi_iter.bi_sector +
+                          sectors);
                wait_barrier(conf);
        }
        if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
            bio_data_dir(bio) == WRITE &&
            (mddev->reshape_backwards
-            ? (bio->bi_sector < conf->reshape_safe &&
-               bio->bi_sector + sectors > conf->reshape_progress)
-            : (bio->bi_sector + sectors > conf->reshape_safe &&
-               bio->bi_sector < conf->reshape_progress))) {
+            ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
+               bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
+            : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
+               bio->bi_iter.bi_sector < conf->reshape_progress))) {
                /* Need to update reshape_position in metadata */
                mddev->reshape_position = conf->reshape_progress;
                set_bit(MD_CHANGE_DEVS, &mddev->flags);
        r10_bio->sectors = sectors;
  
        r10_bio->mddev = mddev;
-       r10_bio->sector = bio->bi_sector;
+       r10_bio->sector = bio->bi_iter.bi_sector;
        r10_bio->state = 0;
  
        /* We might need to issue multiple reads to different
@@@ -1302,13 -1239,13 +1239,13 @@@ read_again
                slot = r10_bio->read_slot;
  
                read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-               bio_trim(read_bio, r10_bio->sector - bio->bi_sector,
+               bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
                         max_sectors);
  
                r10_bio->devs[slot].bio = read_bio;
                r10_bio->devs[slot].rdev = rdev;
  
-               read_bio->bi_sector = r10_bio->devs[slot].addr +
+               read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
                        choose_data_offset(r10_bio, rdev);
                read_bio->bi_bdev = rdev->bdev;
                read_bio->bi_end_io = raid10_end_read_request;
                        /* Could not read all from this device, so we will
                         * need another r10_bio.
                         */
 -                      sectors_handled = (r10_bio->sectors + max_sectors
 +                      sectors_handled = (r10_bio->sector + max_sectors
-                                          - bio->bi_sector);
+                                          - bio->bi_iter.bi_sector);
                        r10_bio->sectors = max_sectors;
                        spin_lock_irq(&conf->device_lock);
                        if (bio->bi_phys_segments == 0)
                                bio->bi_phys_segments = 2;
                        else
                                bio->bi_phys_segments++;
 -                      spin_unlock(&conf->device_lock);
 +                      spin_unlock_irq(&conf->device_lock);
                        /* Cannot call generic_make_request directly
                         * as that will be queued in __generic_make_request
                         * and subsequent mempool_alloc might block
                        r10_bio->sectors = bio_sectors(bio) - sectors_handled;
                        r10_bio->state = 0;
                        r10_bio->mddev = mddev;
-                       r10_bio->sector = bio->bi_sector + sectors_handled;
+                       r10_bio->sector = bio->bi_iter.bi_sector +
+                               sectors_handled;
                        goto read_again;
                } else
                        generic_make_request(read_bio);
@@@ -1499,7 -1437,8 +1437,8 @@@ retry_write
                        bio->bi_phys_segments++;
                spin_unlock_irq(&conf->device_lock);
        }
-       sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
+       sectors_handled = r10_bio->sector + max_sectors -
+               bio->bi_iter.bi_sector;
  
        atomic_set(&r10_bio->remaining, 1);
        bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
                if (r10_bio->devs[i].bio) {
                        struct md_rdev *rdev = conf->mirrors[d].rdev;
                        mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-                       bio_trim(mbio, r10_bio->sector - bio->bi_sector,
+                       bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
                                 max_sectors);
                        r10_bio->devs[i].bio = mbio;
  
-                       mbio->bi_sector = (r10_bio->devs[i].addr+
+                       mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
                                           choose_data_offset(r10_bio,
                                                              rdev));
                        mbio->bi_bdev = rdev->bdev;
                                rdev = conf->mirrors[d].rdev;
                        }
                        mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-                       bio_trim(mbio, r10_bio->sector - bio->bi_sector,
+                       bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
                                 max_sectors);
                        r10_bio->devs[i].repl_bio = mbio;
  
-                       mbio->bi_sector = (r10_bio->devs[i].addr +
+                       mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr +
                                           choose_data_offset(
                                                   r10_bio, rdev));
                        mbio->bi_bdev = rdev->bdev;
                r10_bio->sectors = bio_sectors(bio) - sectors_handled;
  
                r10_bio->mddev = mddev;
-               r10_bio->sector = bio->bi_sector + sectors_handled;
+               r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
                r10_bio->state = 0;
                goto retry_write;
        }
        one_write_done(r10_bio);
+ }
+ static void make_request(struct mddev *mddev, struct bio *bio)
+ {
+       struct r10conf *conf = mddev->private;
+       sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
+       int chunk_sects = chunk_mask + 1;
+       struct bio *split;
+       if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+               md_flush_request(mddev, bio);
+               return;
+       }
+       md_write_start(mddev, bio);
+       /*
+        * Register the new request and wait if the reconstruction
+        * thread has put up a bar for new requests.
+        * Continue immediately if no resync is active currently.
+        */
+       wait_barrier(conf);
+       do {
+               /*
+                * If this request crosses a chunk boundary, we need to split
+                * it.
+                */
+               if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
+                            bio_sectors(bio) > chunk_sects
+                            && (conf->geo.near_copies < conf->geo.raid_disks
+                                || conf->prev.near_copies <
+                                conf->prev.raid_disks))) {
+                       split = bio_split(bio, chunk_sects -
+                                         (bio->bi_iter.bi_sector &
+                                          (chunk_sects - 1)),
+                                         GFP_NOIO, fs_bio_set);
+                       bio_chain(split, bio);
+               } else {
+                       split = bio;
+               }
+               __make_request(mddev, split);
+       } while (split != bio);
  
        /* In case raid10d snuck in to freeze_array */
        wake_up(&conf->wait_barrier);
@@@ -2124,10 -2109,10 +2109,10 @@@ static void sync_request_write(struct m
                bio_reset(tbio);
  
                tbio->bi_vcnt = vcnt;
-               tbio->bi_size = r10_bio->sectors << 9;
+               tbio->bi_iter.bi_size = r10_bio->sectors << 9;
                tbio->bi_rw = WRITE;
                tbio->bi_private = r10_bio;
-               tbio->bi_sector = r10_bio->devs[i].addr;
+               tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
  
                for (j=0; j < vcnt ; j++) {
                        tbio->bi_io_vec[j].bv_offset = 0;
                atomic_inc(&r10_bio->remaining);
                md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
  
-               tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
+               tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
                tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
                generic_make_request(tbio);
        }
@@@ -2614,8 -2599,8 +2599,8 @@@ static int narrow_write_error(struct r1
                        sectors = sect_to_write;
                /* Write at 'sector' for 'sectors' */
                wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-               bio_trim(wbio, sector - bio->bi_sector, sectors);
-               wbio->bi_sector = (r10_bio->devs[i].addr+
+               bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
+               wbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
                                   choose_data_offset(r10_bio, rdev) +
                                   (sector - r10_bio->sector));
                wbio->bi_bdev = rdev->bdev;
@@@ -2687,10 -2672,10 +2672,10 @@@ read_more
                (unsigned long long)r10_bio->sector);
        bio = bio_clone_mddev(r10_bio->master_bio,
                              GFP_NOIO, mddev);
-       bio_trim(bio, r10_bio->sector - bio->bi_sector, max_sectors);
+       bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
        r10_bio->devs[slot].bio = bio;
        r10_bio->devs[slot].rdev = rdev;
-       bio->bi_sector = r10_bio->devs[slot].addr
+       bio->bi_iter.bi_sector = r10_bio->devs[slot].addr
                + choose_data_offset(r10_bio, rdev);
        bio->bi_bdev = rdev->bdev;
        bio->bi_rw = READ | do_sync;
                struct bio *mbio = r10_bio->master_bio;
                int sectors_handled =
                        r10_bio->sector + max_sectors
-                       - mbio->bi_sector;
+                       - mbio->bi_iter.bi_sector;
                r10_bio->sectors = max_sectors;
                spin_lock_irq(&conf->device_lock);
                if (mbio->bi_phys_segments == 0)
                set_bit(R10BIO_ReadError,
                        &r10_bio->state);
                r10_bio->mddev = mddev;
-               r10_bio->sector = mbio->bi_sector
+               r10_bio->sector = mbio->bi_iter.bi_sector
                        + sectors_handled;
  
                goto read_more;
@@@ -3157,7 -3142,8 +3142,8 @@@ static sector_t sync_request(struct mdd
                                bio->bi_end_io = end_sync_read;
                                bio->bi_rw = READ;
                                from_addr = r10_bio->devs[j].addr;
-                               bio->bi_sector = from_addr + rdev->data_offset;
+                               bio->bi_iter.bi_sector = from_addr +
+                                       rdev->data_offset;
                                bio->bi_bdev = rdev->bdev;
                                atomic_inc(&rdev->nr_pending);
                                /* and we write to 'i' (if not in_sync) */
                                        bio->bi_private = r10_bio;
                                        bio->bi_end_io = end_sync_write;
                                        bio->bi_rw = WRITE;
-                                       bio->bi_sector = to_addr
+                                       bio->bi_iter.bi_sector = to_addr
                                                + rdev->data_offset;
                                        bio->bi_bdev = rdev->bdev;
                                        atomic_inc(&r10_bio->remaining);
                                bio->bi_private = r10_bio;
                                bio->bi_end_io = end_sync_write;
                                bio->bi_rw = WRITE;
-                               bio->bi_sector = to_addr + rdev->data_offset;
+                               bio->bi_iter.bi_sector = to_addr +
+                                       rdev->data_offset;
                                bio->bi_bdev = rdev->bdev;
                                atomic_inc(&r10_bio->remaining);
                                break;
                        if (j == conf->copies) {
                                /* Cannot recover, so abort the recovery or
                                 * record a bad block */
 -                              put_buf(r10_bio);
 -                              if (rb2)
 -                                      atomic_dec(&rb2->remaining);
 -                              r10_bio = rb2;
                                if (any_working) {
                                        /* problem is that there are bad blocks
                                         * on other device(s)
                                        mirror->recovery_disabled
                                                = mddev->recovery_disabled;
                                }
 +                              put_buf(r10_bio);
 +                              if (rb2)
 +                                      atomic_dec(&rb2->remaining);
 +                              r10_bio = rb2;
                                break;
                        }
                }
                        bio->bi_private = r10_bio;
                        bio->bi_end_io = end_sync_read;
                        bio->bi_rw = READ;
-                       bio->bi_sector = sector +
+                       bio->bi_iter.bi_sector = sector +
                                conf->mirrors[d].rdev->data_offset;
                        bio->bi_bdev = conf->mirrors[d].rdev->bdev;
                        count++;
                        bio->bi_private = r10_bio;
                        bio->bi_end_io = end_sync_write;
                        bio->bi_rw = WRITE;
-                       bio->bi_sector = sector +
+                       bio->bi_iter.bi_sector = sector +
                                conf->mirrors[d].replacement->data_offset;
                        bio->bi_bdev = conf->mirrors[d].replacement->bdev;
                        count++;
                             bio2 = bio2->bi_next) {
                                /* remove last page from this bio */
                                bio2->bi_vcnt--;
-                               bio2->bi_size -= len;
+                               bio2->bi_iter.bi_size -= len;
                                bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
                        }
                        goto bio_full;
@@@ -3747,8 -3734,7 +3734,8 @@@ static int run(struct mddev *mddev
                    !test_bit(In_sync, &disk->rdev->flags)) {
                        disk->head_position = 0;
                        mddev->degraded++;
 -                      if (disk->rdev)
 +                      if (disk->rdev &&
 +                          disk->rdev->saved_raid_disk < 0)
                                conf->fullsync = 1;
                }
                disk->recovery_disabled = mddev->recovery_disabled - 1;
@@@ -4418,7 -4404,7 +4405,7 @@@ read_more
        read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
  
        read_bio->bi_bdev = rdev->bdev;
-       read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
+       read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
                               + rdev->data_offset);
        read_bio->bi_private = r10_bio;
        read_bio->bi_end_io = end_sync_read;
        read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
        read_bio->bi_flags |= 1 << BIO_UPTODATE;
        read_bio->bi_vcnt = 0;
-       read_bio->bi_size = 0;
+       read_bio->bi_iter.bi_size = 0;
        r10_bio->master_bio = read_bio;
        r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
  
  
                bio_reset(b);
                b->bi_bdev = rdev2->bdev;
-               b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
+               b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
+                       rdev2->new_data_offset;
                b->bi_private = r10_bio;
                b->bi_end_io = end_reshape_write;
                b->bi_rw = WRITE;
                             bio2 = bio2->bi_next) {
                                /* Remove last page from this bio */
                                bio2->bi_vcnt--;
-                               bio2->bi_size -= len;
+                               bio2->bi_iter.bi_size -= len;
                                bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
                        }
                        goto bio_full;
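
The rewritten raid10 make_request() above no longer relies on bio_pair_split(); with chained bios it can split at a chunk boundary any number of times, handing each front piece to __make_request() until the remainder fits. A reduced sketch of that split-and-chain loop follows; chunk_sectors (assumed to be a power of two, as raid10's chunk_mask implies) and the handle_one() callback are placeholders.

    #include <linux/bio.h>

    /* Placeholder for the per-piece work (raid10 uses __make_request()). */
    static void handle_one(struct bio *bio);

    /* Sketch: submit 'bio' so that no piece crosses a chunk_sectors boundary. */
    static void split_per_chunk(struct bio *bio, unsigned int chunk_sectors)
    {
    	struct bio *split;

    	do {
    		sector_t offset = bio->bi_iter.bi_sector & (chunk_sectors - 1);

    		if (offset + bio_sectors(bio) > chunk_sectors) {
    			/* Front piece up to the boundary; its completion is
    			 * chained back to the parent bio. */
    			split = bio_split(bio, chunk_sectors - offset,
    					  GFP_NOIO, fs_bio_set);
    			bio_chain(split, bio);
    		} else {
    			split = bio;	/* last (or only) piece */
    		}
    		handle_one(split);
    	} while (split != bio);
    }
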
diff --combined drivers/md/raid5.c
index 03f82ab87d9e73eb4fed4ede052c95fa5d891f09,eea63372e4d30533b2255159c8b428b2ad90acb3..67ca9c3d2939c5e4468d51f0ea0454dfdceac731
@@@ -133,7 -133,7 +133,7 @@@ static inline void unlock_all_device_ha
  static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
  {
        int sectors = bio_sectors(bio);
-       if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
+       if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
                return bio->bi_next;
        else
                return NULL;
@@@ -225,7 -225,7 +225,7 @@@ static void return_io(struct bio *retur
  
                return_bi = bi->bi_next;
                bi->bi_next = NULL;
-               bi->bi_size = 0;
+               bi->bi_iter.bi_size = 0;
                trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
                                         bi, 0);
                bio_endio(bi, 0);
@@@ -675,10 -675,8 +675,10 @@@ get_active_stripe(struct r5conf *conf, 
                                         || !conf->inactive_blocked),
                                        *(conf->hash_locks + hash));
                                conf->inactive_blocked = 0;
 -                      } else
 +                      } else {
                                init_stripe(sh, sector, previous);
 +                              atomic_inc(&sh->count);
 +                      }
                } else {
                        spin_lock(&conf->device_lock);
                        if (atomic_read(&sh->count)) {
                        } else {
                                if (!test_bit(STRIPE_HANDLE, &sh->state))
                                        atomic_inc(&conf->active_stripes);
 -                              BUG_ON(list_empty(&sh->lru));
 +                              BUG_ON(list_empty(&sh->lru) &&
 +                                     !test_bit(STRIPE_EXPANDING, &sh->state));
                                list_del_init(&sh->lru);
                                if (sh->group) {
                                        sh->group->stripes_cnt--;
                                        sh->group = NULL;
                                }
                        }
 +                      atomic_inc(&sh->count);
                        spin_unlock(&conf->device_lock);
                }
        } while (sh == NULL);
  
 -      if (sh)
 -              atomic_inc(&sh->count);
 -
        spin_unlock_irq(conf->hash_locks + hash);
        return sh;
  }
@@@ -852,10 -851,10 +852,10 @@@ static void ops_run_io(struct stripe_he
                                bi->bi_rw, i);
                        atomic_inc(&sh->count);
                        if (use_new_offset(conf, sh))
-                               bi->bi_sector = (sh->sector
+                               bi->bi_iter.bi_sector = (sh->sector
                                                 + rdev->new_data_offset);
                        else
-                               bi->bi_sector = (sh->sector
+                               bi->bi_iter.bi_sector = (sh->sector
                                                 + rdev->data_offset);
                        if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
                                bi->bi_rw |= REQ_NOMERGE;
                        bi->bi_vcnt = 1;
                        bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
                        bi->bi_io_vec[0].bv_offset = 0;
-                       bi->bi_size = STRIPE_SIZE;
+                       bi->bi_iter.bi_size = STRIPE_SIZE;
                        /*
                         * If this is discard request, set bi_vcnt 0. We don't
                         * want to confuse SCSI because SCSI will replace payload
                                rbi->bi_rw, i);
                        atomic_inc(&sh->count);
                        if (use_new_offset(conf, sh))
-                               rbi->bi_sector = (sh->sector
+                               rbi->bi_iter.bi_sector = (sh->sector
                                                  + rrdev->new_data_offset);
                        else
-                               rbi->bi_sector = (sh->sector
+                               rbi->bi_iter.bi_sector = (sh->sector
                                                  + rrdev->data_offset);
                        rbi->bi_vcnt = 1;
                        rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
                        rbi->bi_io_vec[0].bv_offset = 0;
-                       rbi->bi_size = STRIPE_SIZE;
+                       rbi->bi_iter.bi_size = STRIPE_SIZE;
                        /*
                         * If this is discard request, set bi_vcnt 0. We don't
                         * want to confuse SCSI because SCSI will replace payload
@@@ -935,24 -934,24 +935,24 @@@ static struct dma_async_tx_descriptor 
  async_copy_data(int frombio, struct bio *bio, struct page *page,
        sector_t sector, struct dma_async_tx_descriptor *tx)
  {
-       struct bio_vec *bvl;
+       struct bio_vec bvl;
+       struct bvec_iter iter;
        struct page *bio_page;
-       int i;
        int page_offset;
        struct async_submit_ctl submit;
        enum async_tx_flags flags = 0;
  
-       if (bio->bi_sector >= sector)
-               page_offset = (signed)(bio->bi_sector - sector) * 512;
+       if (bio->bi_iter.bi_sector >= sector)
+               page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
        else
-               page_offset = (signed)(sector - bio->bi_sector) * -512;
+               page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
  
        if (frombio)
                flags |= ASYNC_TX_FENCE;
        init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
  
-       bio_for_each_segment(bvl, bio, i) {
-               int len = bvl->bv_len;
+       bio_for_each_segment(bvl, bio, iter) {
+               int len = bvl.bv_len;
                int clen;
                int b_offset = 0;
  
                        clen = len;
  
                if (clen > 0) {
-                       b_offset += bvl->bv_offset;
-                       bio_page = bvl->bv_page;
+                       b_offset += bvl.bv_offset;
+                       bio_page = bvl.bv_page;
                        if (frombio)
                                tx = async_memcpy(page, bio_page, page_offset,
                                                  b_offset, clen, &submit);
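
This hunk shows the user-visible side of the immutable biovec work: bio_for_each_segment() now yields a struct bio_vec by value and is driven by a struct bvec_iter, instead of dereferencing bi_io_vec with an integer index. A minimal sketch of walking a bio's segments with the new iterator; the byte-counting body is just an example, not raid5 code.

    #include <linux/bio.h>

    /* Sketch: total up the payload bytes in a bio with the 3.14 iterator API. */
    static unsigned int count_bio_bytes(struct bio *bio)
    {
    	struct bio_vec bvec;		/* each segment is copied out by value */
    	struct bvec_iter iter;		/* replaces the old integer index */
    	unsigned int bytes = 0;

    	bio_for_each_segment(bvec, bio, iter)
    		bytes += bvec.bv_len;	/* bvec.bv_page / bvec.bv_offset are valid here too */

    	return bytes;
    }
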
@@@ -1012,7 -1011,7 +1012,7 @@@ static void ops_complete_biofill(void *
                        BUG_ON(!dev->read);
                        rbi = dev->read;
                        dev->read = NULL;
-                       while (rbi && rbi->bi_sector <
+                       while (rbi && rbi->bi_iter.bi_sector <
                                dev->sector + STRIPE_SECTORS) {
                                rbi2 = r5_next_bio(rbi, dev->sector);
                                if (!raid5_dec_bi_active_stripes(rbi)) {
@@@ -1048,7 -1047,7 +1048,7 @@@ static void ops_run_biofill(struct stri
                        dev->read = rbi = dev->toread;
                        dev->toread = NULL;
                        spin_unlock_irq(&sh->stripe_lock);
-                       while (rbi && rbi->bi_sector <
+                       while (rbi && rbi->bi_iter.bi_sector <
                                dev->sector + STRIPE_SECTORS) {
                                tx = async_copy_data(0, rbi, dev->page,
                                        dev->sector, tx);
@@@ -1390,7 -1389,7 +1390,7 @@@ ops_run_biodrain(struct stripe_head *sh
                        wbi = dev->written = chosen;
                        spin_unlock_irq(&sh->stripe_lock);
  
-                       while (wbi && wbi->bi_sector <
+                       while (wbi && wbi->bi_iter.bi_sector <
                                dev->sector + STRIPE_SECTORS) {
                                if (wbi->bi_rw & REQ_FUA)
                                        set_bit(R5_WantFUA, &dev->flags);
@@@ -2111,7 -2110,6 +2111,7 @@@ static void raid5_end_write_request(str
                        set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
        } else {
                if (!uptodate) {
 +                      set_bit(STRIPE_DEGRADED, &sh->state);
                        set_bit(WriteErrorSeen, &rdev->flags);
                        set_bit(R5_WriteError, &sh->dev[i].flags);
                        if (!test_and_set_bit(WantReplacement, &rdev->flags))
@@@ -2615,7 -2613,7 +2615,7 @@@ static int add_stripe_bio(struct stripe
        int firstwrite=0;
  
        pr_debug("adding bi b#%llu to stripe s#%llu\n",
-               (unsigned long long)bi->bi_sector,
+               (unsigned long long)bi->bi_iter.bi_sector,
                (unsigned long long)sh->sector);
  
        /*
                        firstwrite = 1;
        } else
                bip = &sh->dev[dd_idx].toread;
-       while (*bip && (*bip)->bi_sector < bi->bi_sector) {
-               if (bio_end_sector(*bip) > bi->bi_sector)
+       while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
+               if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
                        goto overlap;
                bip = & (*bip)->bi_next;
        }
-       if (*bip && (*bip)->bi_sector < bio_end_sector(bi))
+       if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
                goto overlap;
  
        BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
                sector_t sector = sh->dev[dd_idx].sector;
                for (bi=sh->dev[dd_idx].towrite;
                     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
-                            bi && bi->bi_sector <= sector;
+                            bi && bi->bi_iter.bi_sector <= sector;
                     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
                        if (bio_end_sector(bi) >= sector)
                                sector = bio_end_sector(bi);
        }
  
        pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
-               (unsigned long long)(*bip)->bi_sector,
+               (unsigned long long)(*bip)->bi_iter.bi_sector,
                (unsigned long long)sh->sector, dd_idx);
        spin_unlock_irq(&sh->stripe_lock);
  
@@@ -2737,7 -2735,7 +2737,7 @@@ handle_failed_stripe(struct r5conf *con
                if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
                        wake_up(&conf->wait_for_overlap);
  
-               while (bi && bi->bi_sector <
+               while (bi && bi->bi_iter.bi_sector <
                        sh->dev[i].sector + STRIPE_SECTORS) {
                        struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
                        clear_bit(BIO_UPTODATE, &bi->bi_flags);
                bi = sh->dev[i].written;
                sh->dev[i].written = NULL;
                if (bi) bitmap_end = 1;
-               while (bi && bi->bi_sector <
+               while (bi && bi->bi_iter.bi_sector <
                       sh->dev[i].sector + STRIPE_SECTORS) {
                        struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
                        clear_bit(BIO_UPTODATE, &bi->bi_flags);
                        spin_unlock_irq(&sh->stripe_lock);
                        if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
                                wake_up(&conf->wait_for_overlap);
-                       while (bi && bi->bi_sector <
+                       while (bi && bi->bi_iter.bi_sector <
                               sh->dev[i].sector + STRIPE_SECTORS) {
                                struct bio *nextbi =
                                        r5_next_bio(bi, sh->dev[i].sector);
@@@ -3004,7 -3002,7 +3004,7 @@@ static void handle_stripe_clean_event(s
                                        clear_bit(R5_UPTODATE, &dev->flags);
                                wbi = dev->written;
                                dev->written = NULL;
-                               while (wbi && wbi->bi_sector <
+                               while (wbi && wbi->bi_iter.bi_sector <
                                        dev->sector + STRIPE_SECTORS) {
                                        wbi2 = r5_next_bio(wbi, dev->sector);
                                        if (!raid5_dec_bi_active_stripes(wbi)) {
@@@ -3610,7 -3608,7 +3610,7 @@@ static void analyse_stripe(struct strip
                         */
                        set_bit(R5_Insync, &dev->flags);
  
 -              if (rdev && test_bit(R5_WriteError, &dev->flags)) {
 +              if (test_bit(R5_WriteError, &dev->flags)) {
                        /* This flag does not apply to '.replacement'
                         * only to .rdev, so make sure to check that*/
                        struct md_rdev *rdev2 = rcu_dereference(
                        } else
                                clear_bit(R5_WriteError, &dev->flags);
                }
 -              if (rdev && test_bit(R5_MadeGood, &dev->flags)) {
 +              if (test_bit(R5_MadeGood, &dev->flags)) {
                        /* This flag does not apply to '.replacement'
                         * only to .rdev, so make sure to check that*/
                        struct md_rdev *rdev2 = rcu_dereference(
@@@ -4096,7 -4094,7 +4096,7 @@@ static int raid5_mergeable_bvec(struct 
  
  static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
  {
-       sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+       sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
        unsigned int chunk_sectors = mddev->chunk_sectors;
        unsigned int bio_sectors = bio_sectors(bio);
  
@@@ -4233,9 -4231,9 +4233,9 @@@ static int chunk_aligned_read(struct md
        /*
         *      compute position
         */
-       align_bi->bi_sector =  raid5_compute_sector(conf, raid_bio->bi_sector,
-                                                   0,
-                                                   &dd_idx, NULL);
+       align_bi->bi_iter.bi_sector =
+               raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
+                                    0, &dd_idx, NULL);
  
        end_sector = bio_end_sector(align_bi);
        rcu_read_lock();
                align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
  
                if (!bio_fits_rdev(align_bi) ||
-                   is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi),
+                   is_badblock(rdev, align_bi->bi_iter.bi_sector,
+                               bio_sectors(align_bi),
                                &first_bad, &bad_sectors)) {
                        /* too big in some way, or has a known bad block */
                        bio_put(align_bi);
                }
  
                /* No reshape active, so we can trust rdev->data_offset */
-               align_bi->bi_sector += rdev->data_offset;
+               align_bi->bi_iter.bi_sector += rdev->data_offset;
  
                spin_lock_irq(&conf->device_lock);
                wait_event_lock_irq(conf->wait_for_stripe,
                if (mddev->gendisk)
                        trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
                                              align_bi, disk_devt(mddev->gendisk),
-                                             raid_bio->bi_sector);
+                                             raid_bio->bi_iter.bi_sector);
                generic_make_request(align_bi);
                return 1;
        } else {
@@@ -4464,8 -4463,8 +4465,8 @@@ static void make_discard_request(struc
                /* Skip discard while reshape is happening */
                return;
  
-       logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
-       last_sector = bi->bi_sector + (bi->bi_size>>9);
+       logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+       last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
  
        bi->bi_next = NULL;
        bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
@@@ -4569,7 -4568,7 +4570,7 @@@ static void make_request(struct mddev *
                return;
        }
  
-       logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+       logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
        last_sector = bio_end_sector(bi);
        bi->bi_next = NULL;
        bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
@@@ -5053,7 -5052,8 +5054,8 @@@ static int  retry_aligned_read(struct r
        int remaining;
        int handled = 0;
  
-       logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+       logical_sector = raid_bio->bi_iter.bi_sector &
+               ~((sector_t)STRIPE_SECTORS-1);
        sector = raid5_compute_sector(conf, logical_sector,
                                      0, &dd_idx, NULL);
        last_sector = bio_end_sector(raid_bio);
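
  The raid5 hunks above all apply the same immutable-biovec conversion: a bio's current position now lives in bio->bi_iter, so reads of bi_sector and bi_size become bi_iter.bi_sector and bi_iter.bi_size. A minimal sketch of the pattern (illustrative only, not part of the patch; it mirrors what bio_end_sector() computes):

	#include <linux/bio.h>

	/* sketch: end sector of a bio under the 3.14 bi_iter layout */
	static inline sector_t sketch_bio_end_sector(struct bio *bio)
	{
		return bio->bi_iter.bi_sector + (bio->bi_iter.bi_size >> 9);
	}
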
diff --combined drivers/s390/block/xpram.c
index 58141f0651f280b4cc48b510f09455edf8b6cc0a,3e530f9da8c48a42a2c11b1e4c58b04b284d161c..6969d39f1e2eba7de41856cabc0d1557b7f3efe4
@@@ -184,25 -184,26 +184,26 @@@ static unsigned long xpram_highest_page
  static void xpram_make_request(struct request_queue *q, struct bio *bio)
  {
        xpram_device_t *xdev = bio->bi_bdev->bd_disk->private_data;
-       struct bio_vec *bvec;
+       struct bio_vec bvec;
+       struct bvec_iter iter;
        unsigned int index;
        unsigned long page_addr;
        unsigned long bytes;
-       int i;
  
-       if ((bio->bi_sector & 7) != 0 || (bio->bi_size & 4095) != 0)
+       if ((bio->bi_iter.bi_sector & 7) != 0 ||
+           (bio->bi_iter.bi_size & 4095) != 0)
                /* Request is not page-aligned. */
                goto fail;
-       if ((bio->bi_size >> 12) > xdev->size)
+       if ((bio->bi_iter.bi_size >> 12) > xdev->size)
                /* Request size is no page-aligned. */
                goto fail;
-       if ((bio->bi_sector >> 3) > 0xffffffffU - xdev->offset)
+       if ((bio->bi_iter.bi_sector >> 3) > 0xffffffffU - xdev->offset)
                goto fail;
-       index = (bio->bi_sector >> 3) + xdev->offset;
-       bio_for_each_segment(bvec, bio, i) {
+       index = (bio->bi_iter.bi_sector >> 3) + xdev->offset;
+       bio_for_each_segment(bvec, bio, iter) {
                page_addr = (unsigned long)
-                       kmap(bvec->bv_page) + bvec->bv_offset;
-               bytes = bvec->bv_len;
+                       kmap(bvec.bv_page) + bvec.bv_offset;
+               bytes = bvec.bv_len;
                if ((page_addr & 4095) != 0 || (bytes & 4095) != 0)
                        /* More paranoia. */
                        goto fail;
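
  The xpram conversion above also shows the new iteration style: bio_for_each_segment() now takes a struct bvec_iter and yields each struct bio_vec by value, so segment fields are read with '.' rather than '->'. A self-contained sketch of the idiom (the helper name is hypothetical):

	#include <linux/bio.h>
	#include <linux/highmem.h>

	static void sketch_walk_bio(struct bio *bio)
	{
		struct bio_vec bvec;
		struct bvec_iter iter;

		bio_for_each_segment(bvec, bio, iter) {
			void *addr = kmap(bvec.bv_page) + bvec.bv_offset;
			/* process bvec.bv_len bytes starting at addr */
			kunmap(bvec.bv_page);
		}
	}
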
@@@ -257,7 -258,6 +258,7 @@@ static int __init xpram_setup_sizes(uns
        unsigned long mem_needed;
        unsigned long mem_auto;
        unsigned long long size;
 +      char *sizes_end;
        int mem_auto_no;
        int i;
  
        mem_auto_no = 0;
        for (i = 0; i < xpram_devs; i++) {
                if (sizes[i]) {
 -                      size = simple_strtoull(sizes[i], &sizes[i], 0);
 -                      switch (sizes[i][0]) {
 +                      size = simple_strtoull(sizes[i], &sizes_end, 0);
 +                      switch (*sizes_end) {
                        case 'g':
                        case 'G':
                                size <<= 20;
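
  The sizes[] hunk above fixes xpram_setup_sizes() so that simple_strtoull() no longer overwrites the module parameter string; parsing goes through a separate end pointer. Roughly (only the 'g'/'G' suffix is visible in the hunk, so other suffixes are omitted here):

	char *end;
	unsigned long long size = simple_strtoull(sizes[i], &end, 0);

	switch (*end) {
	case 'g':
	case 'G':
		size <<= 20;	/* matches the hunk above; sizes appear to be in KB */
		break;
	}
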
diff --combined drivers/scsi/sd.c
index 9846c6ab2aaa92eeab130a92fe4d7b8d539b624c,5c8a3b696a1dbf3ab18f914225e36f7c4d0a3a31..470954aba7289a758a650cd82b2f1dfe50ae54f1
@@@ -110,7 -110,7 +110,7 @@@ static int sd_suspend_runtime(struct de
  static int sd_resume(struct device *);
  static void sd_rescan(struct device *);
  static int sd_done(struct scsi_cmnd *);
 -static int sd_eh_action(struct scsi_cmnd *, unsigned char *, int, int);
 +static int sd_eh_action(struct scsi_cmnd *, int);
  static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer);
  static void scsi_disk_release(struct device *cdev);
  static void sd_print_sense_hdr(struct scsi_disk *, struct scsi_sense_hdr *);
@@@ -801,7 -801,7 +801,7 @@@ static int sd_setup_write_same_cmnd(str
        if (sdkp->device->no_write_same)
                return BLKPREP_KILL;
  
-       BUG_ON(bio_offset(bio) || bio_iovec(bio)->bv_len != sdp->sector_size);
+       BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size);
  
        sector >>= ilog2(sdp->sector_size) - 9;
        nr_sectors >>= ilog2(sdp->sector_size) - 9;
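
  The sd_setup_write_same_cmnd() change reflects that bio_iovec() now returns the first bio_vec by value rather than a pointer, hence the '.' access in the BUG_ON above. Sketch of the accessor change (variable names are illustrative):

	struct bio_vec first = bio_iovec(bio);

	/* first segment length and offset, read by value */
	unsigned int len = first.bv_len;
	unsigned int off = first.bv_offset;
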
@@@ -1551,23 -1551,23 +1551,23 @@@ static const struct block_device_operat
  /**
   *    sd_eh_action - error handling callback
   *    @scmd:          sd-issued command that has failed
 - *    @eh_cmnd:       The command that was sent during error handling
 - *    @eh_cmnd_len:   Length of eh_cmnd in bytes
   *    @eh_disp:       The recovery disposition suggested by the midlayer
   *
 - *    This function is called by the SCSI midlayer upon completion of
 - *    an error handling command (TEST UNIT READY, START STOP UNIT,
 - *    etc.) The command sent to the device by the error handler is
 - *    stored in eh_cmnd. The result of sending the eh command is
 - *    passed in eh_disp.
 + *    This function is called by the SCSI midlayer upon completion of an
 + *    error test command (currently TEST UNIT READY). The result of sending
 + *    the eh command is passed in eh_disp.  We're looking for devices that
 + *    fail medium access commands but are OK with non access commands like
 + *    test unit ready (so wrongly see the device as having a successful
 + *    recovery)
   **/
 -static int sd_eh_action(struct scsi_cmnd *scmd, unsigned char *eh_cmnd,
 -                      int eh_cmnd_len, int eh_disp)
 +static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp)
  {
        struct scsi_disk *sdkp = scsi_disk(scmd->request->rq_disk);
  
        if (!scsi_device_online(scmd->device) ||
 -          !scsi_medium_access_command(scmd))
 +          !scsi_medium_access_command(scmd) ||
 +          host_byte(scmd->result) != DID_TIME_OUT ||
 +          eh_disp != SUCCESS)
                return eh_disp;
  
        /*
         * process of recovering or has it suffered an internal failure
         * that prevents access to the storage medium.
         */
 -      if (host_byte(scmd->result) == DID_TIME_OUT && eh_disp == SUCCESS &&
 -          eh_cmnd_len && eh_cmnd[0] == TEST_UNIT_READY)
 -              sdkp->medium_access_timed_out++;
 +      sdkp->medium_access_timed_out++;
  
        /*
         * If the device keeps failing read/write commands but TEST UNIT
@@@ -1626,7 -1628,7 +1626,7 @@@ static unsigned int sd_completed_bytes(
                end_lba <<= 1;
        } else {
                /* be careful ... don't want any overflows */
 -              u64 factor = scmd->device->sector_size / 512;
 +              unsigned int factor = scmd->device->sector_size / 512;
                do_div(start_lba, factor);
                do_div(end_lba, factor);
        }
diff --combined drivers/staging/lustre/lustre/llite/lloop.c
index 5338e8d4c50fa998582fb86209f66c95a11419a8,581ff78be1a2a4b6e19d39d0cd36e5075fd997c0..0718905adeb256cb2a2dd12336f3dbb7db365d23
@@@ -194,10 -194,10 +194,10 @@@ static int do_bio_lustrebacked(struct l
        struct cl_object     *obj = ll_i2info(inode)->lli_clob;
        pgoff_t        offset;
        int                ret;
-       int                i;
        int                rw;
        obd_count            page_count = 0;
-       struct bio_vec       *bvec;
+       struct bio_vec       bvec;
+       struct bvec_iter   iter;
        struct bio         *bio;
        ssize_t        bytes;
  
        for (bio = head; bio != NULL; bio = bio->bi_next) {
                LASSERT(rw == bio->bi_rw);
  
-               offset = (pgoff_t)(bio->bi_sector << 9) + lo->lo_offset;
-               bio_for_each_segment(bvec, bio, i) {
-                       BUG_ON(bvec->bv_offset != 0);
-                       BUG_ON(bvec->bv_len != PAGE_CACHE_SIZE);
+               offset = (pgoff_t)(bio->bi_iter.bi_sector << 9) + lo->lo_offset;
+               bio_for_each_segment(bvec, bio, iter) {
+                       BUG_ON(bvec.bv_offset != 0);
+                       BUG_ON(bvec.bv_len != PAGE_CACHE_SIZE);
  
-                       pages[page_count] = bvec->bv_page;
+                       pages[page_count] = bvec.bv_page;
                        offsets[page_count] = offset;
                        page_count++;
-                       offset += bvec->bv_len;
+                       offset += bvec.bv_len;
                }
                LASSERT(page_count <= LLOOP_MAX_SEGMENTS);
        }
@@@ -313,7 -313,8 +313,8 @@@ static unsigned int loop_get_bio(struc
        bio = &lo->lo_bio;
        while (*bio && (*bio)->bi_rw == rw) {
                CDEBUG(D_INFO, "bio sector %llu size %u count %u vcnt%u \n",
-                      (unsigned long long)(*bio)->bi_sector, (*bio)->bi_size,
+                      (unsigned long long)(*bio)->bi_iter.bi_sector,
+                      (*bio)->bi_iter.bi_size,
                       page_count, (*bio)->bi_vcnt);
                if (page_count + (*bio)->bi_vcnt > LLOOP_MAX_SEGMENTS)
                        break;
@@@ -347,7 -348,8 +348,8 @@@ static void loop_make_request(struct re
                goto err;
  
        CDEBUG(D_INFO, "submit bio sector %llu size %u\n",
-              (unsigned long long)old_bio->bi_sector, old_bio->bi_size);
+              (unsigned long long)old_bio->bi_iter.bi_sector,
+              old_bio->bi_iter.bi_size);
  
        spin_lock_irq(&lo->lo_lock);
        inactive = (lo->lo_state != LLOOP_BOUND);
        loop_add_bio(lo, old_bio);
        return;
  err:
-       cfs_bio_io_error(old_bio, old_bio->bi_size);
+       cfs_bio_io_error(old_bio, old_bio->bi_iter.bi_size);
  }
  
  
@@@ -378,7 -380,7 +380,7 @@@ static inline void loop_handle_bio(stru
        while (bio) {
                struct bio *tmp = bio->bi_next;
                bio->bi_next = NULL;
-               cfs_bio_endio(bio, bio->bi_size, ret);
+               cfs_bio_endio(bio, bio->bi_iter.bi_size, ret);
                bio = tmp;
        }
  }
@@@ -856,8 -858,7 +858,8 @@@ static void lloop_exit(void
  module_init(lloop_init);
  module_exit(lloop_exit);
  
 -CFS_MODULE_PARM(max_loop, "i", int, 0444, "maximum of lloop_device");
 +module_param(max_loop, int, 0444);
 +MODULE_PARM_DESC(max_loop, "maximum of lloop_device");
  MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre virtual block device");
  MODULE_LICENSE("GPL");
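
  The last lloop hunk swaps the libcfs CFS_MODULE_PARM wrapper for the standard kernel macros. A minimal sketch of the replacement (the default value is an assumption, not taken from the patch):

	#include <linux/module.h>

	static int max_loop = 8;	/* assumed default */
	module_param(max_loop, int, 0444);
	MODULE_PARM_DESC(max_loop, "maximum of lloop_device");
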
diff --combined fs/btrfs/inode.c
index 514b291b135405dd1fbd21f9a8e4edc1b161f5af,7ab0e94ad49244e6167e2c758383c78c233f0cbb..d546d8c3038baa4451aa2f338a0c24592a3ea48f
@@@ -1577,7 -1577,7 +1577,7 @@@ int btrfs_merge_bio_hook(int rw, struc
                         unsigned long bio_flags)
  {
        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-       u64 logical = (u64)bio->bi_sector << 9;
+       u64 logical = (u64)bio->bi_iter.bi_sector << 9;
        u64 length = 0;
        u64 map_length;
        int ret;
        if (bio_flags & EXTENT_BIO_COMPRESSED)
                return 0;
  
-       length = bio->bi_size;
+       length = bio->bi_iter.bi_size;
        map_length = length;
        ret = btrfs_map_block(root->fs_info, rw, logical,
                              &map_length, NULL, 0);
@@@ -4354,12 -4354,8 +4354,12 @@@ static int btrfs_setsize(struct inode *
         * these flags set.  For all other operations the VFS set these flags
         * explicitly if it wants a timestamp update.
         */
 -      if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
 -              inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
 +      if (newsize != oldsize) {
 +              inode_inc_iversion(inode);
 +              if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
 +                      inode->i_ctime = inode->i_mtime =
 +                              current_fs_time(inode->i_sb);
 +      }
  
        if (newsize > oldsize) {
                truncate_pagecache(inode, newsize);
@@@ -4468,7 -4464,7 +4468,7 @@@ static int btrfs_setattr(struct dentry 
                err = btrfs_dirty_inode(inode);
  
                if (!err && attr->ia_valid & ATTR_MODE)
 -                      err = btrfs_acl_chmod(inode);
 +                      err = posix_acl_chmod(inode, inode->i_mode);
        }
  
        return err;
@@@ -6783,17 -6779,16 +6783,16 @@@ unlock_err
  static void btrfs_endio_direct_read(struct bio *bio, int err)
  {
        struct btrfs_dio_private *dip = bio->bi_private;
-       struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
-       struct bio_vec *bvec = bio->bi_io_vec;
+       struct bio_vec *bvec;
        struct inode *inode = dip->inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct bio *dio_bio;
        u32 *csums = (u32 *)dip->csum;
-       int index = 0;
        u64 start;
+       int i;
  
        start = dip->logical_offset;
-       do {
+       bio_for_each_segment_all(bvec, bio, i) {
                if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
                        struct page *page = bvec->bv_page;
                        char *kaddr;
                        local_irq_restore(flags);
  
                        flush_dcache_page(bvec->bv_page);
-                       if (csum != csums[index]) {
+                       if (csum != csums[i]) {
                                btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
                                          btrfs_ino(inode), start, csum,
-                                         csums[index]);
+                                         csums[i]);
                                err = -EIO;
                        }
                }
  
                start += bvec->bv_len;
-               bvec++;
-               index++;
-       } while (bvec <= bvec_end);
+       }
  
        unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
                      dip->logical_offset + dip->bytes - 1);
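
  The btrfs_endio_direct_read() rewrite above uses bio_for_each_segment_all(), which visits every segment of a completed bio and hands back a bio_vec pointer plus an index, replacing the manual bvec/bvec_end walk. Sketch of the completion-side idiom:

	struct bio_vec *bvec;
	int i;

	bio_for_each_segment_all(bvec, bio, i) {
		struct page *page = bvec->bv_page;
		/* per-page completion work, e.g. checksum verification */
	}
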
@@@ -6901,7 -6894,8 +6898,8 @@@ static void btrfs_end_dio_bio(struct bi
                printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu "
                      "sector %#Lx len %u err no %d\n",
                      btrfs_ino(dip->inode), bio->bi_rw,
-                     (unsigned long long)bio->bi_sector, bio->bi_size, err);
+                     (unsigned long long)bio->bi_iter.bi_sector,
+                     bio->bi_iter.bi_size, err);
                dip->errors = 1;
  
                /*
@@@ -6992,7 -6986,7 +6990,7 @@@ static int btrfs_submit_direct_hook(in
        struct bio *bio;
        struct bio *orig_bio = dip->orig_bio;
        struct bio_vec *bvec = orig_bio->bi_io_vec;
-       u64 start_sector = orig_bio->bi_sector;
+       u64 start_sector = orig_bio->bi_iter.bi_sector;
        u64 file_offset = dip->logical_offset;
        u64 submit_len = 0;
        u64 map_length;
        int ret = 0;
        int async_submit = 0;
  
-       map_length = orig_bio->bi_size;
+       map_length = orig_bio->bi_iter.bi_size;
        ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
                              &map_length, NULL, 0);
        if (ret) {
                return -EIO;
        }
  
-       if (map_length >= orig_bio->bi_size) {
+       if (map_length >= orig_bio->bi_iter.bi_size) {
                bio = orig_bio;
                goto submit;
        }
                        bio->bi_private = dip;
                        bio->bi_end_io = btrfs_end_dio_bio;
  
-                       map_length = orig_bio->bi_size;
+                       map_length = orig_bio->bi_iter.bi_size;
                        ret = btrfs_map_block(root->fs_info, rw,
                                              start_sector << 9,
                                              &map_length, NULL, 0);
@@@ -7118,7 -7112,8 +7116,8 @@@ static void btrfs_submit_direct(int rw
  
        if (!skip_sum && !write) {
                csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
-               sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits;
+               sum_len = dio_bio->bi_iter.bi_size >>
+                       inode->i_sb->s_blocksize_bits;
                sum_len *= csum_size;
        } else {
                sum_len = 0;
        dip->private = dio_bio->bi_private;
        dip->inode = inode;
        dip->logical_offset = file_offset;
-       dip->bytes = dio_bio->bi_size;
-       dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
+       dip->bytes = dio_bio->bi_iter.bi_size;
+       dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
        io_bio->bi_private = dip;
        dip->errors = 0;
        dip->orig_bio = io_bio;
@@@ -8653,14 -8648,12 +8652,14 @@@ static const struct inode_operations bt
        .removexattr    = btrfs_removexattr,
        .permission     = btrfs_permission,
        .get_acl        = btrfs_get_acl,
 +      .set_acl        = btrfs_set_acl,
        .update_time    = btrfs_update_time,
  };
  static const struct inode_operations btrfs_dir_ro_inode_operations = {
        .lookup         = btrfs_lookup,
        .permission     = btrfs_permission,
        .get_acl        = btrfs_get_acl,
 +      .set_acl        = btrfs_set_acl,
        .update_time    = btrfs_update_time,
  };
  
@@@ -8730,7 -8723,6 +8729,7 @@@ static const struct inode_operations bt
        .permission     = btrfs_permission,
        .fiemap         = btrfs_fiemap,
        .get_acl        = btrfs_get_acl,
 +      .set_acl        = btrfs_set_acl,
        .update_time    = btrfs_update_time,
  };
  static const struct inode_operations btrfs_special_inode_operations = {
        .listxattr      = btrfs_listxattr,
        .removexattr    = btrfs_removexattr,
        .get_acl        = btrfs_get_acl,
 +      .set_acl        = btrfs_set_acl,
        .update_time    = btrfs_update_time,
  };
  static const struct inode_operations btrfs_symlink_inode_operations = {
        .getxattr       = btrfs_getxattr,
        .listxattr      = btrfs_listxattr,
        .removexattr    = btrfs_removexattr,
 -      .get_acl        = btrfs_get_acl,
        .update_time    = btrfs_update_time,
  };
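
  The scattered .set_acl additions and the posix_acl_chmod() call in btrfs_setattr() above belong to the switch to the generic POSIX ACL helpers: the filesystem exposes .get_acl/.set_acl and lets the VFS helper rewrite the ACL on chmod. Sketch of the wiring (the structure name is illustrative):

	static const struct inode_operations sketch_dir_iops = {
		.get_acl	= btrfs_get_acl,
		.set_acl	= btrfs_set_acl,
		/* ... */
	};

	/* in ->setattr, after the mode change: */
	if (!err && attr->ia_valid & ATTR_MODE)
		err = posix_acl_chmod(inode, inode->i_mode);
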
  
diff --combined fs/f2fs/data.c
index 0ae558723506e1a8a96f5653444dc11f5a8feb27,a2c8de8ba6ce6d45e9450de5ab8b6c60fb01adea..2261ccdd0b5f04a37be390f1b28c8703fafa86b4
  #include "segment.h"
  #include <trace/events/f2fs.h>
  
-       const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-       struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 +static void f2fs_read_end_io(struct bio *bio, int err)
 +{
-       do {
++      struct bio_vec *bvec;
++      int i;
 +
-               if (--bvec >= bio->bi_io_vec)
-                       prefetchw(&bvec->bv_page->flags);
-               if (unlikely(!uptodate)) {
++      bio_for_each_segment_all(bvec, bio, i) {
 +              struct page *page = bvec->bv_page;
 +
-               } else {
-                       SetPageUptodate(page);
++              if (!err) {
++                      SetPageUptodate(page);
++              } else {
 +                      ClearPageUptodate(page);
 +                      SetPageError(page);
-       } while (bvec >= bio->bi_io_vec);
 +              }
 +              unlock_page(page);
-       const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-       struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-       struct f2fs_sb_info *sbi = F2FS_SB(bvec->bv_page->mapping->host->i_sb);
++      }
 +      bio_put(bio);
 +}
 +
 +static void f2fs_write_end_io(struct bio *bio, int err)
 +{
-       do {
++      struct f2fs_sb_info *sbi = F2FS_SB(bio->bi_io_vec->bv_page->mapping->host->i_sb);
++      struct bio_vec *bvec;
++      int i;
 +
-               if (--bvec >= bio->bi_io_vec)
-                       prefetchw(&bvec->bv_page->flags);
-               if (unlikely(!uptodate)) {
++      bio_for_each_segment_all(bvec, bio, i) {
 +              struct page *page = bvec->bv_page;
 +
-       } while (bvec >= bio->bi_io_vec);
++              if (unlikely(err)) {
 +                      SetPageError(page);
 +                      set_bit(AS_EIO, &page->mapping->flags);
 +                      set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
 +                      sbi->sb->s_flags |= MS_RDONLY;
 +              }
 +              end_page_writeback(page);
 +              dec_page_count(sbi, F2FS_WRITEBACK);
-       bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
++      }
 +
 +      if (bio->bi_private)
 +              complete(bio->bi_private);
 +
 +      if (!get_pages(sbi, F2FS_WRITEBACK) &&
 +                      !list_empty(&sbi->cp_wait.task_list))
 +              wake_up(&sbi->cp_wait);
 +
 +      bio_put(bio);
 +}
 +
 +/*
 + * Low-level block read/write IO operations.
 + */
 +static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
 +                              int npages, bool is_read)
 +{
 +      struct bio *bio;
 +
 +      /* No failure on bio allocation */
 +      bio = bio_alloc(GFP_NOIO, npages);
 +
 +      bio->bi_bdev = sbi->sb->s_bdev;
++      bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
 +      bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
 +
 +      return bio;
 +}
 +
 +static void __submit_merged_bio(struct f2fs_bio_info *io)
 +{
 +      struct f2fs_io_info *fio = &io->fio;
 +      int rw;
 +
 +      if (!io->bio)
 +              return;
 +
 +      rw = fio->rw;
 +
 +      if (is_read_io(rw)) {
 +              trace_f2fs_submit_read_bio(io->sbi->sb, rw,
 +                                              fio->type, io->bio);
 +              submit_bio(rw, io->bio);
 +      } else {
 +              trace_f2fs_submit_write_bio(io->sbi->sb, rw,
 +                                              fio->type, io->bio);
 +              /*
 +               * META_FLUSH is only from the checkpoint procedure, and we
 +               * should wait this metadata bio for FS consistency.
 +               */
 +              if (fio->type == META_FLUSH) {
 +                      DECLARE_COMPLETION_ONSTACK(wait);
 +                      io->bio->bi_private = &wait;
 +                      submit_bio(rw, io->bio);
 +                      wait_for_completion(&wait);
 +              } else {
 +                      submit_bio(rw, io->bio);
 +              }
 +      }
 +
 +      io->bio = NULL;
 +}
 +
 +void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
 +                              enum page_type type, int rw)
 +{
 +      enum page_type btype = PAGE_TYPE_OF_BIO(type);
 +      struct f2fs_bio_info *io;
 +
 +      io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype];
 +
 +      mutex_lock(&io->io_mutex);
 +
 +      /* change META to META_FLUSH in the checkpoint procedure */
 +      if (type >= META_FLUSH) {
 +              io->fio.type = META_FLUSH;
 +              io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
 +      }
 +      __submit_merged_bio(io);
 +      mutex_unlock(&io->io_mutex);
 +}
 +
 +/*
 + * Fill the locked page with data located in the block address.
 + * Return unlocked page.
 + */
 +int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
 +                                      block_t blk_addr, int rw)
 +{
 +      struct bio *bio;
 +
 +      trace_f2fs_submit_page_bio(page, blk_addr, rw);
 +
 +      /* Allocate a new bio */
 +      bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw));
 +
 +      if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
 +              bio_put(bio);
 +              f2fs_put_page(page, 1);
 +              return -EFAULT;
 +      }
 +
 +      submit_bio(rw, bio);
 +      return 0;
 +}
 +
 +void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
 +                      block_t blk_addr, struct f2fs_io_info *fio)
 +{
 +      enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
 +      struct f2fs_bio_info *io;
 +      bool is_read = is_read_io(fio->rw);
 +
 +      io = is_read ? &sbi->read_io : &sbi->write_io[btype];
 +
 +      verify_block_addr(sbi, blk_addr);
 +
 +      mutex_lock(&io->io_mutex);
 +
 +      if (!is_read)
 +              inc_page_count(sbi, F2FS_WRITEBACK);
 +
 +      if (io->bio && (io->last_block_in_bio != blk_addr - 1 ||
 +                                              io->fio.rw != fio->rw))
 +              __submit_merged_bio(io);
 +alloc_new:
 +      if (io->bio == NULL) {
 +              int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
 +
 +              io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
 +              io->fio = *fio;
 +      }
 +
 +      if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) <
 +                                                      PAGE_CACHE_SIZE) {
 +              __submit_merged_bio(io);
 +              goto alloc_new;
 +      }
 +
 +      io->last_block_in_bio = blk_addr;
 +
 +      mutex_unlock(&io->io_mutex);
 +      trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr);
 +}
 +
  /*
   * Lock ordering for the change of data block address:
   * ->data_page
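
  The new f2fs submission helpers above centralize bio construction; note that the starting sector is now written to bio->bi_iter.bi_sector. A hedged sketch of the allocation pattern (bdev, sector, rw and the end_io callback are placeholders, not patch code):

	struct bio *bio = bio_alloc(GFP_NOIO, npages);

	bio->bi_bdev = bdev;
	bio->bi_iter.bi_sector = sector;	/* was bio->bi_sector before 3.14 */
	bio->bi_end_io = my_end_io;		/* hypothetical completion callback */
	submit_bio(rw, bio);
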
@@@ -226,7 -37,7 +219,7 @@@ static void __set_data_blkaddr(struct d
        struct page *node_page = dn->node_page;
        unsigned int ofs_in_node = dn->ofs_in_node;
  
 -      f2fs_wait_on_page_writeback(node_page, NODE, false);
 +      f2fs_wait_on_page_writeback(node_page, NODE);
  
        rn = F2FS_NODE(node_page);
  
@@@ -240,39 -51,19 +233,39 @@@ int reserve_new_block(struct dnode_of_d
  {
        struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
  
 -      if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))
 +      if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
                return -EPERM;
 -      if (!inc_valid_block_count(sbi, dn->inode, 1))
 +      if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
                return -ENOSPC;
  
        trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
  
        __set_data_blkaddr(dn, NEW_ADDR);
        dn->data_blkaddr = NEW_ADDR;
 +      mark_inode_dirty(dn->inode);
        sync_inode_page(dn);
        return 0;
  }
  
 +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
 +{
 +      bool need_put = dn->inode_page ? false : true;
 +      int err;
 +
 +      /* if inode_page exists, index should be zero */
 +      f2fs_bug_on(!need_put && index);
 +
 +      err = get_dnode_of_data(dn, index, ALLOC_NODE);
 +      if (err)
 +              return err;
 +
 +      if (dn->data_blkaddr == NULL_ADDR)
 +              err = reserve_new_block(dn);
 +      if (err || need_put)
 +              f2fs_put_dnode(dn);
 +      return err;
 +}
 +
  static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
                                        struct buffer_head *bh_result)
  {
        pgoff_t start_fofs, end_fofs;
        block_t start_blkaddr;
  
 +      if (is_inode_flag_set(fi, FI_NO_EXTENT))
 +              return 0;
 +
        read_lock(&fi->ext.ext_lock);
        if (fi->ext.len == 0) {
                read_unlock(&fi->ext.ext_lock);
@@@ -321,7 -109,6 +314,7 @@@ void update_extent_cache(block_t blk_ad
        struct f2fs_inode_info *fi = F2FS_I(dn->inode);
        pgoff_t fofs, start_fofs, end_fofs;
        block_t start_blkaddr, end_blkaddr;
 +      int need_update = true;
  
        f2fs_bug_on(blk_addr == NEW_ADDR);
        fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
        /* Update the page address in the parent node */
        __set_data_blkaddr(dn, blk_addr);
  
 +      if (is_inode_flag_set(fi, FI_NO_EXTENT))
 +              return;
 +
        write_lock(&fi->ext.ext_lock);
  
        start_fofs = fi->ext.fofs;
                                        fofs - start_fofs + 1;
                        fi->ext.len -= fofs - start_fofs + 1;
                }
 -              goto end_update;
 +      } else {
 +              need_update = false;
        }
 -      write_unlock(&fi->ext.ext_lock);
 -      return;
  
 +      /* Finally, if the extent is very fragmented, let's drop the cache. */
 +      if (fi->ext.len < F2FS_MIN_EXTENT_LEN) {
 +              fi->ext.len = 0;
 +              set_inode_flag(fi, FI_NO_EXTENT);
 +              need_update = true;
 +      }
  end_update:
        write_unlock(&fi->ext.ext_lock);
 -      sync_inode_page(dn);
 +      if (need_update)
 +              sync_inode_page(dn);
 +      return;
  }
  
  struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
                return ERR_PTR(-ENOENT);
  
        /* By fallocate(), there is no cached page, but with NEW_ADDR */
 -      if (dn.data_blkaddr == NEW_ADDR)
 +      if (unlikely(dn.data_blkaddr == NEW_ADDR))
                return ERR_PTR(-EINVAL);
  
        page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
                return page;
        }
  
 -      err = f2fs_readpage(sbi, page, dn.data_blkaddr,
 +      err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
                                        sync ? READ_SYNC : READA);
 +      if (err)
 +              return ERR_PTR(err);
 +
        if (sync) {
                wait_on_page_locked(page);
 -              if (!PageUptodate(page)) {
 +              if (unlikely(!PageUptodate(page))) {
                        f2fs_put_page(page, 0);
                        return ERR_PTR(-EIO);
                }
@@@ -472,7 -246,7 +465,7 @@@ repeat
        }
        f2fs_put_dnode(&dn);
  
 -      if (dn.data_blkaddr == NULL_ADDR) {
 +      if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
                f2fs_put_page(page, 1);
                return ERR_PTR(-ENOENT);
        }
                return page;
        }
  
 -      err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
 +      err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC);
        if (err)
                return ERR_PTR(err);
  
        lock_page(page);
 -      if (!PageUptodate(page)) {
 +      if (unlikely(!PageUptodate(page))) {
                f2fs_put_page(page, 1);
                return ERR_PTR(-EIO);
        }
 -      if (page->mapping != mapping) {
 +      if (unlikely(page->mapping != mapping)) {
                f2fs_put_page(page, 1);
                goto repeat;
        }
   * Caller ensures that this data page is never allocated.
   * A new zero-filled data page is allocated in the page cache.
   *
 - * Also, caller should grab and release a mutex by calling mutex_lock_op() and
 - * mutex_unlock_op().
 - * Note that, npage is set only by make_empty_dir.
 + * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and
 + * f2fs_unlock_op().
 + * Note that, ipage is set only by make_empty_dir.
   */
  struct page *get_new_data_page(struct inode *inode,
 -              struct page *npage, pgoff_t index, bool new_i_size)
 +              struct page *ipage, pgoff_t index, bool new_i_size)
  {
        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
        struct address_space *mapping = inode->i_mapping;
        struct dnode_of_data dn;
        int err;
  
 -      set_new_dnode(&dn, inode, npage, npage, 0);
 -      err = get_dnode_of_data(&dn, index, ALLOC_NODE);
 +      set_new_dnode(&dn, inode, ipage, NULL, 0);
 +      err = f2fs_reserve_block(&dn, index);
        if (err)
                return ERR_PTR(err);
 -
 -      if (dn.data_blkaddr == NULL_ADDR) {
 -              if (reserve_new_block(&dn)) {
 -                      if (!npage)
 -                              f2fs_put_dnode(&dn);
 -                      return ERR_PTR(-ENOSPC);
 -              }
 -      }
 -      if (!npage)
 -              f2fs_put_dnode(&dn);
  repeat:
        page = grab_cache_page(mapping, index);
 -      if (!page)
 -              return ERR_PTR(-ENOMEM);
 +      if (!page) {
 +              err = -ENOMEM;
 +              goto put_err;
 +      }
  
        if (PageUptodate(page))
                return page;
                zero_user_segment(page, 0, PAGE_CACHE_SIZE);
                SetPageUptodate(page);
        } else {
 -              err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
 +              err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
 +                                                              READ_SYNC);
                if (err)
 -                      return ERR_PTR(err);
 +                      goto put_err;
 +
                lock_page(page);
 -              if (!PageUptodate(page)) {
 +              if (unlikely(!PageUptodate(page))) {
                        f2fs_put_page(page, 1);
 -                      return ERR_PTR(-EIO);
 +                      err = -EIO;
 +                      goto put_err;
                }
 -              if (page->mapping != mapping) {
 +              if (unlikely(page->mapping != mapping)) {
                        f2fs_put_page(page, 1);
                        goto repeat;
                }
                i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
                /* Only the directory inode sets new_i_size */
                set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
 -              mark_inode_dirty_sync(inode);
        }
        return page;
 -}
 -
 -static void read_end_io(struct bio *bio, int err)
 -{
 -      struct bio_vec *bvec;
 -      int i;
 -
 -      bio_for_each_segment_all(bvec, bio, i) {
 -              struct page *page = bvec->bv_page;
  
 -              if (!err) {
 -                      SetPageUptodate(page);
 -              } else {
 -                      ClearPageUptodate(page);
 -                      SetPageError(page);
 -              }
 -              unlock_page(page);
 -      }
 -      bio_put(bio);
 +put_err:
 +      f2fs_put_dnode(&dn);
 +      return ERR_PTR(err);
  }
  
 -/*
 - * Fill the locked page with data located in the block address.
 - * Return unlocked page.
 - */
 -int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
 -                                      block_t blk_addr, int type)
 +static int __allocate_data_block(struct dnode_of_data *dn)
  {
 -      struct block_device *bdev = sbi->sb->s_bdev;
 -      struct bio *bio;
 +      struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
 +      struct f2fs_summary sum;
 +      block_t new_blkaddr;
 +      struct node_info ni;
 +      int type;
  
 -      trace_f2fs_readpage(page, blk_addr, type);
 +      if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
 +              return -EPERM;
 +      if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
 +              return -ENOSPC;
  
 -      down_read(&sbi->bio_sem);
 +      __set_data_blkaddr(dn, NEW_ADDR);
 +      dn->data_blkaddr = NEW_ADDR;
  
 -      /* Allocate a new bio */
 -      bio = f2fs_bio_alloc(bdev, 1);
 +      get_node_info(sbi, dn->nid, &ni);
 +      set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
  
 -      /* Initialize the bio */
 -      bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
 -      bio->bi_end_io = read_end_io;
 +      type = CURSEG_WARM_DATA;
  
 -      if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
 -              bio_put(bio);
 -              up_read(&sbi->bio_sem);
 -              f2fs_put_page(page, 1);
 -              return -EFAULT;
 -      }
 +      allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type);
  
 -      submit_bio(type, bio);
 -      up_read(&sbi->bio_sem);
 +      /* direct IO doesn't use extent cache to maximize the performance */
 +      set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
 +      update_extent_cache(new_blkaddr, dn);
 +      clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
 +
 +      dn->data_blkaddr = new_blkaddr;
        return 0;
  }
  
  /*
 - * This function should be used by the data read flow only where it
 - * does not check the "create" flag that indicates block allocation.
 - * The reason for this special functionality is to exploit VFS readahead
 - * mechanism.
 + * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh.
 + * If original data blocks are allocated, then give them to blockdev.
 + * Otherwise,
 + *     a. preallocate requested block addresses
 + *     b. do not use extent cache for better performance
 + *     c. give the block addresses to blockdev
   */
 -static int get_data_block_ro(struct inode *inode, sector_t iblock,
 +static int get_data_block(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create)
  {
 +      struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
        unsigned int blkbits = inode->i_sb->s_blocksize_bits;
        unsigned maxblocks = bh_result->b_size >> blkbits;
        struct dnode_of_data dn;
 -      pgoff_t pgofs;
 -      int err;
 +      int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
 +      pgoff_t pgofs, end_offset;
 +      int err = 0, ofs = 1;
 +      bool allocated = false;
  
        /* Get the page offset from the block offset(iblock) */
        pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
  
 -      if (check_extent_cache(inode, pgofs, bh_result)) {
 -              trace_f2fs_get_data_block(inode, iblock, bh_result, 0);
 -              return 0;
 -      }
 +      if (check_extent_cache(inode, pgofs, bh_result))
 +              goto out;
 +
 +      if (create)
 +              f2fs_lock_op(sbi);
  
        /* When reading holes, we need its node page */
        set_new_dnode(&dn, inode, NULL, NULL, 0);
 -      err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
 +      err = get_dnode_of_data(&dn, pgofs, mode);
        if (err) {
 -              trace_f2fs_get_data_block(inode, iblock, bh_result, err);
 -              return (err == -ENOENT) ? 0 : err;
 +              if (err == -ENOENT)
 +                      err = 0;
 +              goto unlock_out;
        }
 +      if (dn.data_blkaddr == NEW_ADDR)
 +              goto put_out;
  
 -      /* It does not support data allocation */
 -      f2fs_bug_on(create);
 +      if (dn.data_blkaddr != NULL_ADDR) {
 +              map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
 +      } else if (create) {
 +              err = __allocate_data_block(&dn);
 +              if (err)
 +                      goto put_out;
 +              allocated = true;
 +              map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
 +      } else {
 +              goto put_out;
 +      }
  
 -      if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) {
 -              int i;
 -              unsigned int end_offset;
 +      end_offset = IS_INODE(dn.node_page) ?
 +                      ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
 +      bh_result->b_size = (((size_t)1) << blkbits);
 +      dn.ofs_in_node++;
 +      pgofs++;
 +
 +get_next:
 +      if (dn.ofs_in_node >= end_offset) {
 +              if (allocated)
 +                      sync_inode_page(&dn);
 +              allocated = false;
 +              f2fs_put_dnode(&dn);
  
 -              end_offset = IS_INODE(dn.node_page) ?
 -                              ADDRS_PER_INODE(F2FS_I(inode)) :
 -                              ADDRS_PER_BLOCK;
 +              set_new_dnode(&dn, inode, NULL, NULL, 0);
 +              err = get_dnode_of_data(&dn, pgofs, mode);
 +              if (err) {
 +                      if (err == -ENOENT)
 +                              err = 0;
 +                      goto unlock_out;
 +              }
 +              if (dn.data_blkaddr == NEW_ADDR)
 +                      goto put_out;
  
 -              clear_buffer_new(bh_result);
 +              end_offset = IS_INODE(dn.node_page) ?
 +                      ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
 +      }
  
 +      if (maxblocks > (bh_result->b_size >> blkbits)) {
 +              block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
 +              if (blkaddr == NULL_ADDR && create) {
 +                      err = __allocate_data_block(&dn);
 +                      if (err)
 +                              goto sync_out;
 +                      allocated = true;
 +                      blkaddr = dn.data_blkaddr;
 +              }
                /* Give more consecutive addresses for the read ahead */
 -              for (i = 0; i < end_offset - dn.ofs_in_node; i++)
 -                      if (((datablock_addr(dn.node_page,
 -                                                      dn.ofs_in_node + i))
 -                              != (dn.data_blkaddr + i)) || maxblocks == i)
 -                              break;
 -              map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
 -              bh_result->b_size = (i << blkbits);
 +              if (blkaddr == (bh_result->b_blocknr + ofs)) {
 +                      ofs++;
 +                      dn.ofs_in_node++;
 +                      pgofs++;
 +                      bh_result->b_size += (((size_t)1) << blkbits);
 +                      goto get_next;
 +              }
        }
 +sync_out:
 +      if (allocated)
 +              sync_inode_page(&dn);
 +put_out:
        f2fs_put_dnode(&dn);
 -      trace_f2fs_get_data_block(inode, iblock, bh_result, 0);
 -      return 0;
 +unlock_out:
 +      if (create)
 +              f2fs_unlock_op(sbi);
 +out:
 +      trace_f2fs_get_data_block(inode, iblock, bh_result, err);
 +      return err;
  }
  
  static int f2fs_read_data_page(struct file *file, struct page *page)
  {
 -      return mpage_readpage(page, get_data_block_ro);
 +      struct inode *inode = page->mapping->host;
 +      int ret;
 +
 +      /* If the file has inline data, try to read it directly */
 +      if (f2fs_has_inline_data(inode))
 +              ret = f2fs_read_inline_data(inode, page);
 +      else
 +              ret = mpage_readpage(page, get_data_block);
 +
 +      return ret;
  }
  
  static int f2fs_read_data_pages(struct file *file,
                        struct address_space *mapping,
                        struct list_head *pages, unsigned nr_pages)
  {
 -      return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro);
 +      struct inode *inode = file->f_mapping->host;
 +
 +      /* If the file has inline data, skip readpages */
 +      if (f2fs_has_inline_data(inode))
 +              return 0;
 +
 +      return mpage_readpages(mapping, pages, nr_pages, get_data_block);
  }
  
 -int do_write_data_page(struct page *page)
 +int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
  {
        struct inode *inode = page->mapping->host;
 -      block_t old_blk_addr, new_blk_addr;
 +      block_t old_blkaddr, new_blkaddr;
        struct dnode_of_data dn;
        int err = 0;
  
        if (err)
                return err;
  
 -      old_blk_addr = dn.data_blkaddr;
 +      old_blkaddr = dn.data_blkaddr;
  
        /* This page is already truncated */
 -      if (old_blk_addr == NULL_ADDR)
 +      if (old_blkaddr == NULL_ADDR)
                goto out_writepage;
  
        set_page_writeback(page);
         * If current allocation needs SSR,
         * it had better in-place writes for updated data.
         */
 -      if (unlikely(old_blk_addr != NEW_ADDR &&
 +      if (unlikely(old_blkaddr != NEW_ADDR &&
                        !is_cold_data(page) &&
                        need_inplace_update(inode))) {
 -              rewrite_data_page(F2FS_SB(inode->i_sb), page,
 -                                              old_blk_addr);
 +              rewrite_data_page(page, old_blkaddr, fio);
        } else {
 -              write_data_page(inode, page, &dn,
 -                              old_blk_addr, &new_blk_addr);
 -              update_extent_cache(new_blk_addr, &dn);
 +              write_data_page(page, &dn, &new_blkaddr, fio);
 +              update_extent_cache(new_blkaddr, &dn);
        }
  out_writepage:
        f2fs_put_dnode(&dn);
@@@ -787,13 -518,9 +780,13 @@@ static int f2fs_write_data_page(struct 
        loff_t i_size = i_size_read(inode);
        const pgoff_t end_index = ((unsigned long long) i_size)
                                                        >> PAGE_CACHE_SHIFT;
 -      unsigned offset;
 +      unsigned offset = 0;
        bool need_balance_fs = false;
        int err = 0;
 +      struct f2fs_io_info fio = {
 +              .type = DATA,
 +              .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
 +      };
  
        if (page->index < end_index)
                goto write;
  
        zero_user_segment(page, offset, PAGE_CACHE_SIZE);
  write:
 -      if (sbi->por_doing) {
 +      if (unlikely(sbi->por_doing)) {
                err = AOP_WRITEPAGE_ACTIVATE;
                goto redirty_out;
        }
        if (S_ISDIR(inode->i_mode)) {
                dec_page_count(sbi, F2FS_DIRTY_DENTS);
                inode_dec_dirty_dents(inode);
 -              err = do_write_data_page(page);
 +              err = do_write_data_page(page, &fio);
        } else {
                f2fs_lock_op(sbi);
 -              err = do_write_data_page(page);
 +
 +              if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) {
 +                      err = f2fs_write_inline_data(inode, page, offset);
 +                      f2fs_unlock_op(sbi);
 +                      goto out;
 +              } else {
 +                      err = do_write_data_page(page, &fio);
 +              }
 +
                f2fs_unlock_op(sbi);
                need_balance_fs = true;
        }
        else if (err)
                goto redirty_out;
  
 -      if (wbc->for_reclaim)
 -              f2fs_submit_bio(sbi, DATA, true);
 +      if (wbc->for_reclaim) {
 +              f2fs_submit_merged_bio(sbi, DATA, WRITE);
 +              need_balance_fs = false;
 +      }
  
        clear_cold_data(page);
  out:
@@@ -897,8 -614,7 +890,8 @@@ static int f2fs_write_data_pages(struc
        ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
        if (locked)
                mutex_unlock(&sbi->writepages);
 -      f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL));
 +
 +      f2fs_submit_merged_bio(sbi, DATA, WRITE);
  
        remove_dirty_dir_inode(inode);
  
@@@ -919,28 -635,27 +912,28 @@@ static int f2fs_write_begin(struct fil
  
        f2fs_balance_fs(sbi);
  repeat:
 +      err = f2fs_convert_inline_data(inode, pos + len);
 +      if (err)
 +              return err;
 +
        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
        *pagep = page;
  
 -      f2fs_lock_op(sbi);
 +      if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA)
 +              goto inline_data;
  
 +      f2fs_lock_op(sbi);
        set_new_dnode(&dn, inode, NULL, NULL, 0);
 -      err = get_dnode_of_data(&dn, index, ALLOC_NODE);
 -      if (err)
 -              goto err;
 -
 -      if (dn.data_blkaddr == NULL_ADDR)
 -              err = reserve_new_block(&dn);
 -
 -      f2fs_put_dnode(&dn);
 -      if (err)
 -              goto err;
 -
 +      err = f2fs_reserve_block(&dn, index);
        f2fs_unlock_op(sbi);
  
 +      if (err) {
 +              f2fs_put_page(page, 1);
 +              return err;
 +      }
 +inline_data:
        if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
                return 0;
  
        if (dn.data_blkaddr == NEW_ADDR) {
                zero_user_segment(page, 0, PAGE_CACHE_SIZE);
        } else {
 -              err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
 +              if (f2fs_has_inline_data(inode))
 +                      err = f2fs_read_inline_data(inode, page);
 +              else
 +                      err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
 +                                                      READ_SYNC);
                if (err)
                        return err;
                lock_page(page);
 -              if (!PageUptodate(page)) {
 +              if (unlikely(!PageUptodate(page))) {
                        f2fs_put_page(page, 1);
                        return -EIO;
                }
 -              if (page->mapping != mapping) {
 +              if (unlikely(page->mapping != mapping)) {
                        f2fs_put_page(page, 1);
                        goto repeat;
                }
@@@ -977,6 -688,11 +970,6 @@@ out
        SetPageUptodate(page);
        clear_cold_data(page);
        return 0;
 -
 -err:
 -      f2fs_unlock_op(sbi);
 -      f2fs_put_page(page, 1);
 -      return err;
  }
  
  static int f2fs_write_end(struct file *file,
                update_inode_page(inode);
        }
  
 -      unlock_page(page);
 -      page_cache_release(page);
 +      f2fs_put_page(page, 1);
        return copied;
  }
  
 +static int check_direct_IO(struct inode *inode, int rw,
 +              const struct iovec *iov, loff_t offset, unsigned long nr_segs)
 +{
 +      unsigned blocksize_mask = inode->i_sb->s_blocksize - 1;
 +      int i;
 +
 +      if (rw == READ)
 +              return 0;
 +
 +      if (offset & blocksize_mask)
 +              return -EINVAL;
 +
 +      for (i = 0; i < nr_segs; i++)
 +              if (iov[i].iov_len & blocksize_mask)
 +                      return -EINVAL;
 +      return 0;
 +}
 +
  static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
                const struct iovec *iov, loff_t offset, unsigned long nr_segs)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
  
 -      if (rw == WRITE)
 +      /* Let buffer I/O handle the inline data case. */
 +      if (f2fs_has_inline_data(inode))
 +              return 0;
 +
 +      if (check_direct_IO(inode, rw, iov, offset, nr_segs))
                return 0;
  
 -      /* Needs synchronization with the cleaner */
        return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
 -                                                get_data_block_ro);
 +                                                      get_data_block);
  }
  
  static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
@@@ -1060,8 -756,6 +1053,8 @@@ static int f2fs_set_data_page_dirty(str
        trace_f2fs_set_page_dirty(page, DATA);
  
        SetPageUptodate(page);
 +      mark_inode_dirty(inode);
 +
        if (!PageDirty(page)) {
                __set_page_dirty_nobuffers(page);
                set_dirty_dir_page(inode, page);
  
  static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
  {
 -      return generic_block_bmap(mapping, block, get_data_block_ro);
 +      return generic_block_bmap(mapping, block, get_data_block);
  }
  
  const struct address_space_operations f2fs_dblock_aops = {
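
  get_data_block() above follows the usual get_block contract used by mpage_readpage(s) and blockdev_direct_IO(): map the buffer_head to a device block with map_bh() and report how many bytes were mapped through b_size. A minimal sketch, assuming a hypothetical block-lookup helper:

	#include <linux/buffer_head.h>

	static int sketch_get_block(struct inode *inode, sector_t iblock,
				    struct buffer_head *bh, int create)
	{
		sector_t blk = sketch_lookup_block(inode, iblock);	/* hypothetical */

		if (!blk)
			return 0;				/* hole: leave bh unmapped */

		map_bh(bh, inode->i_sb, blk);
		bh->b_size = inode->i_sb->s_blocksize;		/* one block mapped */
		return 0;
	}
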
diff --combined fs/gfs2/lops.c
index 58f06400b7b8dcece9597b51b05ebf9ebc092396,985da945f0b57cc1b77465cb12bf53b624806955..76693793ceddfe7f936c360a6c3494d1882a849a
@@@ -83,7 -83,6 +83,7 @@@ static void maybe_release_space(struct 
               bd->bd_bh->b_data + bi->bi_offset, bi->bi_len);
        clear_bit(GBF_FULL, &bi->bi_flags);
        rgd->rd_free_clone = rgd->rd_free;
 +      rgd->rd_extfail_pt = rgd->rd_free;
  }
  
  /**
@@@ -273,7 -272,7 +273,7 @@@ static struct bio *gfs2_log_alloc_bio(s
                nrvecs = max(nrvecs/2, 1U);
        }
  
-       bio->bi_sector = blkno * (sb->s_blocksize >> 9);
+       bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9);
        bio->bi_bdev = sb->s_bdev;
        bio->bi_end_io = gfs2_end_log_write;
        bio->bi_private = sdp;
@@@ -589,12 -588,8 +589,12 @@@ static int buf_lo_scan_elements(struct 
  static void gfs2_meta_sync(struct gfs2_glock *gl)
  {
        struct address_space *mapping = gfs2_glock2aspace(gl);
 +      struct gfs2_sbd *sdp = gl->gl_sbd;
        int error;
  
 +      if (mapping == NULL)
 +              mapping = &sdp->sd_aspace;
 +
        filemap_fdatawrite(mapping);
        error = filemap_fdatawait(mapping);
  
diff --combined fs/gfs2/ops_fstype.c
index 1e712b566d76a74435b4d2faa5417956815cec78,16194da91652becfca5019a296f097206358e1d2..c6872d09561a2d53c8e57374eb700f4fb578ae78
@@@ -36,7 -36,6 +36,7 @@@
  #include "log.h"
  #include "quota.h"
  #include "dir.h"
 +#include "meta_io.h"
  #include "trace_gfs2.h"
  
  #define DO 0
@@@ -63,7 -62,6 +63,7 @@@ static void gfs2_tune_init(struct gfs2_
  static struct gfs2_sbd *init_sbd(struct super_block *sb)
  {
        struct gfs2_sbd *sdp;
 +      struct address_space *mapping;
  
        sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL);
        if (!sdp)
        init_waitqueue_head(&sdp->sd_quota_wait);
        INIT_LIST_HEAD(&sdp->sd_trunc_list);
        spin_lock_init(&sdp->sd_trunc_lock);
 +      spin_lock_init(&sdp->sd_bitmap_lock);
 +
 +      mapping = &sdp->sd_aspace;
 +
 +      address_space_init_once(mapping);
 +      mapping->a_ops = &gfs2_meta_aops;
 +      mapping->host = sb->s_bdev->bd_inode;
 +      mapping->flags = 0;
 +      mapping_set_gfp_mask(mapping, GFP_NOFS);
 +      mapping->private_data = NULL;
 +      mapping->backing_dev_info = sb->s_bdi;
 +      mapping->writeback_index = 0;
  
        spin_lock_init(&sdp->sd_log_lock);
        atomic_set(&sdp->sd_log_pinned, 0);
@@@ -231,14 -217,14 +231,14 @@@ static int gfs2_read_super(struct gfs2_
  
        page = alloc_page(GFP_NOFS);
        if (unlikely(!page))
 -              return -ENOBUFS;
 +              return -ENOMEM;
  
        ClearPageUptodate(page);
        ClearPageDirty(page);
        lock_page(page);
  
        bio = bio_alloc(GFP_NOFS, 1);
-       bio->bi_sector = sector * (sb->s_blocksize >> 9);
+       bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9);
        bio->bi_bdev = sb->s_bdev;
        bio_add_page(bio, page, PAGE_SIZE, 0);
  
@@@ -970,6 -956,40 +970,6 @@@ fail
        return error;
  }
  
 -static int init_threads(struct gfs2_sbd *sdp, int undo)
 -{
 -      struct task_struct *p;
 -      int error = 0;
 -
 -      if (undo)
 -              goto fail_quotad;
 -
 -      p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
 -      if (IS_ERR(p)) {
 -              error = PTR_ERR(p);
 -              fs_err(sdp, "can't start logd thread: %d\n", error);
 -              return error;
 -      }
 -      sdp->sd_logd_process = p;
 -
 -      p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
 -      if (IS_ERR(p)) {
 -              error = PTR_ERR(p);
 -              fs_err(sdp, "can't start quotad thread: %d\n", error);
 -              goto fail;
 -      }
 -      sdp->sd_quotad_process = p;
 -
 -      return 0;
 -
 -
 -fail_quotad:
 -      kthread_stop(sdp->sd_quotad_process);
 -fail:
 -      kthread_stop(sdp->sd_logd_process);
 -      return error;
 -}
 -
  static const match_table_t nolock_tokens = {
        { Opt_jid, "jid=%d\n", },
        { Opt_err, NULL },
@@@ -1234,11 -1254,15 +1234,11 @@@ static int fill_super(struct super_bloc
                goto fail_per_node;
        }
  
 -      error = init_threads(sdp, DO);
 -      if (error)
 -              goto fail_per_node;
 -
        if (!(sb->s_flags & MS_RDONLY)) {
                error = gfs2_make_fs_rw(sdp);
                if (error) {
                        fs_err(sdp, "can't make FS RW: %d\n", error);
 -                      goto fail_threads;
 +                      goto fail_per_node;
                }
        }
  
        gfs2_online_uevent(sdp);
        return 0;
  
 -fail_threads:
 -      init_threads(sdp, UNDO);
  fail_per_node:
        init_per_node(sdp, UNDO);
  fail_inodes:
@@@ -1340,18 -1366,8 +1340,18 @@@ static struct dentry *gfs2_mount(struc
        if (IS_ERR(s))
                goto error_bdev;
  
 -      if (s->s_root)
 +      if (s->s_root) {
 +              /*
 +               * s_umount nests inside bd_mutex during
 +               * __invalidate_device().  blkdev_put() acquires
 +               * bd_mutex and can't be called under s_umount.  Drop
 +               * s_umount temporarily.  This is safe as we're
 +               * holding an active reference.
 +               */
 +              up_write(&s->s_umount);
                blkdev_put(bdev, mode);
 +              down_write(&s->s_umount);
 +      }
  
        memset(&args, 0, sizeof(args));
        args.ar_quota = GFS2_QUOTA_DEFAULT;
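
The comment added in gfs2_mount() above documents a lock-ordering constraint: bd_mutex nests outside s_umount, so s_umount is dropped around blkdev_put() while an active reference keeps the superblock alive. A small pthread analogue of that pattern (a userspace sketch only; outer/inner merely stand in for s_umount/bd_mutex, and s_umount is really an rwsem):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;  /* plays s_umount */
static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER;  /* plays bd_mutex */

static void release_device(void)
{
        pthread_mutex_lock(&inner);
        /* ... drop the device reference ... */
        pthread_mutex_unlock(&inner);
}

int main(void)
{
        pthread_mutex_lock(&outer);
        /* inner must never be taken while outer is held, so drop outer,
         * do the release, then retake outer; an elevated reference keeps
         * the shared object alive across the window. */
        pthread_mutex_unlock(&outer);
        release_device();
        pthread_mutex_lock(&outer);
        /* ... continue with outer-protected work ... */
        pthread_mutex_unlock(&outer);
        printf("done\n");
        return 0;
}
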
diff --combined fs/xfs/xfs_aops.c
index a26739451b535cf02a8016c423583f76a26bac72,1b19b9cd692ad8ff4d73259f7353b1e6a86477ad..db2cfb067d0b1ea88f8b64875ceb174d3ae582d2
@@@ -407,7 -407,7 +407,7 @@@ xfs_alloc_ioend_bio
        struct bio              *bio = bio_alloc(GFP_NOIO, nvecs);
  
        ASSERT(bio->bi_private == NULL);
-       bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+       bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
        bio->bi_bdev = bh->b_bdev;
        return bio;
  }
@@@ -1217,7 -1217,7 +1217,7 @@@ __xfs_get_blocks
                lockmode = XFS_ILOCK_EXCL;
                xfs_ilock(ip, lockmode);
        } else {
 -              lockmode = xfs_ilock_map_shared(ip);
 +              lockmode = xfs_ilock_data_map_shared(ip);
        }
  
        ASSERT(offset <= mp->m_super->s_maxbytes);
diff --combined fs/xfs/xfs_buf.c
index 51757113a822abc57334bbc25f0251671fdd3266,2a941ab623cb1b32498e9aadfcddda63084df1f8..9c061ef2b0d973c913a1baaee4a43bc27523b244
@@@ -445,8 -445,8 +445,8 @@@ _xfs_buf_find
        numbytes = BBTOB(numblks);
  
        /* Check for IOs smaller than the sector size / not sector aligned */
 -      ASSERT(!(numbytes < (1 << btp->bt_sshift)));
 -      ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));
 +      ASSERT(!(numbytes < btp->bt_meta_sectorsize));
 +      ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
  
        /*
         * Corrupted block numbers can get through to here, unfortunately, so we
@@@ -1240,7 -1240,7 +1240,7 @@@ next_chunk
  
        bio = bio_alloc(GFP_NOIO, nr_pages);
        bio->bi_bdev = bp->b_target->bt_bdev;
-       bio->bi_sector = sector;
+       bio->bi_iter.bi_sector = sector;
        bio->bi_end_io = xfs_buf_bio_end_io;
        bio->bi_private = bp;
  
                total_nr_pages--;
        }
  
-       if (likely(bio->bi_size)) {
+       if (likely(bio->bi_iter.bi_size)) {
                if (xfs_buf_is_vmapped(bp)) {
                        flush_kernel_vmap_range(bp->b_addr,
                                                xfs_buf_vmap_len(bp));
@@@ -1593,15 -1593,16 +1593,15 @@@ xfs_free_buftarg
        kmem_free(btp);
  }
  
 -STATIC int
 -xfs_setsize_buftarg_flags(
 +int
 +xfs_setsize_buftarg(
        xfs_buftarg_t           *btp,
        unsigned int            blocksize,
 -      unsigned int            sectorsize,
 -      int                     verbose)
 +      unsigned int            sectorsize)
  {
 -      btp->bt_bsize = blocksize;
 -      btp->bt_sshift = ffs(sectorsize) - 1;
 -      btp->bt_smask = sectorsize - 1;
 +      /* Set up metadata sector size info */
 +      btp->bt_meta_sectorsize = sectorsize;
 +      btp->bt_meta_sectormask = sectorsize - 1;
  
        if (set_blocksize(btp->bt_bdev, sectorsize)) {
                char name[BDEVNAME_SIZE];
                return EINVAL;
        }
  
 +      /* Set up device logical sector size mask */
 +      btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
 +      btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
 +
        return 0;
  }
  
  /*
 - *    When allocating the initial buffer target we have not yet
 - *    read in the superblock, so don't know what sized sectors
 - *    are being used at this early stage.  Play safe.
 + * When allocating the initial buffer target we have not yet
 + * read in the superblock, so don't know what sized sectors
 + * are being used at this early stage.  Play safe.
   */
  STATIC int
  xfs_setsize_buftarg_early(
        xfs_buftarg_t           *btp,
        struct block_device     *bdev)
  {
 -      return xfs_setsize_buftarg_flags(btp,
 -                      PAGE_SIZE, bdev_logical_block_size(bdev), 0);
 -}
 -
 -int
 -xfs_setsize_buftarg(
 -      xfs_buftarg_t           *btp,
 -      unsigned int            blocksize,
 -      unsigned int            sectorsize)
 -{
 -      return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
 +      return xfs_setsize_buftarg(btp, PAGE_SIZE,
 +                                 bdev_logical_block_size(bdev));
  }
  
  xfs_buftarg_t *
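
xfs_setsize_buftarg() above drops the old bt_sshift/bt_smask pair in favour of storing the sector size and its mask directly. For a power-of-two sector size the two encodings express the same alignment test; this standalone check (not from the patch) demonstrates the equivalence for 512-byte sectors:

#include <stdio.h>
#include <strings.h>    /* ffs() */

int main(void)
{
        unsigned int sectorsize = 512;
        unsigned int sshift = ffs(sectorsize) - 1;       /* old style: 9      */
        unsigned int smask  = sectorsize - 1;            /* new style: 0x1ff  */
        unsigned long long off = 4608;                   /* 9 sectors, aligned */

        printf("shift test: %d, mask test: %d\n",
               (off >> sshift << sshift) == off,
               (off & smask) == 0);
        return 0;
}
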
diff --combined include/linux/ceph/messenger.h
index 20ee8b63a96848ad1bc63fb29ce97c853502d700,091fdb600d55bbde721ef15d06cbb5494e322ebc..d21f2dba07314c48dce2414c4be23d2191180c81
@@@ -1,6 -1,7 +1,7 @@@
  #ifndef __FS_CEPH_MESSENGER_H
  #define __FS_CEPH_MESSENGER_H
  
+ #include <linux/blk_types.h>
  #include <linux/kref.h>
  #include <linux/mutex.h>
  #include <linux/net.h>
@@@ -60,8 -61,8 +61,8 @@@ struct ceph_messenger 
        u32 global_seq;
        spinlock_t global_seq_lock;
  
 -      u32 supported_features;
 -      u32 required_features;
 +      u64 supported_features;
 +      u64 required_features;
  };
  
  enum ceph_msg_data_type {
@@@ -119,8 -120,7 +120,7 @@@ struct ceph_msg_data_cursor 
  #ifdef CONFIG_BLOCK
                struct {                                /* bio */
                        struct bio      *bio;           /* bio from list */
-                       unsigned int    vector_index;   /* vector from bio */
-                       unsigned int    vector_offset;  /* bytes from vector */
+                       struct bvec_iter bvec_iter;
                };
  #endif /* CONFIG_BLOCK */
                struct {                                /* pages */
@@@ -154,9 -154,10 +154,9 @@@ struct ceph_msg 
        struct list_head list_head;     /* links for connection lists */
  
        struct kref kref;
 -      bool front_is_vmalloc;
        bool more_to_follow;
        bool needs_out_seq;
 -      int front_max;
 +      int front_alloc_len;
        unsigned long ack_stamp;        /* tx: when we were acked */
  
        struct ceph_msgpool *pool;
@@@ -191,7 -192,7 +191,7 @@@ struct ceph_connection 
  
        struct ceph_entity_name peer_name; /* peer name */
  
 -      unsigned peer_features;
 +      u64 peer_features;
        u32 connect_seq;      /* identify the most recent connection
                                 attempt for this connection, client */
        u32 peer_global_seq;  /* peer's global seq for this connection */
@@@ -255,8 -256,8 +255,8 @@@ extern void ceph_msgr_flush(void)
  
  extern void ceph_messenger_init(struct ceph_messenger *msgr,
                        struct ceph_entity_addr *myaddr,
 -                      u32 supported_features,
 -                      u32 required_features,
 +                      u64 supported_features,
 +                      u64 required_features,
                        bool nocrc);
  
  extern void ceph_con_init(struct ceph_connection *con, void *private,
diff --combined include/trace/events/f2fs.h
index 3b9f28dfc8492160940d28e58acf1dc9dc6e5081,bd3ee4fbe7a7fce24a0c8022ea59359dd79892b5..67f38faac589ad52ac5850e5af602799753b8d29
                { META,         "META" },                               \
                { META_FLUSH,   "META_FLUSH" })
  
 -#define show_bio_type(type)                                           \
 -      __print_symbolic(type,                                          \
 -              { READ,         "READ" },                               \
 -              { READA,        "READAHEAD" },                          \
 -              { READ_SYNC,    "READ_SYNC" },                          \
 -              { WRITE,        "WRITE" },                              \
 -              { WRITE_SYNC,   "WRITE_SYNC" },                         \
 -              { WRITE_FLUSH,  "WRITE_FLUSH" },                        \
 -              { WRITE_FUA,    "WRITE_FUA" })
 +#define F2FS_BIO_MASK(t)      (t & (READA | WRITE_FLUSH_FUA))
 +#define F2FS_BIO_EXTRA_MASK(t)        (t & (REQ_META | REQ_PRIO))
 +
 +#define show_bio_type(type)   show_bio_base(type), show_bio_extra(type)
 +
 +#define show_bio_base(type)                                           \
 +      __print_symbolic(F2FS_BIO_MASK(type),                           \
 +              { READ,                 "READ" },                       \
 +              { READA,                "READAHEAD" },                  \
 +              { READ_SYNC,            "READ_SYNC" },                  \
 +              { WRITE,                "WRITE" },                      \
 +              { WRITE_SYNC,           "WRITE_SYNC" },                 \
 +              { WRITE_FLUSH,          "WRITE_FLUSH" },                \
 +              { WRITE_FUA,            "WRITE_FUA" },                  \
 +              { WRITE_FLUSH_FUA,      "WRITE_FLUSH_FUA" })
 +
 +#define show_bio_extra(type)                                          \
 +      __print_symbolic(F2FS_BIO_EXTRA_MASK(type),                     \
 +              { REQ_META,             "(M)" },                        \
 +              { REQ_PRIO,             "(P)" },                        \
 +              { REQ_META | REQ_PRIO,  "(MP)" },                       \
 +              { 0, " \b" })
  
  #define show_data_type(type)                                          \
        __print_symbolic(type,                                          \
@@@ -434,7 -421,7 +434,7 @@@ TRACE_EVENT(f2fs_truncate_partial_nodes
                __entry->err)
  );
  
 -TRACE_EVENT_CONDITION(f2fs_readpage,
 +TRACE_EVENT_CONDITION(f2fs_submit_page_bio,
  
        TP_PROTO(struct page *page, sector_t blkaddr, int type),
  
        ),
  
        TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, "
 -              "blkaddr = 0x%llx, bio_type = %s",
 +              "blkaddr = 0x%llx, bio_type = %s%s",
                show_dev_ino(__entry),
                (unsigned long)__entry->index,
                (unsigned long long)__entry->blkaddr,
@@@ -611,54 -598,36 +611,54 @@@ TRACE_EVENT(f2fs_reserve_new_block
                __entry->ofs_in_node)
  );
  
 -TRACE_EVENT(f2fs_do_submit_bio,
 +DECLARE_EVENT_CLASS(f2fs__submit_bio,
  
 -      TP_PROTO(struct super_block *sb, int btype, bool sync, struct bio *bio),
 +      TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio),
  
 -      TP_ARGS(sb, btype, sync, bio),
 +      TP_ARGS(sb, rw, type, bio),
  
        TP_STRUCT__entry(
                __field(dev_t,  dev)
 -              __field(int,    btype)
 -              __field(bool,   sync)
 +              __field(int,    rw)
 +              __field(int,    type)
                __field(sector_t,       sector)
                __field(unsigned int,   size)
        ),
  
        TP_fast_assign(
                __entry->dev            = sb->s_dev;
 -              __entry->btype          = btype;
 -              __entry->sync           = sync;
 +              __entry->rw             = rw;
 +              __entry->type           = type;
-               __entry->sector         = bio->bi_sector;
-               __entry->size           = bio->bi_size;
+               __entry->sector         = bio->bi_iter.bi_sector;
+               __entry->size           = bio->bi_iter.bi_size;
        ),
  
 -      TP_printk("dev = (%d,%d), type = %s, io = %s, sector = %lld, size = %u",
 +      TP_printk("dev = (%d,%d), %s%s, %s, sector = %lld, size = %u",
                show_dev(__entry),
 -              show_block_type(__entry->btype),
 -              __entry->sync ? "sync" : "no sync",
 +              show_bio_type(__entry->rw),
 +              show_block_type(__entry->type),
                (unsigned long long)__entry->sector,
                __entry->size)
  );
  
 +DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_write_bio,
 +
 +      TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio),
 +
 +      TP_ARGS(sb, rw, type, bio),
 +
 +      TP_CONDITION(bio)
 +);
 +
 +DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_read_bio,
 +
 +      TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio),
 +
 +      TP_ARGS(sb, rw, type, bio),
 +
 +      TP_CONDITION(bio)
 +);
 +
  DECLARE_EVENT_CLASS(f2fs__page,
  
        TP_PROTO(struct page *page, int type),
@@@ -705,16 -674,15 +705,16 @@@ DEFINE_EVENT(f2fs__page, f2fs_vm_page_m
        TP_ARGS(page, type)
  );
  
 -TRACE_EVENT(f2fs_submit_write_page,
 +TRACE_EVENT(f2fs_submit_page_mbio,
  
 -      TP_PROTO(struct page *page, block_t blk_addr, int type),
 +      TP_PROTO(struct page *page, int rw, int type, block_t blk_addr),
  
 -      TP_ARGS(page, blk_addr, type),
 +      TP_ARGS(page, rw, type, blk_addr),
  
        TP_STRUCT__entry(
                __field(dev_t,  dev)
                __field(ino_t,  ino)
 +              __field(int, rw)
                __field(int, type)
                __field(pgoff_t, index)
                __field(block_t, block)
        TP_fast_assign(
                __entry->dev    = page->mapping->host->i_sb->s_dev;
                __entry->ino    = page->mapping->host->i_ino;
 +              __entry->rw     = rw;
                __entry->type   = type;
                __entry->index  = page->index;
                __entry->block  = blk_addr;
        ),
  
 -      TP_printk("dev = (%d,%d), ino = %lu, %s, index = %lu, blkaddr = 0x%llx",
 +      TP_printk("dev = (%d,%d), ino = %lu, %s%s, %s, index = %lu, blkaddr = 0x%llx",
                show_dev_ino(__entry),
 +              show_bio_type(__entry->rw),
                show_block_type(__entry->type),
                (unsigned long)__entry->index,
                (unsigned long long)__entry->block)
@@@ -761,29 -727,6 +761,29 @@@ TRACE_EVENT(f2fs_write_checkpoint
                __entry->msg)
  );
  
 +TRACE_EVENT(f2fs_issue_discard,
 +
 +      TP_PROTO(struct super_block *sb, block_t blkstart, block_t blklen),
 +
 +      TP_ARGS(sb, blkstart, blklen),
 +
 +      TP_STRUCT__entry(
 +              __field(dev_t,  dev)
 +              __field(block_t, blkstart)
 +              __field(block_t, blklen)
 +      ),
 +
 +      TP_fast_assign(
 +              __entry->dev    = sb->s_dev;
 +              __entry->blkstart = blkstart;
 +              __entry->blklen = blklen;
 +      ),
 +
 +      TP_printk("dev = (%d,%d), blkstart = 0x%llx, blklen = 0x%llx",
 +              show_dev(__entry),
 +              (unsigned long long)__entry->blkstart,
 +              (unsigned long long)__entry->blklen)
 +);
  #endif /* _TRACE_F2FS_H */
  
   /* This part must be outside protection */
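
The reworked show_bio_type() above splits the request flags into a base name plus an (M)/(P) suffix via two masks. A toy decoder along the same lines, producing output such as "WRITE_SYNC(MP)" (the flag values below are illustrative only; the real READ/WRITE/REQ_* bits are defined in include/linux/blk_types.h and fs.h):

#include <stdio.h>

#define T_WRITE  0x01   /* illustrative values, not the kernel's bits */
#define T_SYNC   0x02
#define T_META   0x10
#define T_PRIO   0x20

static const char *base_name(unsigned int t)
{
        if ((t & (T_WRITE | T_SYNC)) == (T_WRITE | T_SYNC))
                return "WRITE_SYNC";
        return (t & T_WRITE) ? "WRITE" : "READ";
}

static const char *extra_name(unsigned int t)
{
        switch (t & (T_META | T_PRIO)) {
        case T_META | T_PRIO:   return "(MP)";
        case T_META:            return "(M)";
        case T_PRIO:            return "(P)";
        default:                return "";
        }
}

int main(void)
{
        unsigned int rw = T_WRITE | T_SYNC | T_META | T_PRIO;

        printf("bio_type = %s%s\n", base_name(rw), extra_name(rw));
        return 0;
}
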
diff --combined mm/page_io.c
index 7247be6114ac894523d8273743a4f168ceab3afa,f14eded987fac8276e3e3da5ff11d260a8ba44cf..7c59ef681381bb7afeef2cf5207d269e9a95c1f8
@@@ -31,13 -31,13 +31,13 @@@ static struct bio *get_swap_bio(gfp_t g
  
        bio = bio_alloc(gfp_flags, 1);
        if (bio) {
-               bio->bi_sector = map_swap_page(page, &bio->bi_bdev);
-               bio->bi_sector <<= PAGE_SHIFT - 9;
+               bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
+               bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
                bio->bi_io_vec[0].bv_page = page;
                bio->bi_io_vec[0].bv_len = PAGE_SIZE;
                bio->bi_io_vec[0].bv_offset = 0;
                bio->bi_vcnt = 1;
-               bio->bi_size = PAGE_SIZE;
+               bio->bi_iter.bi_size = PAGE_SIZE;
                bio->bi_end_io = end_io;
        }
        return bio;
@@@ -62,7 -62,7 +62,7 @@@ void end_swap_bio_write(struct bio *bio
                printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
                                imajor(bio->bi_bdev->bd_inode),
                                iminor(bio->bi_bdev->bd_inode),
-                               (unsigned long long)bio->bi_sector);
+                               (unsigned long long)bio->bi_iter.bi_sector);
                ClearPageReclaim(page);
        }
        end_page_writeback(page);
@@@ -80,7 -80,7 +80,7 @@@ void end_swap_bio_read(struct bio *bio
                printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
                                imajor(bio->bi_bdev->bd_inode),
                                iminor(bio->bi_bdev->bd_inode),
-                               (unsigned long long)bio->bi_sector);
+                               (unsigned long long)bio->bi_iter.bi_sector);
                goto out;
        }
  
@@@ -320,8 -320,8 +320,8 @@@ int swap_readpage(struct page *page
        int ret = 0;
        struct swap_info_struct *sis = page_swap_info(page);
  
 -      VM_BUG_ON(!PageLocked(page));
 -      VM_BUG_ON(PageUptodate(page));
 +      VM_BUG_ON_PAGE(!PageLocked(page), page);
 +      VM_BUG_ON_PAGE(PageUptodate(page), page);
        if (frontswap_load(page) == 0) {
                SetPageUptodate(page);
                unlock_page(page);
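
get_swap_bio() above computes bi_iter.bi_sector by shifting the swap offset left by PAGE_SHIFT - 9, i.e. one page-sized slot spans 2^(PAGE_SHIFT-9) 512-byte sectors. A standalone sketch of that arithmetic (assuming 4096-byte pages, so PAGE_SHIFT == 12 and each slot covers 8 sectors):

#include <stdio.h>

int main(void)
{
        unsigned int page_shift = 12;            /* 4096-byte pages assumed */
        unsigned long long swap_offset = 42;     /* page-sized slot on the device */
        unsigned long long sector = swap_offset << (page_shift - 9);

        printf("swap slot %llu -> sector %llu\n", swap_offset, sector);
        return 0;
}
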
diff --combined net/ceph/messenger.c
index 2ed1304d22a7dfed5c8bc9f86d5f0f5cb1b91742,18c039b95c22459f8d669ee2598c411c1b615141..0e478a0f4204b72ed19ae49c349d632cda009e02
@@@ -15,7 -15,6 +15,7 @@@
  #include <linux/dns_resolver.h>
  #include <net/tcp.h>
  
 +#include <linux/ceph/ceph_features.h>
  #include <linux/ceph/libceph.h>
  #include <linux/ceph/messenger.h>
  #include <linux/ceph/decode.h>
@@@ -778,13 -777,12 +778,12 @@@ static void ceph_msg_data_bio_cursor_in
  
        bio = data->bio;
        BUG_ON(!bio);
-       BUG_ON(!bio->bi_vcnt);
  
        cursor->resid = min(length, data->bio_length);
        cursor->bio = bio;
-       cursor->vector_index = 0;
-       cursor->vector_offset = 0;
-       cursor->last_piece = length <= bio->bi_io_vec[0].bv_len;
+       cursor->bvec_iter = bio->bi_iter;
+       cursor->last_piece =
+               cursor->resid <= bio_iter_len(bio, cursor->bvec_iter);
  }
  
  static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
  {
        struct ceph_msg_data *data = cursor->data;
        struct bio *bio;
-       struct bio_vec *bio_vec;
-       unsigned int index;
+       struct bio_vec bio_vec;
  
        BUG_ON(data->type != CEPH_MSG_DATA_BIO);
  
        bio = cursor->bio;
        BUG_ON(!bio);
  
-       index = cursor->vector_index;
-       BUG_ON(index >= (unsigned int) bio->bi_vcnt);
+       bio_vec = bio_iter_iovec(bio, cursor->bvec_iter);
  
-       bio_vec = &bio->bi_io_vec[index];
-       BUG_ON(cursor->vector_offset >= bio_vec->bv_len);
-       *page_offset = (size_t) (bio_vec->bv_offset + cursor->vector_offset);
+       *page_offset = (size_t) bio_vec.bv_offset;
        BUG_ON(*page_offset >= PAGE_SIZE);
        if (cursor->last_piece) /* pagelist offset is always 0 */
                *length = cursor->resid;
        else
-               *length = (size_t) (bio_vec->bv_len - cursor->vector_offset);
+               *length = (size_t) bio_vec.bv_len;
        BUG_ON(*length > cursor->resid);
        BUG_ON(*page_offset + *length > PAGE_SIZE);
  
-       return bio_vec->bv_page;
+       return bio_vec.bv_page;
  }
  
  static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
                                        size_t bytes)
  {
        struct bio *bio;
-       struct bio_vec *bio_vec;
-       unsigned int index;
+       struct bio_vec bio_vec;
  
        BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO);
  
        bio = cursor->bio;
        BUG_ON(!bio);
  
-       index = cursor->vector_index;
-       BUG_ON(index >= (unsigned int) bio->bi_vcnt);
-       bio_vec = &bio->bi_io_vec[index];
+       bio_vec = bio_iter_iovec(bio, cursor->bvec_iter);
  
        /* Advance the cursor offset */
  
        BUG_ON(cursor->resid < bytes);
        cursor->resid -= bytes;
-       cursor->vector_offset += bytes;
-       if (cursor->vector_offset < bio_vec->bv_len)
+       bio_advance_iter(bio, &cursor->bvec_iter, bytes);
+       if (bytes < bio_vec.bv_len)
                return false;   /* more bytes to process in this segment */
-       BUG_ON(cursor->vector_offset != bio_vec->bv_len);
  
        /* Move on to the next segment, and possibly the next bio */
  
-       if (++index == (unsigned int) bio->bi_vcnt) {
+       if (!cursor->bvec_iter.bi_size) {
                bio = bio->bi_next;
-               index = 0;
+               cursor->bvec_iter = bio->bi_iter;
        }
        cursor->bio = bio;
-       cursor->vector_index = index;
-       cursor->vector_offset = 0;
  
        if (!cursor->last_piece) {
                BUG_ON(!cursor->resid);
                BUG_ON(!bio);
                /* A short read is OK, so use <= rather than == */
-               if (cursor->resid <= bio->bi_io_vec[index].bv_len)
+               if (cursor->resid <= bio_iter_len(bio, cursor->bvec_iter))
                        cursor->last_piece = true;
        }
  
@@@ -1866,9 -1856,7 +1857,9 @@@ int ceph_parse_ips(const char *c, cons
                                port = (port * 10) + (*p - '0');
                                p++;
                        }
 -                      if (port > 65535 || port == 0)
 +                      if (port == 0)
 +                              port = CEPH_MON_PORT;
 +                      else if (port > 65535)
                                goto bad;
                } else {
                        port = CEPH_MON_PORT;
@@@ -1948,8 -1936,7 +1939,8 @@@ static int process_connect(struct ceph_
  {
        u64 sup_feat = con->msgr->supported_features;
        u64 req_feat = con->msgr->required_features;
 -      u64 server_feat = le64_to_cpu(con->in_reply.features);
 +      u64 server_feat = ceph_sanitize_features(
 +                              le64_to_cpu(con->in_reply.features));
        int ret;
  
        dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@@ -2857,8 -2844,8 +2848,8 @@@ static void con_fault(struct ceph_conne
   */
  void ceph_messenger_init(struct ceph_messenger *msgr,
                        struct ceph_entity_addr *myaddr,
 -                      u32 supported_features,
 -                      u32 required_features,
 +                      u64 supported_features,
 +                      u64 required_features,
                        bool nocrc)
  {
        msgr->supported_features = supported_features;
@@@ -3130,8 -3117,15 +3121,8 @@@ struct ceph_msg *ceph_msg_new(int type
        INIT_LIST_HEAD(&m->data);
  
        /* front */
 -      m->front_max = front_len;
        if (front_len) {
 -              if (front_len > PAGE_CACHE_SIZE) {
 -                      m->front.iov_base = __vmalloc(front_len, flags,
 -                                                    PAGE_KERNEL);
 -                      m->front_is_vmalloc = true;
 -              } else {
 -                      m->front.iov_base = kmalloc(front_len, flags);
 -              }
 +              m->front.iov_base = ceph_kvmalloc(front_len, flags);
                if (m->front.iov_base == NULL) {
                        dout("ceph_msg_new can't allocate %d bytes\n",
                             front_len);
        } else {
                m->front.iov_base = NULL;
        }
 -      m->front.iov_len = front_len;
 +      m->front_alloc_len = m->front.iov_len = front_len;
  
        dout("ceph_msg_new %p front %d\n", m, front_len);
        return m;
@@@ -3253,7 -3247,10 +3244,7 @@@ static int ceph_con_in_msg_alloc(struc
  void ceph_msg_kfree(struct ceph_msg *m)
  {
        dout("msg_kfree %p\n", m);
 -      if (m->front_is_vmalloc)
 -              vfree(m->front.iov_base);
 -      else
 -              kfree(m->front.iov_base);
 +      ceph_kvfree(m->front.iov_base);
        kmem_cache_free(ceph_msg_cache, m);
  }
  
@@@ -3295,8 -3292,8 +3286,8 @@@ EXPORT_SYMBOL(ceph_msg_last_put)
  
  void ceph_msg_dump(struct ceph_msg *msg)
  {
 -      pr_debug("msg_dump %p (front_max %d length %zd)\n", msg,
 -               msg->front_max, msg->data_length);
 +      pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg,
 +               msg->front_alloc_len, msg->data_length);
        print_hex_dump(KERN_DEBUG, "header: ",
                       DUMP_PREFIX_OFFSET, 16, 1,
                       &msg->hdr, sizeof(msg->hdr), true);
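
In ceph_msg_data_bio_next()/ceph_msg_data_bio_advance() above, the hand-rolled (vector_index, vector_offset) bookkeeping is replaced by a bvec_iter that is simply advanced by a byte count. A userspace analogue of that cursor style (not ceph code; the segment lengths are arbitrary, and the caller must not advance past the final segment):

#include <stdio.h>
#include <stddef.h>

struct seg { size_t len; };

struct iter {
        size_t idx;     /* current segment */
        size_t done;    /* bytes consumed within it */
};

static void advance(struct iter *it, const struct seg *segs, size_t bytes)
{
        it->done += bytes;
        while (it->done >= segs[it->idx].len) {
                it->done -= segs[it->idx].len;
                it->idx++;
        }
}

int main(void)
{
        struct seg segs[] = { {100}, {50}, {200} };
        struct iter it = {0, 0};

        advance(&it, segs, 120);        /* lands 20 bytes into segment 1 */
        printf("segment %zu, offset %zu\n", it.idx, it.done);
        return 0;
}
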