diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fb96c0c2db40e1a52ed49e1ee21ac027060952b6..611b5f79761826f8843ede2384c47ccbd0a87be1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -497,11 +497,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
        const sector_t this_sector = r1_bio->sector;
        int sectors;
        int best_good_sectors;
-       int best_disk;
+       int best_disk, best_dist_disk, best_pending_disk;
+       int has_nonrot_disk;
        int disk;
        sector_t best_dist;
+       unsigned int min_pending;
        struct md_rdev *rdev;
        int choose_first;
+       int choose_next_idle;
 
        rcu_read_lock();
        /*
@@ -512,8 +515,13 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
  retry:
        sectors = r1_bio->sectors;
        best_disk = -1;
+       best_dist_disk = -1;
        best_dist = MaxSector;
+       best_pending_disk = -1;
+       min_pending = UINT_MAX;
        best_good_sectors = 0;
+       has_nonrot_disk = 0;
+       choose_next_idle = 0;
 
        if (conf->mddev->recovery_cp < MaxSector &&
            (this_sector + sectors >= conf->next_resync))
@@ -525,6 +533,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                sector_t dist;
                sector_t first_bad;
                int bad_sectors;
+               unsigned int pending;
+               bool nonrot;
 
                rdev = rcu_dereference(conf->mirrors[disk].rdev);
                if (r1_bio->bios[disk] == IO_BLOCKED
@@ -583,22 +593,77 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                } else
                        best_good_sectors = sectors;
 
+               nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+               has_nonrot_disk |= nonrot;
+               pending = atomic_read(&rdev->nr_pending);
                dist = abs(this_sector - conf->mirrors[disk].head_position);
-               if (choose_first
-                   /* Don't change to another disk for sequential reads */
-                   || conf->mirrors[disk].next_seq_sect == this_sector
-                   || dist == 0
-                   /* If device is idle, use it */
-                   || atomic_read(&rdev->nr_pending) == 0) {
+               if (choose_first) {
                        best_disk = disk;
                        break;
                }
+               /* Don't change to another disk for sequential reads */
+               if (conf->mirrors[disk].next_seq_sect == this_sector
+                   || dist == 0) {
+                       int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
+                       struct raid1_info *mirror = &conf->mirrors[disk];
+
+                       best_disk = disk;
+                       /*
+                        * If the buffered sequential IO size exceeds the
+                        * optimal iosize, check whether there is an idle disk
+                        * and, if so, choose it.  read_balance could already
+                        * have chosen an idle disk before noticing that this
+                        * disk is serving a sequential IO.  That doesn't
+                        * matter: this disk will go idle and be picked up
+                        * again once the first disk's IO size exceeds the
+                        * optimal iosize.  This way the first disk's iosize is
+                        * at least the optimal iosize.  The second disk's
+                        * iosize might be small, but that is not a big deal
+                        * since by the time the second disk starts IO, the
+                        * first disk is likely still busy.
+                        */
+                       if (nonrot && opt_iosize > 0 &&
+                           mirror->seq_start != MaxSector &&
+                           mirror->next_seq_sect > opt_iosize &&
+                           mirror->next_seq_sect - opt_iosize >=
+                           mirror->seq_start) {
+                               choose_next_idle = 1;
+                               continue;
+                       }
+                       break;
+               }
+               /* If device is idle, use it */
+               if (pending == 0) {
+                       best_disk = disk;
+                       break;
+               }
+
+               if (choose_next_idle)
+                       continue;
+
+               if (min_pending > pending) {
+                       min_pending = pending;
+                       best_pending_disk = disk;
+               }
+
                if (dist < best_dist) {
                        best_dist = dist;
-                       best_disk = disk;
+                       best_dist_disk = disk;
                }
        }
 
+       /*
+        * If all disks are rotational, choose the closest disk.  If any disk
+        * is non-rotational, choose the disk with the fewest pending requests
+        * even if that disk is rotational, which may or may not be optimal
+        * for arrays with mixed rotational/non-rotational disks depending on
+        * the workload.
+        */
+       if (best_disk == -1) {
+               if (has_nonrot_disk)
+                       best_disk = best_pending_disk;
+               else
+                       best_disk = best_dist_disk;
+       }
+
        if (best_disk >= 0) {
                rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
                if (!rdev)
@@ -612,6 +677,10 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                        goto retry;
                }
                sectors = best_good_sectors;
+
+               if (conf->mirrors[best_disk].next_seq_sect != this_sector)
+                       conf->mirrors[best_disk].seq_start = this_sector;
+
                conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
        }
        rcu_read_unlock();
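
The new selection logic boils down to: keep sequential reads on one member until its run reaches the device's optimal I/O size (on non-rotational disks), prefer an idle member, and otherwise fall back to the member with the fewest pending requests when any member is non-rotational, or to the closest member when all are rotational. For instance, if bdev_io_opt() reports 128 KiB (opt_iosize = 256 sectors) and the current sequential run on a non-rotational member already spans at least 256 sectors, the loop keeps scanning for an idle member instead of sticking to the sequential disk. The following is a minimal user-space sketch of just the fallback path; the names (model_disk, pick_disk) and simplified types are invented for illustration and this is not the kernel code:

/* User-space model of the fallback policy above; illustrative only. */
#include <limits.h>
#include <stdbool.h>

struct model_disk {
	long long head_position;	/* last known head position (sectors) */
	unsigned int nr_pending;	/* requests in flight on this member  */
	bool nonrot;			/* true for SSD-like members          */
};

/* Pick a member for a read at 'sector'; returns an index, or -1. */
static int pick_disk(const struct model_disk *d, int ndisks, long long sector)
{
	int best_dist_disk = -1, best_pending_disk = -1;
	long long best_dist = LLONG_MAX;
	unsigned int min_pending = UINT_MAX;
	bool has_nonrot = false;
	int i;

	for (i = 0; i < ndisks; i++) {
		long long dist = sector - d[i].head_position;

		if (dist < 0)
			dist = -dist;
		has_nonrot |= d[i].nonrot;

		if (d[i].nr_pending == 0)
			return i;		/* idle member: use it */

		if (d[i].nr_pending < min_pending) {
			min_pending = d[i].nr_pending;
			best_pending_disk = i;
		}
		if (dist < best_dist) {
			best_dist = dist;
			best_dist_disk = i;
		}
	}
	/*
	 * No idle member: seek distance is meaningless if any member is
	 * non-rotational, so balance by queue depth; otherwise minimize
	 * the seek.
	 */
	return has_nonrot ? best_pending_disk : best_dist_disk;
}
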
@@ -875,6 +944,44 @@ do_sync_io:
        pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
 }
 
+struct raid1_plug_cb {
+       struct blk_plug_cb      cb;
+       struct bio_list         pending;
+       int                     pending_cnt;
+};
+
+static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+       struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb,
+                                                 cb);
+       struct mddev *mddev = plug->cb.data;
+       struct r1conf *conf = mddev->private;
+       struct bio *bio;
+
+       if (from_schedule) {
+               spin_lock_irq(&conf->device_lock);
+               bio_list_merge(&conf->pending_bio_list, &plug->pending);
+               conf->pending_count += plug->pending_cnt;
+               spin_unlock_irq(&conf->device_lock);
+               md_wakeup_thread(mddev->thread);
+               kfree(plug);
+               return;
+       }
+
+       /* we aren't scheduling, so we can do the write-out directly. */
+       bio = bio_list_get(&plug->pending);
+       bitmap_unplug(mddev->bitmap);
+       wake_up(&conf->wait_barrier);
+
+       while (bio) { /* submit pending writes */
+               struct bio *next = bio->bi_next;
+               bio->bi_next = NULL;
+               generic_make_request(bio);
+               bio = next;
+       }
+       kfree(plug);
+}
+
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
        struct r1conf *conf = mddev->private;
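
raid1_unplug() above follows the blk_check_plugged() pattern: writes queued while the submitting task holds a plug are gathered on a per-plug list and either submitted directly when the plug is finished, or spliced onto the array's pending list and handed to the raid1d thread when the unplug happens from the scheduler. The sketch below is a stripped-down user-space model of that split; the names (struct req, plug_ctx, unplug, submit, worker_queue) are invented, and locking and wakeups are elided, so this is not the kernel API:

/* User-space model of plug-style write batching; illustrative only. */
#include <stdbool.h>
#include <stddef.h>

struct req {
	struct req *next;
};

struct plug_ctx {
	struct req *pending;		/* gathered while plugged */
	int pending_cnt;
};

/* Fallback queue drained by a worker thread (locking elided). */
static struct req *worker_queue;
static int worker_queue_cnt;

static void submit(struct req *r)
{
	/* issue one request */
	(void)r;
}

static void unplug(struct plug_ctx *plug, bool from_schedule)
{
	struct req *r = plug->pending;

	plug->pending = NULL;

	if (from_schedule) {
		/* Must not do the work here: splice onto the worker queue. */
		struct req **tail = &worker_queue;

		while (*tail)
			tail = &(*tail)->next;
		*tail = r;
		worker_queue_cnt += plug->pending_cnt;
		plug->pending_cnt = 0;
		return;
	}

	/* Not scheduling, so do the write-out directly. */
	while (r) {
		struct req *next = r->next;

		r->next = NULL;
		submit(r);
		r = next;
	}
	plug->pending_cnt = 0;
}
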
@@ -888,6 +995,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
        const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
        const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
        struct md_rdev *blocked_rdev;
+       struct blk_plug_cb *cb;
+       struct raid1_plug_cb *plug = NULL;
        int first_clone;
        int sectors_handled;
        int max_sectors;
@@ -1190,11 +1299,22 @@ read_again:
                mbio->bi_private = r1_bio;
 
                atomic_inc(&r1_bio->remaining);
+
+               cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
+               if (cb)
+                       plug = container_of(cb, struct raid1_plug_cb, cb);
+               else
+                       plug = NULL;
                spin_lock_irqsave(&conf->device_lock, flags);
-               bio_list_add(&conf->pending_bio_list, mbio);
-               conf->pending_count++;
+               if (plug) {
+                       bio_list_add(&plug->pending, mbio);
+                       plug->pending_cnt++;
+               } else {
+                       bio_list_add(&conf->pending_bio_list, mbio);
+                       conf->pending_count++;
+               }
                spin_unlock_irqrestore(&conf->device_lock, flags);
-               if (!mddev_check_plugged(mddev))
+               if (!plug)
                        md_wakeup_thread(mddev->thread);
        }
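
Note the design choice in the loop above: while a plug callback is available, each mbio goes onto the per-plug list and the raid1d thread is not woken, so writes gathered during one make_request() are submitted as a batch when the plug is released (either directly or, from the scheduler, by raid1_unplug() handing them to raid1d as sketched earlier). Only when no plug callback is available does the bio fall back to conf->pending_bio_list with an immediate wakeup.
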
        /* Mustn't call r1_bio_write_done before this next test,
@@ -2178,8 +2298,7 @@ static void raid1d(struct mddev *mddev)
        blk_start_plug(&plug);
        for (;;) {
 
-               if (atomic_read(&mddev->plug_cnt) == 0)
-                       flush_pending_writes(conf);
+               flush_pending_writes(conf);
 
                spin_lock_irqsave(&conf->device_lock, flags);
                if (list_empty(head)) {
@@ -2376,6 +2495,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
                                bio->bi_rw = READ;
                                bio->bi_end_io = end_sync_read;
                                read_targets++;
+                       } else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
+                               test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+                               !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
+                               /*
+                                * The device is suitable for reading (InSync),
+                                * but has bad block(s) here.  Let's try to
+                                * correct them if we are doing resync or
+                                * repair.  Otherwise, leave this device alone
+                                * for this sync request.
+                                */
+                               bio->bi_rw = WRITE;
+                               bio->bi_end_io = end_sync_write;
+                               write_targets++;
                        }
                }
                if (bio->bi_end_io) {
@@ -2433,7 +2564,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
                /* There is nowhere to write, so all non-sync
                 * drives must be failed - so we are finished
                 */
-               sector_t rv = max_sector - sector_nr;
+               sector_t rv;
+               if (min_bad > 0)
+                       max_sector = sector_nr + min_bad;
+               rv = max_sector - sector_nr;
                *skipped = 1;
                put_buf(r1_bio);
                return rv;
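
Reading from the code above: by clamping max_sector to sector_nr + min_bad when min_bad > 0, only the bad region is reported as skipped, rather than everything up to max_sector, so sectors past the bad blocks are still attempted on a later sync_request() call.
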
@@ -2577,6 +2711,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
                        mddev->merge_check_needed = 1;
 
                disk->head_position = 0;
+               disk->seq_start = MaxSector;
        }
        conf->raid_disks = mddev->raid_disks;
        conf->mddev = mddev;
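
Initializing seq_start to MaxSector means that, until the first read is balanced onto a member, the opt_iosize check in read_balance() (which requires seq_start != MaxSector) cannot mistake the initial state for an in-progress sequential run.
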