]> Pileus Git - ~andy/linux/blobdiff - drivers/md/raid5.c
raid5: relieve lock contention in get_active_stripe()
[~andy/linux] / drivers / md / raid5.c
index 4bbcb7e26d128e5a6507d90b998bb3b086cc40c6..93090b2afab482afcf83b32322188ab8b141c340 100644 (file)
@@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
        return &conf->stripe_hashtbl[hash];
 }
 
+static inline int stripe_hash_locks_hash(sector_t sect)
+{
+       return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
+}
+
+static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
+{
+       spin_lock_irq(conf->hash_locks + hash);
+       spin_lock(&conf->device_lock);
+}
+
+static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
+{
+       spin_unlock(&conf->device_lock);
+       spin_unlock_irq(conf->hash_locks + hash);
+}
+
+static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
+{
+       int i;
+       local_irq_disable();
+       spin_lock(conf->hash_locks);
+       for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
+               spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
+       spin_lock(&conf->device_lock);
+}
+
+static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
+{
+       int i;
+       spin_unlock(&conf->device_lock);
+       for (i = NR_STRIPE_HASH_LOCKS; i; i--)
+               spin_unlock(conf->hash_locks + i - 1);
+       local_irq_enable();
+}
+
 /* bio's attached to a stripe+device for I/O are linked together in bi_sector
  * order without overlap.  There may be several bio's per stripe+device, and
  * a bio could span several devices.
@@ -249,7 +285,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
        }
 }
 
-static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
+static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
+                             struct list_head *temp_inactive_list)
 {
        BUG_ON(!list_empty(&sh->lru));
        BUG_ON(atomic_read(&conf->active_stripes)==0);
@@ -278,19 +315,60 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
                            < IO_THRESHOLD)
                                md_wakeup_thread(conf->mddev->thread);
                atomic_dec(&conf->active_stripes);
-               if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
-                       list_add_tail(&sh->lru, &conf->inactive_list);
-                       wake_up(&conf->wait_for_stripe);
-                       if (conf->retry_read_aligned)
-                               md_wakeup_thread(conf->mddev->thread);
-               }
+               if (!test_bit(STRIPE_EXPANDING, &sh->state))
+                       list_add_tail(&sh->lru, temp_inactive_list);
        }
 }
 
-static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
+static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
+                            struct list_head *temp_inactive_list)
 {
        if (atomic_dec_and_test(&sh->count))
-               do_release_stripe(conf, sh);
+               do_release_stripe(conf, sh, temp_inactive_list);
+}
+
+/*
+ * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
+ *
+ * Be careful: Only one task can add/delete stripes from temp_inactive_list at
+ * given time. Adding stripes only takes device lock, while deleting stripes
+ * only takes hash lock.
+ */
+static void release_inactive_stripe_list(struct r5conf *conf,
+                                        struct list_head *temp_inactive_list,
+                                        int hash)
+{
+       int size;
+       bool do_wakeup = false;
+       unsigned long flags;
+
+       if (hash == NR_STRIPE_HASH_LOCKS) {
+               size = NR_STRIPE_HASH_LOCKS;
+               hash = NR_STRIPE_HASH_LOCKS - 1;
+       } else
+               size = 1;
+       while (size) {
+               struct list_head *list = &temp_inactive_list[size - 1];
+
+               /*
+                * We don't hold any lock here yet, get_active_stripe() might
+                * remove stripes from the list
+                */
+               if (!list_empty_careful(list)) {
+                       spin_lock_irqsave(conf->hash_locks + hash, flags);
+                       list_splice_tail_init(list, conf->inactive_list + hash);
+                       do_wakeup = true;
+                       spin_unlock_irqrestore(conf->hash_locks + hash, flags);
+               }
+               size--;
+               hash--;
+       }
+
+       if (do_wakeup) {
+               wake_up(&conf->wait_for_stripe);
+               if (conf->retry_read_aligned)
+                       md_wakeup_thread(conf->mddev->thread);
+       }
 }
 
 static struct llist_node *llist_reverse_order(struct llist_node *head)
@@ -308,7 +386,8 @@ static struct llist_node *llist_reverse_order(struct llist_node *head)
 }
 
 /* should hold conf->device_lock already */
-static int release_stripe_list(struct r5conf *conf)
+static int release_stripe_list(struct r5conf *conf,
+                              struct list_head *temp_inactive_list)
 {
        struct stripe_head *sh;
        int count = 0;
@@ -317,6 +396,8 @@ static int release_stripe_list(struct r5conf *conf)
        head = llist_del_all(&conf->released_stripes);
        head = llist_reverse_order(head);
        while (head) {
+               int hash;
+
                sh = llist_entry(head, struct stripe_head, release_list);
                head = llist_next(head);
                /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */
@@ -327,7 +408,8 @@ static int release_stripe_list(struct r5conf *conf)
                 * again, the count is always > 1. This is true for
                 * STRIPE_ON_UNPLUG_LIST bit too.
                 */
-               __release_stripe(conf, sh);
+               hash = sh->hash_lock_index;
+               __release_stripe(conf, sh, &temp_inactive_list[hash]);
                count++;
        }
 
@@ -338,6 +420,8 @@ static void release_stripe(struct stripe_head *sh)
 {
        struct r5conf *conf = sh->raid_conf;
        unsigned long flags;
+       struct list_head list;
+       int hash;
        bool wakeup;
 
        if (unlikely(!conf->mddev->thread) ||
@@ -351,8 +435,11 @@ slow_path:
        local_irq_save(flags);
        /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
        if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
-               do_release_stripe(conf, sh);
+               INIT_LIST_HEAD(&list);
+               hash = sh->hash_lock_index;
+               do_release_stripe(conf, sh, &list);
                spin_unlock(&conf->device_lock);
+               release_inactive_stripe_list(conf, &list, hash);
        }
        local_irq_restore(flags);
 }
@@ -377,18 +464,19 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
 
 
 /* find an idle stripe, make sure it is unhashed, and return it. */
-static struct stripe_head *get_free_stripe(struct r5conf *conf)
+static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
 {
        struct stripe_head *sh = NULL;
        struct list_head *first;
 
-       if (list_empty(&conf->inactive_list))
+       if (list_empty(conf->inactive_list + hash))
                goto out;
-       first = conf->inactive_list.next;
+       first = (conf->inactive_list + hash)->next;
        sh = list_entry(first, struct stripe_head, lru);
        list_del_init(first);
        remove_hash(sh);
        atomic_inc(&conf->active_stripes);
+       BUG_ON(hash != sh->hash_lock_index);
 out:
        return sh;
 }
@@ -431,7 +519,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 {
        struct r5conf *conf = sh->raid_conf;
-       int i;
+       int i, seq;
 
        BUG_ON(atomic_read(&sh->count) != 0);
        BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
@@ -441,7 +529,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
                (unsigned long long)sh->sector);
 
        remove_hash(sh);
-
+retry:
+       seq = read_seqcount_begin(&conf->gen_lock);
        sh->generation = conf->generation - previous;
        sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
        sh->sector = sector;
@@ -463,6 +552,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
                dev->flags = 0;
                raid5_build_block(sh, i, previous);
        }
+       if (read_seqcount_retry(&conf->gen_lock, seq))
+               goto retry;
        insert_hash(conf, sh);
        sh->cpu = smp_processor_id();
 }
@@ -567,29 +658,31 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
                  int previous, int noblock, int noquiesce)
 {
        struct stripe_head *sh;
+       int hash = stripe_hash_locks_hash(sector);
 
        pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
 
-       spin_lock_irq(&conf->device_lock);
+       spin_lock_irq(conf->hash_locks + hash);
 
        do {
                wait_event_lock_irq(conf->wait_for_stripe,
                                    conf->quiesce == 0 || noquiesce,
-                                   conf->device_lock);
+                                   *(conf->hash_locks + hash));
                sh = __find_stripe(conf, sector, conf->generation - previous);
                if (!sh) {
                        if (!conf->inactive_blocked)
-                               sh = get_free_stripe(conf);
+                               sh = get_free_stripe(conf, hash);
                        if (noblock && sh == NULL)
                                break;
                        if (!sh) {
                                conf->inactive_blocked = 1;
-                               wait_event_lock_irq(conf->wait_for_stripe,
-                                                   !list_empty(&conf->inactive_list) &&
-                                                   (atomic_read(&conf->active_stripes)
-                                                    < (conf->max_nr_stripes *3/4)
-                                                    || !conf->inactive_blocked),
-                                                   conf->device_lock);
+                               wait_event_lock_irq(
+                                       conf->wait_for_stripe,
+                                       !list_empty(conf->inactive_list + hash) &&
+                                       (atomic_read(&conf->active_stripes)
+                                        < (conf->max_nr_stripes * 3 / 4)
+                                        || !conf->inactive_blocked),
+                                       *(conf->hash_locks + hash));
                                conf->inactive_blocked = 0;
                        } else
                                init_stripe(sh, sector, previous);
@@ -600,9 +693,11 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
                                    && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
                                    && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
                        } else {
+                               spin_lock(&conf->device_lock);
                                if (!test_bit(STRIPE_HANDLE, &sh->state))
                                        atomic_inc(&conf->active_stripes);
                                if (list_empty(&sh->lru) &&
+                                   !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state) &&
                                    !test_bit(STRIPE_EXPANDING, &sh->state))
                                        BUG();
                                list_del_init(&sh->lru);
@@ -610,6 +705,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
                                        sh->group->stripes_cnt--;
                                        sh->group = NULL;
                                }
+                               spin_unlock(&conf->device_lock);
                        }
                }
        } while (sh == NULL);
@@ -617,7 +713,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
        if (sh)
                atomic_inc(&sh->count);
 
-       spin_unlock_irq(&conf->device_lock);
+       spin_unlock_irq(conf->hash_locks + hash);
        return sh;
 }
 
@@ -1597,7 +1693,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
        put_cpu();
 }
 
-static int grow_one_stripe(struct r5conf *conf)
+static int grow_one_stripe(struct r5conf *conf, int hash)
 {
        struct stripe_head *sh;
        sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
@@ -1613,6 +1709,7 @@ static int grow_one_stripe(struct r5conf *conf)
                kmem_cache_free(conf->slab_cache, sh);
                return 0;
        }
+       sh->hash_lock_index = hash;
        /* we just created an active stripe so... */
        atomic_set(&sh->count, 1);
        atomic_inc(&conf->active_stripes);
@@ -1625,6 +1722,7 @@ static int grow_stripes(struct r5conf *conf, int num)
 {
        struct kmem_cache *sc;
        int devs = max(conf->raid_disks, conf->previous_raid_disks);
+       int hash;
 
        if (conf->mddev->gendisk)
                sprintf(conf->cache_name[0],
@@ -1642,9 +1740,13 @@ static int grow_stripes(struct r5conf *conf, int num)
                return 1;
        conf->slab_cache = sc;
        conf->pool_size = devs;
-       while (num--)
-               if (!grow_one_stripe(conf))
+       hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
+       while (num--) {
+               if (!grow_one_stripe(conf, hash))
                        return 1;
+               conf->max_nr_stripes++;
+               hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
+       }
        return 0;
 }
 
@@ -1702,6 +1804,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
        int err;
        struct kmem_cache *sc;
        int i;
+       int hash, cnt;
 
        if (newsize <= conf->pool_size)
                return 0; /* never bother to shrink */
@@ -1741,19 +1844,29 @@ static int resize_stripes(struct r5conf *conf, int newsize)
         * OK, we have enough stripes, start collecting inactive
         * stripes and copying them over
         */
+       hash = 0;
+       cnt = 0;
        list_for_each_entry(nsh, &newstripes, lru) {
-               spin_lock_irq(&conf->device_lock);
-               wait_event_lock_irq(conf->wait_for_stripe,
-                                   !list_empty(&conf->inactive_list),
-                                   conf->device_lock);
-               osh = get_free_stripe(conf);
-               spin_unlock_irq(&conf->device_lock);
+               lock_device_hash_lock(conf, hash);
+               wait_event_cmd(conf->wait_for_stripe,
+                                   !list_empty(conf->inactive_list + hash),
+                                   unlock_device_hash_lock(conf, hash),
+                                   lock_device_hash_lock(conf, hash));
+               osh = get_free_stripe(conf, hash);
+               unlock_device_hash_lock(conf, hash);
                atomic_set(&nsh->count, 1);
                for(i=0; i<conf->pool_size; i++)
                        nsh->dev[i].page = osh->dev[i].page;
                for( ; i<newsize; i++)
                        nsh->dev[i].page = NULL;
+               nsh->hash_lock_index = hash;
                kmem_cache_free(conf->slab_cache, osh);
+               cnt++;
+               if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
+                   !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
+                       hash++;
+                       cnt = 0;
+               }
        }
        kmem_cache_destroy(conf->slab_cache);
 
@@ -1812,13 +1925,13 @@ static int resize_stripes(struct r5conf *conf, int newsize)
        return err;
 }
 
-static int drop_one_stripe(struct r5conf *conf)
+static int drop_one_stripe(struct r5conf *conf, int hash)
 {
        struct stripe_head *sh;
 
-       spin_lock_irq(&conf->device_lock);
-       sh = get_free_stripe(conf);
-       spin_unlock_irq(&conf->device_lock);
+       spin_lock_irq(conf->hash_locks + hash);
+       sh = get_free_stripe(conf, hash);
+       spin_unlock_irq(conf->hash_locks + hash);
        if (!sh)
                return 0;
        BUG_ON(atomic_read(&sh->count));
@@ -1830,8 +1943,10 @@ static int drop_one_stripe(struct r5conf *conf)
 
 static void shrink_stripes(struct r5conf *conf)
 {
-       while (drop_one_stripe(conf))
-               ;
+       int hash;
+       for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
+               while (drop_one_stripe(conf, hash))
+                       ;
 
        if (conf->slab_cache)
                kmem_cache_destroy(conf->slab_cache);
@@ -3915,7 +4030,8 @@ static void raid5_activate_delayed(struct r5conf *conf)
        }
 }
 
-static void activate_bit_delay(struct r5conf *conf)
+static void activate_bit_delay(struct r5conf *conf,
+       struct list_head *temp_inactive_list)
 {
        /* device_lock is held */
        struct list_head head;
@@ -3923,9 +4039,11 @@ static void activate_bit_delay(struct r5conf *conf)
        list_del_init(&conf->bitmap_list);
        while (!list_empty(&head)) {
                struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
+               int hash;
                list_del_init(&sh->lru);
                atomic_inc(&sh->count);
-               __release_stripe(conf, sh);
+               hash = sh->hash_lock_index;
+               __release_stripe(conf, sh, &temp_inactive_list[hash]);
        }
 }
 
@@ -3941,7 +4059,7 @@ int md_raid5_congested(struct mddev *mddev, int bits)
                return 1;
        if (conf->quiesce)
                return 1;
-       if (list_empty_careful(&conf->inactive_list))
+       if (atomic_read(&conf->active_stripes) == conf->max_nr_stripes)
                return 1;
 
        return 0;
@@ -4271,6 +4389,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 struct raid5_plug_cb {
        struct blk_plug_cb      cb;
        struct list_head        list;
+       struct list_head        temp_inactive_list[NR_STRIPE_HASH_LOCKS];
 };
 
 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
@@ -4281,6 +4400,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
        struct mddev *mddev = cb->cb.data;
        struct r5conf *conf = mddev->private;
        int cnt = 0;
+       int hash;
 
        if (cb->list.next && !list_empty(&cb->list)) {
                spin_lock_irq(&conf->device_lock);
@@ -4298,11 +4418,14 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
                         * STRIPE_ON_RELEASE_LIST could be set here. In that
                         * case, the count is always > 1 here
                         */
-                       __release_stripe(conf, sh);
+                       hash = sh->hash_lock_index;
+                       __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
                        cnt++;
                }
                spin_unlock_irq(&conf->device_lock);
        }
+       release_inactive_stripe_list(conf, cb->temp_inactive_list,
+                                    NR_STRIPE_HASH_LOCKS);
        if (mddev->queue)
                trace_block_unplug(mddev->queue, cnt, !from_schedule);
        kfree(cb);
@@ -4323,8 +4446,12 @@ static void release_stripe_plug(struct mddev *mddev,
 
        cb = container_of(blk_cb, struct raid5_plug_cb, cb);
 
-       if (cb->list.next == NULL)
+       if (cb->list.next == NULL) {
+               int i;
                INIT_LIST_HEAD(&cb->list);
+               for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+                       INIT_LIST_HEAD(cb->temp_inactive_list + i);
+       }
 
        if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
                list_add_tail(&sh->lru, &cb->list);
@@ -4969,27 +5096,45 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 }
 
 static int handle_active_stripes(struct r5conf *conf, int group,
-                                struct r5worker *worker)
+                                struct r5worker *worker,
+                                struct list_head *temp_inactive_list)
 {
        struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
-       int i, batch_size = 0;
+       int i, batch_size = 0, hash;
+       bool release_inactive = false;
 
        while (batch_size < MAX_STRIPE_BATCH &&
                        (sh = __get_priority_stripe(conf, group)) != NULL)
                batch[batch_size++] = sh;
 
-       if (batch_size == 0)
-               return batch_size;
+       if (batch_size == 0) {
+               for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+                       if (!list_empty(temp_inactive_list + i))
+                               break;
+               if (i == NR_STRIPE_HASH_LOCKS)
+                       return batch_size;
+               release_inactive = true;
+       }
        spin_unlock_irq(&conf->device_lock);
 
+       release_inactive_stripe_list(conf, temp_inactive_list,
+                                    NR_STRIPE_HASH_LOCKS);
+
+       if (release_inactive) {
+               spin_lock_irq(&conf->device_lock);
+               return 0;
+       }
+
        for (i = 0; i < batch_size; i++)
                handle_stripe(batch[i]);
 
        cond_resched();
 
        spin_lock_irq(&conf->device_lock);
-       for (i = 0; i < batch_size; i++)
-               __release_stripe(conf, batch[i]);
+       for (i = 0; i < batch_size; i++) {
+               hash = batch[i]->hash_lock_index;
+               __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
+       }
        return batch_size;
 }
 
@@ -5010,9 +5155,10 @@ static void raid5_do_work(struct work_struct *work)
        while (1) {
                int batch_size, released;
 
-               released = release_stripe_list(conf);
+               released = release_stripe_list(conf, worker->temp_inactive_list);
 
-               batch_size = handle_active_stripes(conf, group_id, worker);
+               batch_size = handle_active_stripes(conf, group_id, worker,
+                                                  worker->temp_inactive_list);
                worker->working = false;
                if (!batch_size && !released)
                        break;
@@ -5051,7 +5197,7 @@ static void raid5d(struct md_thread *thread)
                struct bio *bio;
                int batch_size, released;
 
-               released = release_stripe_list(conf);
+               released = release_stripe_list(conf, conf->temp_inactive_list);
 
                if (
                    !list_empty(&conf->bitmap_list)) {
@@ -5061,7 +5207,7 @@ static void raid5d(struct md_thread *thread)
                        bitmap_unplug(mddev->bitmap);
                        spin_lock_irq(&conf->device_lock);
                        conf->seq_write = conf->seq_flush;
-                       activate_bit_delay(conf);
+                       activate_bit_delay(conf, conf->temp_inactive_list);
                }
                raid5_activate_delayed(conf);
 
@@ -5075,7 +5221,8 @@ static void raid5d(struct md_thread *thread)
                        handled++;
                }
 
-               batch_size = handle_active_stripes(conf, ANY_GROUP, NULL);
+               batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
+                                                  conf->temp_inactive_list);
                if (!batch_size && !released)
                        break;
                handled += batch_size;
@@ -5111,22 +5258,29 @@ raid5_set_cache_size(struct mddev *mddev, int size)
 {
        struct r5conf *conf = mddev->private;
        int err;
+       int hash;
 
        if (size <= 16 || size > 32768)
                return -EINVAL;
+       hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
        while (size < conf->max_nr_stripes) {
-               if (drop_one_stripe(conf))
+               if (drop_one_stripe(conf, hash))
                        conf->max_nr_stripes--;
                else
                        break;
+               hash--;
+               if (hash < 0)
+                       hash = NR_STRIPE_HASH_LOCKS - 1;
        }
        err = md_allow_write(mddev);
        if (err)
                return err;
+       hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
        while (size > conf->max_nr_stripes) {
-               if (grow_one_stripe(conf))
+               if (grow_one_stripe(conf, hash))
                        conf->max_nr_stripes++;
                else break;
+               hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
        }
        return 0;
 }
@@ -5277,7 +5431,7 @@ static struct attribute_group raid5_attrs_group = {
 
 static int alloc_thread_groups(struct r5conf *conf, int cnt)
 {
-       int i, j;
+       int i, j, k;
        ssize_t size;
        struct r5worker *workers;
 
@@ -5307,8 +5461,12 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt)
                group->workers = workers + i * cnt;
 
                for (j = 0; j < cnt; j++) {
-                       group->workers[j].group = group;
-                       INIT_WORK(&group->workers[j].work, raid5_do_work);
+                       struct r5worker *worker = group->workers + j;
+                       worker->group = group;
+                       INIT_WORK(&worker->work, raid5_do_work);
+
+                       for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
+                               INIT_LIST_HEAD(worker->temp_inactive_list + k);
                }
        }
 
@@ -5459,6 +5617,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        struct md_rdev *rdev;
        struct disk_info *disk;
        char pers_name[6];
+       int i;
 
        if (mddev->new_level != 5
            && mddev->new_level != 4
@@ -5503,7 +5662,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        INIT_LIST_HEAD(&conf->hold_list);
        INIT_LIST_HEAD(&conf->delayed_list);
        INIT_LIST_HEAD(&conf->bitmap_list);
-       INIT_LIST_HEAD(&conf->inactive_list);
        init_llist_head(&conf->released_stripes);
        atomic_set(&conf->active_stripes, 0);
        atomic_set(&conf->preread_active_stripes, 0);
@@ -5529,6 +5687,21 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
                goto abort;
 
+       /* We init hash_locks[0] separately to that it can be used
+        * as the reference lock in the spin_lock_nest_lock() call
+        * in lock_all_device_hash_locks_irq in order to convince
+        * lockdep that we know what we are doing.
+        */
+       spin_lock_init(conf->hash_locks);
+       for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
+               spin_lock_init(conf->hash_locks + i);
+
+       for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+               INIT_LIST_HEAD(conf->inactive_list + i);
+
+       for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+               INIT_LIST_HEAD(conf->temp_inactive_list + i);
+
        conf->level = mddev->new_level;
        if (raid5_alloc_percpu(conf) != 0)
                goto abort;
@@ -5569,7 +5742,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        else
                conf->max_degraded = 1;
        conf->algorithm = mddev->new_layout;
-       conf->max_nr_stripes = NR_STRIPES;
        conf->reshape_progress = mddev->reshape_position;
        if (conf->reshape_progress != MaxSector) {
                conf->prev_chunk_sectors = mddev->chunk_sectors;
@@ -5578,7 +5750,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 
        memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
                 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
-       if (grow_stripes(conf, conf->max_nr_stripes)) {
+       if (grow_stripes(conf, NR_STRIPES)) {
                printk(KERN_ERR
                       "md/raid:%s: couldn't allocate %dkB for buffers\n",
                       mdname(mddev), memory);
@@ -6483,27 +6655,28 @@ static void raid5_quiesce(struct mddev *mddev, int state)
                break;
 
        case 1: /* stop all writes */
-               spin_lock_irq(&conf->device_lock);
+               lock_all_device_hash_locks_irq(conf);
                /* '2' tells resync/reshape to pause so that all
                 * active stripes can drain
                 */
                conf->quiesce = 2;
-               wait_event_lock_irq(conf->wait_for_stripe,
+               wait_event_cmd(conf->wait_for_stripe,
                                    atomic_read(&conf->active_stripes) == 0 &&
                                    atomic_read(&conf->active_aligned_reads) == 0,
-                                   conf->device_lock);
+                                   unlock_all_device_hash_locks_irq(conf),
+                                   lock_all_device_hash_locks_irq(conf));
                conf->quiesce = 1;
-               spin_unlock_irq(&conf->device_lock);
+               unlock_all_device_hash_locks_irq(conf);
                /* allow reshape to continue */
                wake_up(&conf->wait_for_overlap);
                break;
 
        case 0: /* re-enable writes */
-               spin_lock_irq(&conf->device_lock);
+               lock_all_device_hash_locks_irq(conf);
                conf->quiesce = 0;
                wake_up(&conf->wait_for_stripe);
                wake_up(&conf->wait_for_overlap);
-               spin_unlock_irq(&conf->device_lock);
+               unlock_all_device_hash_locks_irq(conf);
                break;
        }
 }