diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 78ea44336e75e06d5769b4a6158f2a1e506214e0..7ff4f252ca1a42943252a9e19b975da5b2f8ce9b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -53,6 +53,7 @@
 #include <linux/cpu.h>
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
+#include <linux/nodemask.h>
 #include <trace/events/block.h>
 
 #include "md.h"
 #include "raid0.h"
 #include "bitmap.h"
 
+#define cpu_to_group(cpu) cpu_to_node(cpu)
+#define ANY_GROUP NUMA_NO_NODE
+
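+/*
+ * Stripe handling can be offloaded to worker threads.  Workers are grouped
+ * per NUMA node (cpu_to_group() maps a CPU to its node) and run on a single
+ * unbound workqueue shared by all arrays.
+ */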
+static struct workqueue_struct *raid5_wq;
 /*
  * Stripe cache
  */
@@ -72,6 +77,7 @@
 #define BYPASS_THRESHOLD       1
 #define NR_HASH                        (PAGE_SIZE / sizeof(struct hlist_head))
 #define HASH_MASK              (NR_HASH - 1)
+#define MAX_STRIPE_BATCH       8
 
 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
 {
@@ -200,6 +206,49 @@ static int stripe_operations_active(struct stripe_head *sh)
               test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 }
 
+static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
+{
+       struct r5conf *conf = sh->raid_conf;
+       struct r5worker_group *group;
+       int thread_cnt;
+       int i, cpu = sh->cpu;
+
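+       /* sh->cpu may name a CPU that has since gone offline */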
+       if (!cpu_online(cpu)) {
+               cpu = cpumask_any(cpu_online_mask);
+               sh->cpu = cpu;
+       }
+
+       if (list_empty(&sh->lru)) {
+               struct r5worker_group *group;
+               group = conf->worker_groups + cpu_to_group(cpu);
+               list_add_tail(&sh->lru, &group->handle_list);
+               group->stripes_cnt++;
+               sh->group = group;
+       }
+
+       if (conf->worker_cnt_per_group == 0) {
+               md_wakeup_thread(conf->mddev->thread);
+               return;
+       }
+
+       group = conf->worker_groups + cpu_to_group(sh->cpu);
+
+       group->workers[0].working = true;
+       /* at least one worker must run to avoid a race */
+       queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);
+
+       thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
+       /* wake up more workers */
+       for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
+               if (group->workers[i].working == false) {
+                       group->workers[i].working = true;
+                       queue_work_on(sh->cpu, raid5_wq,
+                                     &group->workers[i].work);
+                       thread_cnt--;
+               }
+       }
+}
+
 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
 {
        BUG_ON(!list_empty(&sh->lru));
@@ -214,7 +263,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
                else {
                        clear_bit(STRIPE_DELAYED, &sh->state);
                        clear_bit(STRIPE_BIT_DELAY, &sh->state);
-                       list_add_tail(&sh->lru, &conf->handle_list);
+                       if (conf->worker_cnt_per_group == 0) {
+                               list_add_tail(&sh->lru, &conf->handle_list);
+                       } else {
+                               raid5_wakeup_stripe_thread(sh);
+                               return;
+                       }
                }
                md_wakeup_thread(conf->mddev->thread);
        } else {
@@ -239,12 +293,62 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
                do_release_stripe(conf, sh);
 }
 
+static struct llist_node *llist_reverse_order(struct llist_node *head)
+{
+       struct llist_node *new_head = NULL;
+
+       while (head) {
+               struct llist_node *tmp = head;
+               head = head->next;
+               tmp->next = new_head;
+               new_head = tmp;
+       }
+
+       return new_head;
+}
+
+/* The caller must already hold conf->device_lock */
+static int release_stripe_list(struct r5conf *conf)
+{
+       struct stripe_head *sh;
+       int count = 0;
+       struct llist_node *head;
+
+       head = llist_del_all(&conf->released_stripes);
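+       /* llist_del_all() is LIFO; reverse so stripes are released in order */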
+       head = llist_reverse_order(head);
+       while (head) {
+               sh = llist_entry(head, struct stripe_head, release_list);
+               head = llist_next(head);
+               /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
+               smp_mb();
+               clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
+               /*
+                * Don't worry if the bit is set again here: in that case the
+                * count is always > 1.  The same holds for the
+                * STRIPE_ON_UNPLUG_LIST bit.
+                */
+               __release_stripe(conf, sh);
+               count++;
+       }
+
+       return count;
+}
+
 static void release_stripe(struct stripe_head *sh)
 {
        struct r5conf *conf = sh->raid_conf;
        unsigned long flags;
+       bool wakeup;
 
+       if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
+               goto slow_path;
+       wakeup = llist_add(&sh->release_list, &conf->released_stripes);
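+       /* wake raid5d only on the empty->non-empty transition */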
+       if (wakeup)
+               md_wakeup_thread(conf->mddev->thread);
+       return;
+slow_path:
        local_irq_save(flags);
+       /* we are OK here whether STRIPE_ON_RELEASE_LIST is set or not */
        if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
                do_release_stripe(conf, sh);
                spin_unlock(&conf->device_lock);
@@ -359,6 +463,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
                raid5_build_block(sh, i, previous);
        }
        insert_hash(conf, sh);
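+       /* record the initialising CPU; it selects the worker group later */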
+       sh->cpu = smp_processor_id();
 }
 
 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
@@ -491,7 +596,8 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
                        if (atomic_read(&sh->count)) {
                                BUG_ON(!list_empty(&sh->lru)
                                    && !test_bit(STRIPE_EXPANDING, &sh->state)
-                                   && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state));
+                                   && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
+                                   && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
                        } else {
                                if (!test_bit(STRIPE_HANDLE, &sh->state))
                                        atomic_inc(&conf->active_stripes);
@@ -499,6 +605,10 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
                                    !test_bit(STRIPE_EXPANDING, &sh->state))
                                        BUG();
                                list_del_init(&sh->lru);
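+                               /* drop the group accounting for this stripe */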
+                               if (sh->group) {
+                                       sh->group->stripes_cnt--;
+                                       sh->group = NULL;
+                               }
                        }
                }
        } while (sh == NULL);
@@ -3779,6 +3889,7 @@ static void raid5_activate_delayed(struct r5conf *conf)
                        if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                                atomic_inc(&conf->preread_active_stripes);
                        list_add_tail(&sh->lru, &conf->hold_list);
+                       raid5_wakeup_stripe_thread(sh);
                }
        }
 }
@@ -4058,18 +4169,35 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
  * head of the hold_list has changed, i.e. the head was promoted to the
  * handle_list.
  */
-static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
+static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 {
-       struct stripe_head *sh;
+       struct stripe_head *sh = NULL, *tmp;
+       struct list_head *handle_list = NULL;
+       struct r5worker_group *wg = NULL;
+
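+       /* use the global list, this group's list, or the first busy group */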
+       if (conf->worker_cnt_per_group == 0) {
+               handle_list = &conf->handle_list;
+       } else if (group != ANY_GROUP) {
+               handle_list = &conf->worker_groups[group].handle_list;
+               wg = &conf->worker_groups[group];
+       } else {
+               int i;
+               for (i = 0; i < conf->group_cnt; i++) {
+                       handle_list = &conf->worker_groups[i].handle_list;
+                       wg = &conf->worker_groups[i];
+                       if (!list_empty(handle_list))
+                               break;
+               }
+       }
 
        pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
                  __func__,
-                 list_empty(&conf->handle_list) ? "empty" : "busy",
+                 list_empty(handle_list) ? "empty" : "busy",
                  list_empty(&conf->hold_list) ? "empty" : "busy",
                  atomic_read(&conf->pending_full_writes), conf->bypass_count);
 
-       if (!list_empty(&conf->handle_list)) {
-               sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
+       if (!list_empty(handle_list)) {
+               sh = list_entry(handle_list->next, typeof(*sh), lru);
 
                if (list_empty(&conf->hold_list))
                        conf->bypass_count = 0;
@@ -4087,14 +4215,32 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
                   ((conf->bypass_threshold &&
                     conf->bypass_count > conf->bypass_threshold) ||
                    atomic_read(&conf->pending_full_writes) == 0)) {
-               sh = list_entry(conf->hold_list.next,
-                               typeof(*sh), lru);
-               conf->bypass_count -= conf->bypass_threshold;
-               if (conf->bypass_count < 0)
-                       conf->bypass_count = 0;
-       } else
+
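+               /* take a stripe this group may handle (any, if unconstrained) */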
+               list_for_each_entry(tmp, &conf->hold_list, lru) {
+                       if (conf->worker_cnt_per_group == 0 ||
+                           group == ANY_GROUP ||
+                           !cpu_online(tmp->cpu) ||
+                           cpu_to_group(tmp->cpu) == group) {
+                               sh = tmp;
+                               break;
+                       }
+               }
+
+               if (sh) {
+                       conf->bypass_count -= conf->bypass_threshold;
+                       if (conf->bypass_count < 0)
+                               conf->bypass_count = 0;
+               }
+               wg = NULL;
+       }
+
+       if (!sh)
                return NULL;
 
+       if (wg) {
+               wg->stripes_cnt--;
+               sh->group = NULL;
+       }
        list_del_init(&sh->lru);
        atomic_inc(&sh->count);
        BUG_ON(atomic_read(&sh->count) != 1);
@@ -4127,6 +4273,10 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
                         */
                        smp_mb__before_clear_bit();
                        clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
+                       /*
+                        * STRIPE_ON_RELEASE_LIST could be set here; in that
+                        * case the count is always > 1.
+                        */
                        __release_stripe(conf, sh);
                        cnt++;
                }
@@ -4286,8 +4436,10 @@ static void make_request(struct mddev *mddev, struct bio * bi)
        for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
                DEFINE_WAIT(w);
                int previous;
+               int seq;
 
        retry:
+               seq = read_seqcount_begin(&conf->gen_lock);
                previous = 0;
                prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
                if (unlikely(conf->reshape_progress != MaxSector)) {
@@ -4320,7 +4472,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
                                                  previous,
                                                  &dd_idx, NULL);
                pr_debug("raid456: make_request, sector %llu logical %llu\n",
-                       (unsigned long long)new_sector, 
+                       (unsigned long long)new_sector,
                        (unsigned long long)logical_sector);
 
                sh = get_active_stripe(conf, new_sector, previous,
@@ -4349,6 +4501,13 @@ static void make_request(struct mddev *mddev, struct bio * bi)
                                        goto retry;
                                }
                        }
+                       if (read_seqcount_retry(&conf->gen_lock, seq)) {
+                               /* Might have got the wrong stripe_head
+                                * by accident
+                                */
+                               release_stripe(sh);
+                               goto retry;
+                       }
 
                        if (rw == WRITE &&
                            logical_sector >= mddev->suspend_lo &&
@@ -4788,14 +4947,14 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
        return handled;
 }
 
-#define MAX_STRIPE_BATCH 8
-static int handle_active_stripes(struct r5conf *conf)
+static int handle_active_stripes(struct r5conf *conf, int group,
+                                struct r5worker *worker)
 {
        struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
        int i, batch_size = 0;
 
        while (batch_size < MAX_STRIPE_BATCH &&
-                       (sh = __get_priority_stripe(conf)) != NULL)
+                       (sh = __get_priority_stripe(conf, group)) != NULL)
                batch[batch_size++] = sh;
 
        if (batch_size == 0)
@@ -4813,6 +4972,39 @@ static int handle_active_stripes(struct r5conf *conf)
        return batch_size;
 }
 
+static void raid5_do_work(struct work_struct *work)
+{
+       struct r5worker *worker = container_of(work, struct r5worker, work);
+       struct r5worker_group *group = worker->group;
+       struct r5conf *conf = group->conf;
+       int group_id = group - conf->worker_groups;
+       int handled;
+       struct blk_plug plug;
+
+       pr_debug("+++ raid5worker active\n");
+
+       blk_start_plug(&plug);
+       handled = 0;
+       spin_lock_irq(&conf->device_lock);
+       while (1) {
+               int batch_size, released;
+
+               released = release_stripe_list(conf);
+
+               batch_size = handle_active_stripes(conf, group_id, worker);
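+               /* clear 'working' so new stripes re-queue this worker */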
+               worker->working = false;
+               if (!batch_size && !released)
+                       break;
+               handled += batch_size;
+       }
+       pr_debug("%d stripes handled\n", handled);
+
+       spin_unlock_irq(&conf->device_lock);
+       blk_finish_plug(&plug);
+
+       pr_debug("--- raid5worker inactive\n");
+}
+
 /*
  * This is our raid5 kernel thread.
  *
@@ -4836,7 +5028,9 @@ static void raid5d(struct md_thread *thread)
        spin_lock_irq(&conf->device_lock);
        while (1) {
                struct bio *bio;
-               int batch_size;
+               int batch_size, released;
+
+               released = release_stripe_list(conf);
 
                if (
                    !list_empty(&conf->bitmap_list)) {
@@ -4860,8 +5054,8 @@ static void raid5d(struct md_thread *thread)
                        handled++;
                }
 
-               batch_size = handle_active_stripes(conf);
-               if (!batch_size)
+               batch_size = handle_active_stripes(conf, ANY_GROUP, NULL);
+               if (!batch_size && !released)
                        break;
                handled += batch_size;
 
@@ -4989,10 +5183,70 @@ stripe_cache_active_show(struct mddev *mddev, char *page)
 static struct md_sysfs_entry
 raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
 
+static ssize_t
+raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
+{
+       struct r5conf *conf = mddev->private;
+       if (conf)
+               return sprintf(page, "%d\n", conf->worker_cnt_per_group);
+       else
+               return 0;
+}
+
+static int alloc_thread_groups(struct r5conf *conf, int cnt);
+static ssize_t
+raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
+{
+       struct r5conf *conf = mddev->private;
+       unsigned long new;
+       int err;
+       struct r5worker_group *old_groups;
+       int old_group_cnt;
+
+       if (len >= PAGE_SIZE)
+               return -EINVAL;
+       if (!conf)
+               return -ENODEV;
+
+       if (kstrtoul(page, 10, &new))
+               return -EINVAL;
+
+       if (new == conf->worker_cnt_per_group)
+               return len;
+
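+       /* quiesce the array while the worker groups are swapped */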
+       mddev_suspend(mddev);
+
+       old_groups = conf->worker_groups;
+       old_group_cnt = conf->worker_cnt_per_group;
+
+       conf->worker_groups = NULL;
+       err = alloc_thread_groups(conf, new);
+       if (err) {
+               conf->worker_groups = old_groups;
+               conf->worker_cnt_per_group = old_group_cnt;
+       } else {
+               if (old_groups)
+                       kfree(old_groups[0].workers);
+               kfree(old_groups);
+       }
+
+       mddev_resume(mddev);
+
+       if (err)
+               return err;
+       return len;
+}
+
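+/* md/group_thread_cnt: writing a non-zero count enables the worker threads */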
+static struct md_sysfs_entry
+raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
+                               raid5_show_group_thread_cnt,
+                               raid5_store_group_thread_cnt);
+
 static struct attribute *raid5_attrs[] =  {
        &raid5_stripecache_size.attr,
        &raid5_stripecache_active.attr,
        &raid5_preread_bypass_threshold.attr,
+       &raid5_group_thread_cnt.attr,
        NULL,
 };
 static struct attribute_group raid5_attrs_group = {
@@ -5000,6 +5254,54 @@ static struct attribute_group raid5_attrs_group = {
        .attrs = raid5_attrs,
 };
 
+static int alloc_thread_groups(struct r5conf *conf, int cnt)
+{
+       int i, j;
+       ssize_t size;
+       struct r5worker *workers;
+
+       conf->worker_cnt_per_group = cnt;
+       if (cnt == 0) {
+               conf->worker_groups = NULL;
+               return 0;
+       }
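+       /* one worker group per NUMA node, each with 'cnt' workers */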
+       conf->group_cnt = num_possible_nodes();
+       size = sizeof(struct r5worker) * cnt;
+       workers = kzalloc(size * conf->group_cnt, GFP_NOIO);
+       conf->worker_groups = kzalloc(sizeof(struct r5worker_group) *
+                               conf->group_cnt, GFP_NOIO);
+       if (!conf->worker_groups || !workers) {
+               kfree(workers);
+               kfree(conf->worker_groups);
+               conf->worker_groups = NULL;
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < conf->group_cnt; i++) {
+               struct r5worker_group *group;
+
+               group = &conf->worker_groups[i];
+               INIT_LIST_HEAD(&group->handle_list);
+               group->conf = conf;
+               group->workers = workers + i * cnt;
+
+               for (j = 0; j < cnt; j++) {
+                       group->workers[j].group = group;
+                       INIT_WORK(&group->workers[j].work, raid5_do_work);
+               }
+       }
+
+       return 0;
+}
+
+static void free_thread_groups(struct r5conf *conf)
+{
+       if (conf->worker_groups)
+               kfree(conf->worker_groups[0].workers);
+       kfree(conf->worker_groups);
+       conf->worker_groups = NULL;
+}
+
 static sector_t
 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 {
@@ -5040,6 +5342,7 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+       free_thread_groups(conf);
        shrink_stripes(conf);
        raid5_free_percpu(conf);
        kfree(conf->disks);
@@ -5168,7 +5471,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
        if (conf == NULL)
                goto abort;
+       /* Don't enable multi-threading by default */
+       if (alloc_thread_groups(conf, 0))
+               goto abort;
        spin_lock_init(&conf->device_lock);
+       seqcount_init(&conf->gen_lock);
        init_waitqueue_head(&conf->wait_for_stripe);
        init_waitqueue_head(&conf->wait_for_overlap);
        INIT_LIST_HEAD(&conf->handle_list);
@@ -5176,6 +5483,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        INIT_LIST_HEAD(&conf->delayed_list);
        INIT_LIST_HEAD(&conf->bitmap_list);
        INIT_LIST_HEAD(&conf->inactive_list);
+       init_llist_head(&conf->released_stripes);
        atomic_set(&conf->active_stripes, 0);
        atomic_set(&conf->preread_active_stripes, 0);
        atomic_set(&conf->active_aligned_reads, 0);
@@ -5980,6 +6288,7 @@ static int raid5_start_reshape(struct mddev *mddev)
 
        atomic_set(&conf->reshape_stripes, 0);
        spin_lock_irq(&conf->device_lock);
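+       /* bump gen_lock so make_request() retries stale stripe mappings */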
+       write_seqcount_begin(&conf->gen_lock);
        conf->previous_raid_disks = conf->raid_disks;
        conf->raid_disks += mddev->delta_disks;
        conf->prev_chunk_sectors = conf->chunk_sectors;
@@ -5996,8 +6305,16 @@ static int raid5_start_reshape(struct mddev *mddev)
        else
                conf->reshape_progress = 0;
        conf->reshape_safe = conf->reshape_progress;
+       write_seqcount_end(&conf->gen_lock);
        spin_unlock_irq(&conf->device_lock);
 
+       /* Now make sure any requests that proceeded on the assumption that
+        * the reshape wasn't running - like Discard or Read - have
+        * completed.
+        */
+       mddev_suspend(mddev);
+       mddev_resume(mddev);
+
        /* Add some new drives, as many as will fit.
         * We know there are enough to make the newly sized array work.
         * Don't add devices if we are reducing the number of
@@ -6472,6 +6789,10 @@ static struct md_personality raid4_personality =
 
 static int __init raid5_init(void)
 {
+       raid5_wq = alloc_workqueue("raid5wq",
+               WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
+       if (!raid5_wq)
+               return -ENOMEM;
        register_md_personality(&raid6_personality);
        register_md_personality(&raid5_personality);
        register_md_personality(&raid4_personality);
@@ -6483,6 +6804,7 @@ static void raid5_exit(void)
        unregister_md_personality(&raid6_personality);
        unregister_md_personality(&raid5_personality);
        unregister_md_personality(&raid4_personality);
+       destroy_workqueue(raid5_wq);
 }
 
 module_init(raid5_init);