diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 7e84baccf0ad6c90f7a3f62ec084007003319b21..be70d38745f7a7f5da51bd4ba83a29afe851b238 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -130,10 +130,11 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 struct dm_thin_new_mapping;
 
 /*
- * The pool runs in 3 modes.  Ordered in degraded order for comparisons.
+ * The pool runs in 4 modes.  Ordered in degraded order for comparisons.
  */
 enum pool_mode {
        PM_WRITE,               /* metadata may be changed */
+       PM_OUT_OF_DATA_SPACE,   /* metadata may be changed, though data may not be allocated */
        PM_READ_ONLY,           /* metadata may not be changed */
        PM_FAIL,                /* all I/O fails */
 };
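
A note on the enum: the modes are declared from least to most degraded, so the
declaration order itself is what makes "degraded order for comparisons" work.
A minimal standalone sketch of that idea (names are illustrative, not from the
patch):

	#include <stdio.h>

	enum pool_mode {
		PM_WRITE,		/* least degraded */
		PM_OUT_OF_DATA_SPACE,
		PM_READ_ONLY,
		PM_FAIL,		/* most degraded */
	};

	/* Hypothetical helper: nonzero if new_mode would be a recovery. */
	static int mode_is_upgrade(enum pool_mode old, enum pool_mode new)
	{
		return new < old;
	}

	int main(void)
	{
		printf("%d\n", mode_is_upgrade(PM_READ_ONLY, PM_WRITE));	/* 1 */
		printf("%d\n", mode_is_upgrade(PM_WRITE, PM_FAIL));		/* 0 */
		return 0;
	}
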
@@ -198,7 +199,6 @@ struct pool {
 };
 
 static enum pool_mode get_pool_mode(struct pool *pool);
-static void out_of_data_space(struct pool *pool);
 static void metadata_operation_failed(struct pool *pool, const char *op, int r);
 
 /*
@@ -226,6 +226,7 @@ struct thin_c {
 
        struct pool *pool;
        struct dm_thin_device *td;
+       bool requeue_mode:1;
 };
 
 /*----------------------------------------------------------------*/
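
The new requeue_mode flag is a one-bit bool.  It appears to need no dedicated
locking: as the hunks below show, it is only written from the pool's workqueue
(do_noflush_start()/do_noflush_stop()), while the map and deferred-bio paths
merely read it and bounce bios with DM_ENDIO_REQUEUE when it is set.
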
@@ -369,14 +370,18 @@ struct dm_thin_endio_hook {
        struct dm_thin_new_mapping *overwrite_mapping;
 };
 
-static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
+static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 {
        struct bio *bio;
        struct bio_list bios;
+       unsigned long flags;
 
        bio_list_init(&bios);
+
+       spin_lock_irqsave(&tc->pool->lock, flags);
        bio_list_merge(&bios, master);
        bio_list_init(master);
+       spin_unlock_irqrestore(&tc->pool->lock, flags);
 
        while ((bio = bio_list_pop(&bios))) {
                struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@ -391,12 +396,26 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 static void requeue_io(struct thin_c *tc)
 {
        struct pool *pool = tc->pool;
+
+       requeue_bio_list(tc, &pool->deferred_bios);
+       requeue_bio_list(tc, &pool->retry_on_resume_list);
+}
+
+static void error_retry_list(struct pool *pool)
+{
+       struct bio *bio;
        unsigned long flags;
+       struct bio_list bios;
+
+       bio_list_init(&bios);
 
        spin_lock_irqsave(&pool->lock, flags);
-       __requeue_bio_list(tc, &pool->deferred_bios);
-       __requeue_bio_list(tc, &pool->retry_on_resume_list);
+       bio_list_merge(&bios, &pool->retry_on_resume_list);
+       bio_list_init(&pool->retry_on_resume_list);
        spin_unlock_irqrestore(&pool->lock, flags);
+
+       while ((bio = bio_list_pop(&bios)))
+               bio_io_error(bio);
 }
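
requeue_bio_list() now takes the pool lock itself (hence the dropped
double-underscore "caller holds the lock" prefix), and the new
error_retry_list() uses the same idiom: detach the whole list while holding
the spinlock, then complete each bio with the lock dropped, so bio completion
never runs under the spinlock.  A userspace sketch of that detach-then-drain
idiom, with a pthread mutex standing in for the kernel spinlock (all names
illustrative):

	#include <pthread.h>
	#include <stddef.h>

	struct node { struct node *next; };

	static struct node *retry_list;		/* protected by list_lock */
	static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

	static void drain_retry_list(void (*complete)(struct node *))
	{
		struct node *bios, *n;

		/* Detach the entire list under the lock... */
		pthread_mutex_lock(&list_lock);
		bios = retry_list;
		retry_list = NULL;
		pthread_mutex_unlock(&list_lock);

		/* ...then complete each entry with the lock dropped
		 * (bio_io_error() in error_retry_list()). */
		while ((n = bios)) {
			bios = n->next;
			complete(n);
		}
	}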
 
 /*
@@ -925,13 +944,15 @@ static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
        }
 }
 
+static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
+
 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 {
        int r;
        dm_block_t free_blocks;
        struct pool *pool = tc->pool;
 
-       if (get_pool_mode(pool) != PM_WRITE)
+       if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
                return -EINVAL;
 
        r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
@@ -958,7 +979,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
                }
 
                if (!free_blocks) {
-                       out_of_data_space(pool);
+                       set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
                        return -ENOSPC;
                }
        }
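
Two changes in alloc_data_block(): the mode test becomes a WARN_ON(), since
callers should never ask a non-PM_WRITE pool for a data block (a
programming-error trap rather than an expected path), and exhausting the data
space now switches the pool into the new PM_OUT_OF_DATA_SPACE mode while
still returning -ENOSPC, which callers elsewhere in the file map to
retry_bios_on_resume().
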
@@ -988,15 +1009,32 @@ static void retry_on_resume(struct bio *bio)
        spin_unlock_irqrestore(&pool->lock, flags);
 }
 
-static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
+static bool should_error_unserviceable_bio(struct pool *pool)
 {
-       /*
-        * When pool is read-only, no cell locking is needed because
-        * nothing is changing.
-        */
-       WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY);
+       enum pool_mode m = get_pool_mode(pool);
+
+       switch (m) {
+       case PM_WRITE:
+               /* Shouldn't get here */
+               DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
+               return true;
+
+       case PM_OUT_OF_DATA_SPACE:
+               return pool->pf.error_if_no_space;
 
-       if (pool->pf.error_if_no_space)
+       case PM_READ_ONLY:
+       case PM_FAIL:
+               return true;
+       default:
+               /* Shouldn't get here */
+               DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
+               return true;
+       }
+}
+
+static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
+{
+       if (should_error_unserviceable_bio(pool))
                bio_io_error(bio);
        else
                retry_on_resume(bio);
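
The open-coded check is replaced by a single predicate so the policy lives in
one place.  Summarizing the cases above:

	pool mode		unserviceable bio is...
	PM_WRITE		errored (and logged; should not happen)
	PM_OUT_OF_DATA_SPACE	errored iff error_if_no_space is set,
				otherwise retried on resume
	PM_READ_ONLY/PM_FAIL	errored
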
@@ -1007,11 +1045,20 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
        struct bio *bio;
        struct bio_list bios;
 
+       if (should_error_unserviceable_bio(pool)) {
+               cell_error(pool, cell);
+               return;
+       }
+
        bio_list_init(&bios);
        cell_release(pool, cell, &bios);
 
-       while ((bio = bio_list_pop(&bios)))
-               handle_unserviceable_bio(pool, bio);
+       if (should_error_unserviceable_bio(pool))
+               while ((bio = bio_list_pop(&bios)))
+                       bio_io_error(bio);
+       else
+               while ((bio = bio_list_pop(&bios)))
+                       retry_on_resume(bio);
 }
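
Note that retry_bios_on_resume() samples the mode twice: once up front so an
entire cell can be failed cheaply with cell_error() before anything is
released, and again after cell_release(), presumably because set_pool_mode()
can run concurrently and each freed bio should see the freshest policy.
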
 
 static void process_discard(struct thin_c *tc, struct bio *bio)
@@ -1296,6 +1343,11 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
        }
 }
 
+static void process_bio_success(struct thin_c *tc, struct bio *bio)
+{
+       bio_endio(bio, 0);
+}
+
 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
 {
        bio_io_error(bio);
@@ -1328,6 +1380,11 @@ static void process_deferred_bios(struct pool *pool)
                struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
                struct thin_c *tc = h->tc;
 
+               if (tc->requeue_mode) {
+                       bio_endio(bio, DM_ENDIO_REQUEUE);
+                       continue;
+               }
+
                /*
                 * If we've got no free new_mapping structs, and processing
                 * this bio might require one, we pause until there are some
@@ -1394,51 +1451,134 @@ static void do_waker(struct work_struct *ws)
 
 /*----------------------------------------------------------------*/
 
+struct noflush_work {
+       struct work_struct worker;
+       struct thin_c *tc;
+
+       atomic_t complete;
+       wait_queue_head_t wait;
+};
+
+static void complete_noflush_work(struct noflush_work *w)
+{
+       atomic_set(&w->complete, 1);
+       wake_up(&w->wait);
+}
+
+static void do_noflush_start(struct work_struct *ws)
+{
+       struct noflush_work *w = container_of(ws, struct noflush_work, worker);
+       w->tc->requeue_mode = true;
+       requeue_io(w->tc);
+       complete_noflush_work(w);
+}
+
+static void do_noflush_stop(struct work_struct *ws)
+{
+       struct noflush_work *w = container_of(ws, struct noflush_work, worker);
+       w->tc->requeue_mode = false;
+       complete_noflush_work(w);
+}
+
+static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
+{
+       struct noflush_work w;
+
+       INIT_WORK(&w.worker, fn);
+       w.tc = tc;
+       atomic_set(&w.complete, 0);
+       init_waitqueue_head(&w.wait);
+
+       queue_work(tc->pool->wq, &w.worker);
+
+       wait_event(w.wait, atomic_read(&w.complete));
+}
+
+/*----------------------------------------------------------------*/
+
 static enum pool_mode get_pool_mode(struct pool *pool)
 {
        return pool->pf.mode;
 }
 
+static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
+{
+       dm_table_event(pool->ti->table);
+       DMINFO("%s: switching pool to %s mode",
+              dm_device_name(pool->pool_md), new_mode);
+}
+
 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 {
-       int r;
-       enum pool_mode old_mode = pool->pf.mode;
+       struct pool_c *pt = pool->ti->private;
+       bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
+       enum pool_mode old_mode = get_pool_mode(pool);
+
+       /*
+        * Never allow the pool to transition to PM_WRITE mode if user
+        * intervention is required to verify metadata and data consistency.
+        */
+       if (new_mode == PM_WRITE && needs_check) {
+               DMERR("%s: unable to switch pool to write mode until repaired.",
+                     dm_device_name(pool->pool_md));
+               if (old_mode != new_mode)
+                       new_mode = old_mode;
+               else
+                       new_mode = PM_READ_ONLY;
+       }
+       /*
+        * If we were in PM_FAIL mode, rollback of metadata failed.  We're
+        * not going to recover without a thin_repair.  So we never let the
+        * pool move out of the old mode.
+        */
+       if (old_mode == PM_FAIL)
+               new_mode = old_mode;
 
        switch (new_mode) {
        case PM_FAIL:
                if (old_mode != new_mode)
-                       DMERR("%s: switching pool to failure mode",
-                             dm_device_name(pool->pool_md));
+                       notify_of_pool_mode_change(pool, "failure");
                dm_pool_metadata_read_only(pool->pmd);
                pool->process_bio = process_bio_fail;
                pool->process_discard = process_bio_fail;
                pool->process_prepared_mapping = process_prepared_mapping_fail;
                pool->process_prepared_discard = process_prepared_discard_fail;
+
+               error_retry_list(pool);
                break;
 
        case PM_READ_ONLY:
                if (old_mode != new_mode)
-                       DMERR("%s: switching pool to read-only mode",
-                             dm_device_name(pool->pool_md));
-               r = dm_pool_abort_metadata(pool->pmd);
-               if (r) {
-                       DMERR("%s: aborting transaction failed",
-                             dm_device_name(pool->pool_md));
-                       new_mode = PM_FAIL;
-                       set_pool_mode(pool, new_mode);
-               } else {
-                       dm_pool_metadata_read_only(pool->pmd);
-                       pool->process_bio = process_bio_read_only;
-                       pool->process_discard = process_discard;
-                       pool->process_prepared_mapping = process_prepared_mapping_fail;
-                       pool->process_prepared_discard = process_prepared_discard_passdown;
-               }
+                       notify_of_pool_mode_change(pool, "read-only");
+               dm_pool_metadata_read_only(pool->pmd);
+               pool->process_bio = process_bio_read_only;
+               pool->process_discard = process_bio_success;
+               pool->process_prepared_mapping = process_prepared_mapping_fail;
+               pool->process_prepared_discard = process_prepared_discard_passdown;
+
+               error_retry_list(pool);
+               break;
+
+       case PM_OUT_OF_DATA_SPACE:
+               /*
+                * Ideally we'd never hit this state; the low water mark
+                * would trigger userland to extend the pool before we
+                * completely run out of data space.  However, many small
+                * IOs to unprovisioned space can consume data space at an
+                * alarming rate.  Adjust your low water mark if you're
+                * frequently seeing this mode.
+                */
+               if (old_mode != new_mode)
+                       notify_of_pool_mode_change(pool, "out-of-data-space");
+               pool->process_bio = process_bio_read_only;
+               pool->process_discard = process_discard;
+               pool->process_prepared_mapping = process_prepared_mapping;
+               pool->process_prepared_discard = process_prepared_discard_passdown;
                break;
 
        case PM_WRITE:
                if (old_mode != new_mode)
-                       DMINFO("%s: switching pool to write mode",
-                              dm_device_name(pool->pool_md));
+                       notify_of_pool_mode_change(pool, "write");
                dm_pool_metadata_read_write(pool->pmd);
                pool->process_bio = process_bio;
                pool->process_discard = process_discard;
@@ -1448,32 +1588,35 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
        }
 
        pool->pf.mode = new_mode;
+       /*
+        * The pool mode may have changed, sync it so bind_control_target()
+        * doesn't cause an unexpected mode transition on resume.
+        */
+       pt->adjusted_pf.mode = new_mode;
 }
 
-/*
- * Rather than calling set_pool_mode directly, use these which describe the
- * reason for mode degradation.
- */
-static void out_of_data_space(struct pool *pool)
+static void abort_transaction(struct pool *pool)
 {
-       DMERR_LIMIT("%s: no free data space available.",
-                   dm_device_name(pool->pool_md));
-       set_pool_mode(pool, PM_READ_ONLY);
+       const char *dev_name = dm_device_name(pool->pool_md);
+
+       DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
+       if (dm_pool_abort_metadata(pool->pmd)) {
+               DMERR("%s: failed to abort metadata transaction", dev_name);
+               set_pool_mode(pool, PM_FAIL);
+       }
+
+       if (dm_pool_metadata_set_needs_check(pool->pmd)) {
+               DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
+               set_pool_mode(pool, PM_FAIL);
+       }
 }
 
 static void metadata_operation_failed(struct pool *pool, const char *op, int r)
 {
-       dm_block_t free_blocks;
-
        DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
                    dm_device_name(pool->pool_md), op, r);
 
-       if (r == -ENOSPC &&
-           !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
-           !free_blocks)
-               DMERR_LIMIT("%s: no free metadata space available.",
-                           dm_device_name(pool->pool_md));
-
+       abort_transaction(pool);
        set_pool_mode(pool, PM_READ_ONLY);
 }
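
Two things worth noting in this large hunk.  First, the failure policy: any
failed metadata operation now aborts the current transaction and sets the
persistent needs_check flag; that flag blocks transitions back to PM_WRITE in
set_pool_mode() and, in the resize hunks below, blocks growing the data and
metadata devices until the user has repaired the pool (with the userspace
thin-provisioning tools).  Second, noflush_work() lets the suspend path run a
function on the pool's workqueue and block until it finishes.  The work item
lives on the caller's stack, which is safe because the worker signals
completion last and the caller's frame outlives the wait_event().  A
userspace analogue of that hand-off, with a pthread condvar standing in for
the waitqueue (illustrative names, initialization of lock/done elided):

	#include <pthread.h>
	#include <stdbool.h>

	/* On-stack request handed to a worker thread; the submitter blocks
	 * until the worker marks it complete, mirroring noflush_work(). */
	struct sync_work {
		bool complete;			/* protected by lock */
		pthread_mutex_t lock;
		pthread_cond_t done;
	};

	/* Worker side: called last, after the request has been acted on. */
	static void complete_sync_work(struct sync_work *w)
	{
		pthread_mutex_lock(&w->lock);
		w->complete = true;
		pthread_cond_signal(&w->done);
		pthread_mutex_unlock(&w->lock);
	}

	/* Submitter side: only after this returns may the stack frame
	 * holding *w be unwound. */
	static void wait_sync_work(struct sync_work *w)
	{
		pthread_mutex_lock(&w->lock);
		while (!w->complete)
			pthread_cond_wait(&w->done, &w->lock);
		pthread_mutex_unlock(&w->lock);
	}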
 
@@ -1524,6 +1667,11 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
 
        thin_hook_bio(tc, bio);
 
+       if (tc->requeue_mode) {
+               bio_endio(bio, DM_ENDIO_REQUEUE);
+               return DM_MAPIO_SUBMITTED;
+       }
+
        if (get_pool_mode(tc->pool) == PM_FAIL) {
                bio_io_error(bio);
                return DM_MAPIO_SUBMITTED;
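
Completing the bio with DM_ENDIO_REQUEUE and returning DM_MAPIO_SUBMITTED
tells dm core the target has consumed the bio; core's completion path
recognizes the requeue code and resubmits the bio once the device is resumed,
which is exactly the no-flush-suspend semantic being implemented here.
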
@@ -1687,7 +1835,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
        /*
         * We want to make sure that a pool in PM_FAIL mode is never upgraded.
         */
-       enum pool_mode old_mode = pool->pf.mode;
+       enum pool_mode old_mode = get_pool_mode(pool);
        enum pool_mode new_mode = pt->adjusted_pf.mode;
 
        /*
@@ -1701,16 +1849,6 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
        pool->pf = pt->adjusted_pf;
        pool->low_water_blocks = pt->low_water_blocks;
 
-       /*
-        * If we were in PM_FAIL mode, rollback of metadata failed.  We're
-        * not going to recover without a thin_repair.  So we never let the
-        * pool move out of the old mode.  On the other hand a PM_READ_ONLY
-        * may have been due to a lack of metadata or data space, and may
-        * now work (ie. if the underlying devices have been resized).
-        */
-       if (old_mode == PM_FAIL)
-               new_mode = old_mode;
-
        set_pool_mode(pool, new_mode);
 
        return 0;
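
The PM_FAIL stickiness check moves out of bind_control_target() and into
set_pool_mode() itself (see the earlier hunk), so the "never leave failure
mode without repair" policy now applies to every transition, not just the
ones triggered by a table reload.
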
@@ -2253,6 +2391,12 @@ static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
                return -EINVAL;
 
        } else if (data_size > sb_data_size) {
+               if (dm_pool_metadata_needs_check(pool->pmd)) {
+                       DMERR("%s: unable to grow the data device until repaired.",
+                             dm_device_name(pool->pool_md));
+                       return 0;
+               }
+
                if (sb_data_size)
                        DMINFO("%s: growing the data device from %llu to %llu blocks",
                               dm_device_name(pool->pool_md),
@@ -2294,6 +2438,12 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
                return -EINVAL;
 
        } else if (metadata_dev_size > sb_metadata_dev_size) {
+               if (dm_pool_metadata_needs_check(pool->pmd)) {
+                       DMERR("%s: unable to grow the metadata device until repaired.",
+                             dm_device_name(pool->pool_md));
+                       return 0;
+               }
+
                warn_if_metadata_device_too_big(pool->md_dev);
                DMINFO("%s: growing the metadata device from %llu to %llu blocks",
                       dm_device_name(pool->pool_md),
@@ -2681,7 +2831,9 @@ static void pool_status(struct dm_target *ti, status_type_t type,
                else
                        DMEMIT("- ");
 
-               if (pool->pf.mode == PM_READ_ONLY)
+               if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
+                       DMEMIT("out_of_data_space ");
+               else if (pool->pf.mode == PM_READ_ONLY)
                        DMEMIT("ro ");
                else
                        DMEMIT("rw ");
@@ -2795,7 +2947,7 @@ static struct target_type pool_target = {
        .name = "thin-pool",
        .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
                    DM_TARGET_IMMUTABLE,
-       .version = {1, 10, 0},
+       .version = {1, 11, 0},
        .module = THIS_MODULE,
        .ctr = pool_ctr,
        .dtr = pool_dtr,
@@ -2997,10 +3149,23 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
        return 0;
 }
 
-static void thin_postsuspend(struct dm_target *ti)
+static void thin_presuspend(struct dm_target *ti)
 {
+       struct thin_c *tc = ti->private;
+
        if (dm_noflush_suspending(ti))
-               requeue_io((struct thin_c *)ti->private);
+               noflush_work(tc, do_noflush_start);
+}
+
+static void thin_postsuspend(struct dm_target *ti)
+{
+       struct thin_c *tc = ti->private;
+
+       /*
+        * The dm_noflush_suspending flag has been cleared by now, so
+        * unfortunately we must always run this.
+        */
+       noflush_work(tc, do_noflush_stop);
 }
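
The suspend hooks now come in a pair: thin_presuspend() enters requeue mode
through the worker (so deferred bios are bounced with DM_ENDIO_REQUEUE rather
than flushed), and thin_postsuspend() unconditionally leaves it, since dm
core has already cleared the noflush flag by the time postsuspend runs and
the target can no longer tell which kind of suspend this was.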
 
 /*
@@ -3085,12 +3250,13 @@ static int thin_iterate_devices(struct dm_target *ti,
 
 static struct target_type thin_target = {
        .name = "thin",
-       .version = {1, 10, 0},
+       .version = {1, 11, 0},
        .module = THIS_MODULE,
        .ctr = thin_ctr,
        .dtr = thin_dtr,
        .map = thin_map,
        .end_io = thin_endio,
+       .presuspend = thin_presuspend,
        .postsuspend = thin_postsuspend,
        .status = thin_status,
        .iterate_devices = thin_iterate_devices,