Merge branch 'drbd-8.4_ed6' into for-3.8-drivers-drbd-8.4_ed6
author Philipp Reisner <philipp.reisner@linbit.com>
Fri, 9 Nov 2012 13:18:43 +0000 (14:18 +0100)
committer Philipp Reisner <philipp.reisner@linbit.com>
Fri, 9 Nov 2012 13:20:23 +0000 (14:20 +0100)
drivers/block/drbd/drbd_bitmap.c
drivers/block/drbd/drbd_int.h
drivers/block/drbd/drbd_interval.c
drivers/block/drbd/drbd_main.c
drivers/block/drbd/drbd_nl.c
drivers/block/drbd/drbd_receiver.c
drivers/block/drbd/drbd_req.c
include/linux/idr.h

index 8d8069758042601f5a02c060a265e43e3d1401d8,e30ff720894f78abb636fb9404ffe05ad28bb77f..1ab205a4bf69c1b3e1ea733ebf716bd1bd83c31a
@@@ -119,13 -119,9 +119,9 @@@ static void __bm_print_lock_info(struc
        if (!__ratelimit(&drbd_ratelimit_state))
                return;
        dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
-           current == mdev->receiver.task ? "receiver" :
-           current == mdev->asender.task  ? "asender"  :
-           current == mdev->worker.task   ? "worker"   : current->comm,
-           func, b->bm_why ?: "?",
-           b->bm_task == mdev->receiver.task ? "receiver" :
-           b->bm_task == mdev->asender.task  ? "asender"  :
-           b->bm_task == mdev->worker.task   ? "worker"   : "?");
+               drbd_task_to_thread_name(mdev->tconn, current),
+               func, b->bm_why ?: "?",
+               drbd_task_to_thread_name(mdev->tconn, b->bm_task));
  }
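
/* This and the following hunk replace the open-coded task->comm ternaries
 * with a shared helper.  A minimal sketch of that helper, assuming the
 * per-connection threads and the name[] field added to struct drbd_thread
 * later in this diff (the in-tree version lives in drbd_main.c): */
char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task)
{
	struct drbd_thread *thi =
		task == tconn->receiver.task ? &tconn->receiver :
		task == tconn->asender.task  ? &tconn->asender  :
		task == tconn->worker.task   ? &tconn->worker   : NULL;
	return thi ? thi->name : task->comm;	/* fall back to comm for foreign tasks */
}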
  
  void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags)
  
        if (trylock_failed) {
                dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
-                   current == mdev->receiver.task ? "receiver" :
-                   current == mdev->asender.task  ? "asender"  :
-                   current == mdev->worker.task   ? "worker"   : current->comm,
-                   why, b->bm_why ?: "?",
-                   b->bm_task == mdev->receiver.task ? "receiver" :
-                   b->bm_task == mdev->asender.task  ? "asender"  :
-                   b->bm_task == mdev->worker.task   ? "worker"   : "?");
+                        drbd_task_to_thread_name(mdev->tconn, current),
+                        why, b->bm_why ?: "?",
+                        drbd_task_to_thread_name(mdev->tconn, b->bm_task));
                mutex_lock(&b->bm_change);
        }
        if (BM_LOCKED_MASK & b->bm_flags)
@@@ -196,6 -188,9 +188,9 @@@ void drbd_bm_unlock(struct drbd_conf *m
  /* to mark for lazy writeout once syncer cleared all clearable bits,
   * we check if bits have been cleared since last IO. */
  #define BM_PAGE_LAZY_WRITEOUT 28
+ /* pages marked with this "HINT" will be considered for writeout
+  * on activity log transactions */
+ #define BM_PAGE_HINT_WRITEOUT 27
  
  /* store_page_idx uses non-atomic assignment. It is only used directly after
   * allocating the page.  All other bm_set_page_* and bm_clear_page_* need to
@@@ -227,8 -222,7 +222,7 @@@ static void bm_page_unlock_io(struct dr
  {
        struct drbd_bitmap *b = mdev->bitmap;
        void *addr = &page_private(b->bm_pages[page_nr]);
-       clear_bit(BM_PAGE_IO_LOCK, addr);
-       smp_mb__after_clear_bit();
+       clear_bit_unlock(BM_PAGE_IO_LOCK, addr);
        wake_up(&mdev->bitmap->bm_io_wait);
  }
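
/* clear_bit_unlock() folds the required barrier into clearing the bit: on
 * the generic implementation it expands to
 *
 *	smp_mb__before_clear_bit();
 *	clear_bit(nr, addr);
 *
 * giving release semantics that pair with the test_and_set_bit() acquire
 * in bm_page_lock_io() (not shown in this hunk), so the separate
 * smp_mb__after_clear_bit() is no longer needed before the wake_up(). */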
  
@@@ -246,6 -240,27 +240,27 @@@ static void bm_set_page_need_writeout(s
        set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
  }
  
+ /**
+  * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout
+  * @mdev:     DRBD device.
+  * @page_nr:  the bitmap page to mark with the "hint" flag
+  *
+  * From within an activity log transaction, we mark a few pages with these
+  * hints, then call drbd_bm_write_hinted(), which will only write out changed
+  * pages which are flagged with this mark.
+  */
+ void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr)
+ {
+       struct page *page;
+       if (page_nr >= mdev->bitmap->bm_number_of_pages) {
+               dev_warn(DEV, "BAD: page_nr: %u, number_of_pages: %u\n",
+                        page_nr, (int)mdev->bitmap->bm_number_of_pages);
+               return;
+       }
+       page = mdev->bitmap->bm_pages[page_nr];
+       set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page));
+ }
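
/* Usage sketch for the new hint machinery, restating the kernel-doc
 * above; the caller below is hypothetical, real callers sit in the
 * activity log transaction code: */
static void al_example_writeout(struct drbd_conf *mdev, int page_nr)
{
	drbd_bm_mark_for_writeout(mdev, page_nr); /* sets BM_PAGE_HINT_WRITEOUT */
	drbd_bm_write_hinted(mdev);               /* flushes only hinted, changed pages */
}
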
  static int bm_test_page_unchanged(struct page *page)
  {
        volatile const unsigned long *addr = &page_private(page);
@@@ -289,25 -304,25 +304,25 @@@ static unsigned int bm_bit_to_page_idx(
        return page_nr;
  }
  
 -static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx, const enum km_type km)
 +static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
  {
        struct page *page = b->bm_pages[idx];
 -      return (unsigned long *) kmap_atomic(page, km);
 +      return (unsigned long *) kmap_atomic(page);
  }
  
  static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
  {
 -      return __bm_map_pidx(b, idx, KM_IRQ1);
 +      return __bm_map_pidx(b, idx);
  }
  
 -static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
 +static void __bm_unmap(unsigned long *p_addr)
  {
 -      kunmap_atomic(p_addr, km);
 +      kunmap_atomic(p_addr);
  };
  
  static void bm_unmap(unsigned long *p_addr)
  {
 -      return __bm_unmap(p_addr, KM_IRQ1);
 +      return __bm_unmap(p_addr);
  }
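
/* The hunks above track a kernel-wide API change: kmap_atomic() lost its
 * km_type slot argument, and the core now manages the per-CPU mapping
 * slots implicitly.  Before/after, schematically:
 *
 *	p = kmap_atomic(page, KM_IRQ1);   ...   kunmap_atomic(p, KM_IRQ1);
 *	p = kmap_atomic(page);            ...   kunmap_atomic(p);
 */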
  
  /* long word offset of _bitmap_ sector */
@@@ -376,18 -391,19 +391,18 @@@ static struct page **bm_realloc_pages(s
         * GFP_NOIO, as this is called while drbd IO is "suspended",
         * and during resize or attach on diskless Primary,
         * we must not block on IO to ourselves.
-        * Context is receiver thread or cqueue thread/dmsetup.  */
+        * Context is receiver thread or dmsetup. */
        bytes = sizeof(struct page *)*want;
 -      new_pages = kmalloc(bytes, GFP_NOIO);
 +      new_pages = kzalloc(bytes, GFP_NOIO);
        if (!new_pages) {
                new_pages = __vmalloc(bytes,
 -                              GFP_NOIO | __GFP_HIGHMEM,
 +                              GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO,
                                PAGE_KERNEL);
                if (!new_pages)
                        return NULL;
                vmalloced = 1;
        }
  
 -      memset(new_pages, 0, bytes);
        if (want >= have) {
                for (i = 0; i < have; i++)
                        new_pages[i] = old_pages[i];
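
/* The allocation above switches to zeroing allocators, so both paths
 * return pre-cleared memory and the explicit memset() dropped in this
 * hunk becomes redundant.  The resulting pattern:
 *
 *	new_pages = kzalloc(bytes, GFP_NOIO);
 *	if (!new_pages)
 *		new_pages = __vmalloc(bytes,
 *				GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO,
 *				PAGE_KERNEL);
 */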
@@@ -441,7 -457,8 +456,8 @@@ int drbd_bm_init(struct drbd_conf *mdev
  
  sector_t drbd_bm_capacity(struct drbd_conf *mdev)
  {
-       ERR_IF(!mdev->bitmap) return 0;
+       if (!expect(mdev->bitmap))
+               return 0;
        return mdev->bitmap->bm_dev_capacity;
  }
  
   */
  void drbd_bm_cleanup(struct drbd_conf *mdev)
  {
-       ERR_IF (!mdev->bitmap) return;
+       if (!expect(mdev->bitmap))
+               return;
        bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
        bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags));
        kfree(mdev->bitmap);
@@@ -545,15 -563,15 +562,15 @@@ static unsigned long bm_count_bits(stru
  
        /* all but last page */
        for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
 -              p_addr = __bm_map_pidx(b, idx, KM_USER0);
 +              p_addr = __bm_map_pidx(b, idx);
                for (i = 0; i < LWPP; i++)
                        bits += hweight_long(p_addr[i]);
 -              __bm_unmap(p_addr, KM_USER0);
 +              __bm_unmap(p_addr);
                cond_resched();
        }
        /* last (or only) page */
        last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
 -      p_addr = __bm_map_pidx(b, idx, KM_USER0);
 +      p_addr = __bm_map_pidx(b, idx);
        for (i = 0; i < last_word; i++)
                bits += hweight_long(p_addr[i]);
        p_addr[last_word] &= cpu_to_lel(mask);
        /* 32bit arch, may have an unused padding long */
        if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
                p_addr[last_word+1] = 0;
 -      __bm_unmap(p_addr, KM_USER0);
 +      __bm_unmap(p_addr);
        return bits;
  }
  
@@@ -612,7 -630,8 +629,8 @@@ int drbd_bm_resize(struct drbd_conf *md
        int err = 0, growing;
        int opages_vmalloced;
  
-       ERR_IF(!b) return -ENOMEM;
+       if (!expect(b))
+               return -ENOMEM;
  
        drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK);
  
@@@ -734,8 -753,10 +752,10 @@@ unsigned long _drbd_bm_total_weight(str
        unsigned long s;
        unsigned long flags;
  
-       ERR_IF(!b) return 0;
-       ERR_IF(!b->bm_pages) return 0;
+       if (!expect(b))
+               return 0;
+       if (!expect(b->bm_pages))
+               return 0;
  
        spin_lock_irqsave(&b->bm_lock, flags);
        s = b->bm_set;
@@@ -758,8 -779,10 +778,10 @@@ unsigned long drbd_bm_total_weight(stru
  size_t drbd_bm_words(struct drbd_conf *mdev)
  {
        struct drbd_bitmap *b = mdev->bitmap;
-       ERR_IF(!b) return 0;
-       ERR_IF(!b->bm_pages) return 0;
+       if (!expect(b))
+               return 0;
+       if (!expect(b->bm_pages))
+               return 0;
  
        return b->bm_words;
  }
  unsigned long drbd_bm_bits(struct drbd_conf *mdev)
  {
        struct drbd_bitmap *b = mdev->bitmap;
-       ERR_IF(!b) return 0;
+       if (!expect(b))
+               return 0;
  
        return b->bm_bits;
  }
@@@ -788,8 -812,10 +811,10 @@@ void drbd_bm_merge_lel(struct drbd_con
  
        end = offset + number;
  
-       ERR_IF(!b) return;
-       ERR_IF(!b->bm_pages) return;
+       if (!expect(b))
+               return;
+       if (!expect(b->bm_pages))
+               return;
        if (number == 0)
                return;
        WARN_ON(offset >= b->bm_words);
@@@ -833,8 -859,10 +858,10 @@@ void drbd_bm_get_lel(struct drbd_conf *
  
        end = offset + number;
  
-       ERR_IF(!b) return;
-       ERR_IF(!b->bm_pages) return;
+       if (!expect(b))
+               return;
+       if (!expect(b->bm_pages))
+               return;
  
        spin_lock_irq(&b->bm_lock);
        if ((offset >= b->bm_words) ||
  void drbd_bm_set_all(struct drbd_conf *mdev)
  {
        struct drbd_bitmap *b = mdev->bitmap;
-       ERR_IF(!b) return;
-       ERR_IF(!b->bm_pages) return;
+       if (!expect(b))
+               return;
+       if (!expect(b->bm_pages))
+               return;
  
        spin_lock_irq(&b->bm_lock);
        bm_memset(b, 0, 0xff, b->bm_words);
  void drbd_bm_clear_all(struct drbd_conf *mdev)
  {
        struct drbd_bitmap *b = mdev->bitmap;
-       ERR_IF(!b) return;
-       ERR_IF(!b->bm_pages) return;
+       if (!expect(b))
+               return;
+       if (!expect(b->bm_pages))
+               return;
  
        spin_lock_irq(&b->bm_lock);
        bm_memset(b, 0, 0, b->bm_words);
@@@ -891,7 -923,8 +922,8 @@@ struct bm_aio_ctx 
        unsigned int done;
        unsigned flags;
  #define BM_AIO_COPY_PAGES     1
- #define BM_WRITE_ALL_PAGES    2
+ #define BM_AIO_WRITE_HINTED   2
+ #define BM_WRITE_ALL_PAGES    4
        int error;
        struct kref kref;
  };
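
/* ctx->flags is a bitmask, which is why BM_WRITE_ALL_PAGES moves from 2
 * to 4 to make room for the new hint flag; callers combine flags, e.g.
 * drbd_bm_write_hinted() below passes
 *
 *	bm_rw(mdev, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
 */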
@@@ -981,11 -1014,11 +1013,11 @@@ static void bm_page_io_async(struct bm_
        if (ctx->flags & BM_AIO_COPY_PAGES) {
                void *src, *dest;
                page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT);
 -              dest = kmap_atomic(page, KM_USER0);
 -              src = kmap_atomic(b->bm_pages[page_nr], KM_USER1);
 +              dest = kmap_atomic(page);
 +              src = kmap_atomic(b->bm_pages[page_nr]);
                memcpy(dest, src, PAGE_SIZE);
 -              kunmap_atomic(src, KM_USER1);
 -              kunmap_atomic(dest, KM_USER0);
 +              kunmap_atomic(src);
 +              kunmap_atomic(dest);
                bm_store_page_idx(page, page_nr);
        } else
                page = b->bm_pages[page_nr];
@@@ -1062,6 -1095,11 +1094,11 @@@ static int bm_rw(struct drbd_conf *mdev
                if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
                        break;
                if (rw & WRITE) {
+                       if ((flags & BM_AIO_WRITE_HINTED) &&
+                           !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
+                                   &page_private(b->bm_pages[i])))
+                               continue;
                        if (!(flags & BM_WRITE_ALL_PAGES) &&
                            bm_test_page_unchanged(b->bm_pages[i])) {
                                dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
        else
                kref_put(&ctx->kref, &bm_aio_ctx_destroy);
  
-       dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
-                       rw == WRITE ? "WRITE" : "READ",
-                       count, jiffies - now);
+       /* summary for global bitmap IO */
+       if (flags == 0)
+               dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
+                        rw == WRITE ? "WRITE" : "READ",
+                        count, jiffies - now);
  
        if (ctx->error) {
                dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
        }
        now = b->bm_set;
  
-       dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
-            ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
+       if (flags == 0)
+               dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
+                    ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
  
        kref_put(&ctx->kref, &bm_aio_ctx_destroy);
        return err;
@@@ -1181,9 -1222,17 +1221,17 @@@ int drbd_bm_write_copy_pages(struct drb
        return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0);
  }
  
+ /**
+  * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed.
+  * @mdev:     DRBD device.
+  */
+ int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local)
+ {
+       return bm_rw(mdev, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
+ }
  
  /**
-  * drbd_bm_write_page: Writes a PAGE_SIZE aligned piece of bitmap
+  * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap
   * @mdev:     DRBD device.
   * @idx:      bitmap page index
   *
@@@ -1246,7 -1295,7 +1294,7 @@@ int drbd_bm_write_page(struct drbd_con
   * this returns a bit number, NOT a sector!
   */
  static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
 -      const int find_zero_bit, const enum km_type km)
 +      const int find_zero_bit)
  {
        struct drbd_bitmap *b = mdev->bitmap;
        unsigned long *p_addr;
                while (bm_fo < b->bm_bits) {
                        /* bit offset of the first bit in the page */
                        bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
 -                      p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo), km);
 +                      p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo));
  
                        if (find_zero_bit)
                                i = find_next_zero_bit_le(p_addr,
                                i = find_next_bit_le(p_addr,
                                                PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
  
 -                      __bm_unmap(p_addr, km);
 +                      __bm_unmap(p_addr);
                        if (i < PAGE_SIZE*8) {
                                bm_fo = bit_offset + i;
                                if (bm_fo >= b->bm_bits)
@@@ -1291,14 -1340,16 +1339,16 @@@ static unsigned long bm_find_next(struc
        struct drbd_bitmap *b = mdev->bitmap;
        unsigned long i = DRBD_END_OF_BITMAP;
  
-       ERR_IF(!b) return i;
-       ERR_IF(!b->bm_pages) return i;
+       if (!expect(b))
+               return i;
+       if (!expect(b->bm_pages))
+               return i;
  
        spin_lock_irq(&b->bm_lock);
        if (BM_DONT_TEST & b->bm_flags)
                bm_print_lock_info(mdev);
  
 -      i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
 +      i = __bm_find_next(mdev, bm_fo, find_zero_bit);
  
        spin_unlock_irq(&b->bm_lock);
        return i;
@@@ -1322,13 -1373,13 +1372,13 @@@ unsigned long drbd_bm_find_next_zero(st
  unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
  {
        /* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
 -      return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
 +      return __bm_find_next(mdev, bm_fo, 0);
  }
  
  unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
  {
        /* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
 -      return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
 +      return __bm_find_next(mdev, bm_fo, 1);
  }
  
  /* returns number of bits actually changed.
@@@ -1356,14 -1407,14 +1406,14 @@@ static int __bm_change_bits_to(struct d
                unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
                if (page_nr != last_page_nr) {
                        if (p_addr)
 -                              __bm_unmap(p_addr, KM_IRQ1);
 +                              __bm_unmap(p_addr);
                        if (c < 0)
                                bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
                        else if (c > 0)
                                bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
                        changed_total += c;
                        c = 0;
 -                      p_addr = __bm_map_pidx(b, page_nr, KM_IRQ1);
 +                      p_addr = __bm_map_pidx(b, page_nr);
                        last_page_nr = page_nr;
                }
                if (val)
                        c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
        }
        if (p_addr)
 -              __bm_unmap(p_addr, KM_IRQ1);
 +              __bm_unmap(p_addr);
        if (c < 0)
                bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
        else if (c > 0)
@@@ -1393,8 -1444,10 +1443,10 @@@ static int bm_change_bits_to(struct drb
        struct drbd_bitmap *b = mdev->bitmap;
        int c = 0;
  
-       ERR_IF(!b) return 1;
-       ERR_IF(!b->bm_pages) return 0;
+       if (!expect(b))
+               return 1;
+       if (!expect(b->bm_pages))
+               return 0;
  
        spin_lock_irqsave(&b->bm_lock, flags);
        if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
@@@ -1425,13 -1478,21 +1477,21 @@@ static inline void bm_set_full_words_wi
  {
        int i;
        int bits;
 -      unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_IRQ1);
+       int changed = 0;
 +      unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
        for (i = first_word; i < last_word; i++) {
                bits = hweight_long(paddr[i]);
                paddr[i] = ~0UL;
-               b->bm_set += BITS_PER_LONG - bits;
+               changed += BITS_PER_LONG - bits;
        }
 -      kunmap_atomic(paddr, KM_IRQ1);
 +      kunmap_atomic(paddr);
+       if (changed) {
+               /* We only need lazy writeout, the information is still in the
+                * remote bitmap as well, and is reconstructed during the next
+                * bitmap exchange, if lost locally due to a crash. */
+               bm_set_page_lazy_writeout(b->bm_pages[page_nr]);
+               b->bm_set += changed;
+       }
  }
  
  /* Same thing as drbd_bm_set_bits,
@@@ -1526,8 -1587,10 +1586,10 @@@ int drbd_bm_test_bit(struct drbd_conf *
        unsigned long *p_addr;
        int i;
  
-       ERR_IF(!b) return 0;
-       ERR_IF(!b->bm_pages) return 0;
+       if (!expect(b))
+               return 0;
+       if (!expect(b->bm_pages))
+               return 0;
  
        spin_lock_irqsave(&b->bm_lock, flags);
        if (BM_DONT_TEST & b->bm_flags)
@@@ -1561,8 -1624,10 +1623,10 @@@ int drbd_bm_count_bits(struct drbd_con
         * robust in case we screwed up elsewhere, in that case pretend there
         * was one dirty bit in the requested area, so we won't try to do a
         * local read there (no bitmap probably implies no disk) */
-       ERR_IF(!b) return 1;
-       ERR_IF(!b->bm_pages) return 1;
+       if (!expect(b))
+               return 1;
+       if (!expect(b->bm_pages))
+               return 1;
  
        spin_lock_irqsave(&b->bm_lock, flags);
        if (BM_DONT_TEST & b->bm_flags)
                                bm_unmap(p_addr);
                        p_addr = bm_map_pidx(b, idx);
                }
-               ERR_IF (bitnr >= b->bm_bits) {
-                       dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
-               } else {
+               if (expect(bitnr < b->bm_bits))
                        c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
-               }
+               else
+                       dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
        }
        if (p_addr)
                bm_unmap(p_addr);
@@@ -1609,8 -1673,10 +1672,10 @@@ int drbd_bm_e_weight(struct drbd_conf *
        unsigned long flags;
        unsigned long *p_addr, *bm;
  
-       ERR_IF(!b) return 0;
-       ERR_IF(!b->bm_pages) return 0;
+       if (!expect(b))
+               return 0;
+       if (!expect(b->bm_pages))
+               return 0;
  
        spin_lock_irqsave(&b->bm_lock, flags);
        if (BM_DONT_TEST & b->bm_flags)
        spin_unlock_irqrestore(&b->bm_lock, flags);
        return count;
  }
- /* Set all bits covered by the AL-extent al_enr.
-  * Returns number of bits changed. */
- unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
- {
-       struct drbd_bitmap *b = mdev->bitmap;
-       unsigned long *p_addr, *bm;
-       unsigned long weight;
-       unsigned long s, e;
-       int count, i, do_now;
-       ERR_IF(!b) return 0;
-       ERR_IF(!b->bm_pages) return 0;
-       spin_lock_irq(&b->bm_lock);
-       if (BM_DONT_SET & b->bm_flags)
-               bm_print_lock_info(mdev);
-       weight = b->bm_set;
-       s = al_enr * BM_WORDS_PER_AL_EXT;
-       e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
-       /* assert that s and e are on the same page */
-       D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
-             ==  s    >> (PAGE_SHIFT - LN2_BPL + 3));
-       count = 0;
-       if (s < b->bm_words) {
-               i = do_now = e-s;
-               p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
-               bm = p_addr + MLPP(s);
-               while (i--) {
-                       count += hweight_long(*bm);
-                       *bm = -1UL;
-                       bm++;
-               }
-               bm_unmap(p_addr);
-               b->bm_set += do_now*BITS_PER_LONG - count;
-               if (e == b->bm_words)
-                       b->bm_set -= bm_clear_surplus(b);
-       } else {
-               dev_err(DEV, "start offset (%lu) too large in drbd_bm_ALe_set_all\n", s);
-       }
-       weight = b->bm_set - weight;
-       spin_unlock_irq(&b->bm_lock);
-       return weight;
- }
index 277c69c9465b946b9a987e195f4823223710c558,784f4eb2ed61c2a13602045ff6e8957002bcf93c..ef72a72814c76087e8ec978f8cff82cb5e2afbd2
@@@ -28,6 -28,7 +28,6 @@@
  
  #include <linux/compiler.h>
  #include <linux/types.h>
 -#include <linux/version.h>
  #include <linux/list.h>
  #include <linux/sched.h>
  #include <linux/bitops.h>
  #include <linux/major.h>
  #include <linux/blkdev.h>
  #include <linux/genhd.h>
+ #include <linux/idr.h>
  #include <net/tcp.h>
  #include <linux/lru_cache.h>
  #include <linux/prefetch.h>
+ #include <linux/drbd_genl_api.h>
+ #include <linux/drbd.h>
+ #include "drbd_state.h"
  
  #ifdef __CHECKER__
  # define __protected_by(x)       __attribute__((require_context(x,1,999,"rdwr")))
@@@ -59,9 -64,8 +63,8 @@@
  
  /* module parameter, defined in drbd_main.c */
  extern unsigned int minor_count;
 -extern int disable_sendpage;
 -extern int allow_oos;
 +extern bool disable_sendpage;
 +extern bool allow_oos;
- extern unsigned int cn_idx;
  
  #ifdef CONFIG_DRBD_FAULT_INJECTION
  extern int enable_faults;
@@@ -86,34 -90,44 +89,44 @@@ extern char usermode_helper[]
   */
  #define DRBD_SIGKILL SIGHUP
  
- /* All EEs on the free list should have ID_VACANT (== 0)
-  * freshly allocated EEs get !ID_VACANT (== 1)
-  * so if it says "cannot dereference null pointer at address 0x00000001",
-  * it is most likely one of these :( */
  #define ID_IN_SYNC      (4711ULL)
  #define ID_OUT_OF_SYNC  (4712ULL)
  #define ID_SYNCER (-1ULL)
- #define ID_VACANT 0
- #define is_syncer_block_id(id) ((id) == ID_SYNCER)
  #define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL)
  
  struct drbd_conf;
+ struct drbd_tconn;
  
  
  /* to shorten dev_warn(DEV, "msg"); and relatives statements */
  #define DEV (disk_to_dev(mdev->vdisk))
  
+ #define conn_printk(LEVEL, TCONN, FMT, ARGS...) \
+       printk(LEVEL "d-con %s: " FMT, TCONN->name , ## ARGS)
+ #define conn_alert(TCONN, FMT, ARGS...)  conn_printk(KERN_ALERT, TCONN, FMT, ## ARGS)
+ #define conn_crit(TCONN, FMT, ARGS...)   conn_printk(KERN_CRIT, TCONN, FMT, ## ARGS)
+ #define conn_err(TCONN, FMT, ARGS...)    conn_printk(KERN_ERR, TCONN, FMT, ## ARGS)
+ #define conn_warn(TCONN, FMT, ARGS...)   conn_printk(KERN_WARNING, TCONN, FMT, ## ARGS)
+ #define conn_notice(TCONN, FMT, ARGS...) conn_printk(KERN_NOTICE, TCONN, FMT, ## ARGS)
+ #define conn_info(TCONN, FMT, ARGS...)   conn_printk(KERN_INFO, TCONN, FMT, ## ARGS)
+ #define conn_dbg(TCONN, FMT, ARGS...)    conn_printk(KERN_DEBUG, TCONN, FMT, ## ARGS)
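
/* Connection-level counterparts to dev_err(DEV, ...) and friends; a
 * hypothetical use with the cmdname() helper declared further down:
 *
 *	conn_err(tconn, "unexpected packet %s\n", cmdname(cmd));
 *
 * which expands to printk(KERN_ERR "d-con <resource name>: unexpected ...").
 */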
  #define D_ASSERT(exp) if (!(exp)) \
         dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__)
  
- #define ERR_IF(exp) if (({                                            \
-       int _b = (exp) != 0;                                            \
-       if (_b) dev_err(DEV, "ASSERT FAILED: %s: (%s) in %s:%d\n",      \
-                       __func__, #exp, __FILE__, __LINE__);            \
-       _b;                                                             \
-       }))
+ /**
+  * expect  -  Make an assertion
+  *
+  * Unlike the assert macro, this macro returns a boolean result.
+  */
+ #define expect(exp) ({                                                                \
+               bool _bool = (exp);                                             \
+               if (!_bool)                                                     \
+                       dev_err(DEV, "ASSERTION %s FAILED in %s\n",             \
+                               #exp, __func__);                                \
+               _bool;                                                          \
+               })
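
/* Replacement pattern for the old ERR_IF() sites, as used throughout
 * drbd_bitmap.c in this series: expect() evaluates to the tested value,
 * so the caller spells out the early return explicitly:
 *
 *	if (!expect(mdev->bitmap))
 *		return 0;
 */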
  
  /* Defines to control fault insertion */
  enum {
@@@ -150,15 -164,12 +163,12 @@@ drbd_insert_fault(struct drbd_conf *mde
  /* usual integer division */
  #define div_floor(A, B) ((A)/(B))
  
- /* drbd_meta-data.c (still in drbd_main.c) */
- /* 4th incarnation of the disk layout. */
- #define DRBD_MD_MAGIC (DRBD_MAGIC+4)
- extern struct drbd_conf **minor_table;
  extern struct ratelimit_state drbd_ratelimit_state;
+ extern struct idr minors; /* RCU, updates: genl_lock() */
+ extern struct list_head drbd_tconns; /* RCU, updates: genl_lock() */
  
  /* on the wire */
- enum drbd_packets {
+ enum drbd_packet {
        /* receiver (data socket) */
        P_DATA                = 0x00,
        P_DATA_REPLY          = 0x01, /* Response to P_DATA_REQUEST */
        P_RECV_ACK            = 0x15, /* Used in protocol B */
        P_WRITE_ACK           = 0x16, /* Used in protocol C */
        P_RS_WRITE_ACK        = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
-       P_DISCARD_ACK         = 0x18, /* Used in proto C, two-primaries conflict detection */
+       P_SUPERSEDED          = 0x18, /* Used in proto C, two-primaries conflict detection */
        P_NEG_ACK             = 0x19, /* Sent if local disk is unusable */
        P_NEG_DREPLY          = 0x1a, /* Local disk is broken... */
        P_NEG_RS_DREPLY       = 0x1b, /* Local disk is broken... */
        P_DELAY_PROBE         = 0x27, /* is used on BOTH sockets */
        P_OUT_OF_SYNC         = 0x28, /* Mark as out of sync (Outrunning), data socket */
        P_RS_CANCEL           = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */
+       P_CONN_ST_CHG_REQ     = 0x2a, /* data sock: Connection wide state request */
+       P_CONN_ST_CHG_REPLY   = 0x2b, /* meta sock: Connection side state req reply */
+       P_RETRY_WRITE         = 0x2c, /* Protocol C: retry conflicting write request */
+       P_PROTOCOL_UPDATE     = 0x2d, /* data sock: is used in established connections */
  
-       P_MAX_CMD             = 0x2A,
        P_MAY_IGNORE          = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
        P_MAX_OPT_CMD         = 0x101,
  
        /* special command ids for handshake */
  
-       P_HAND_SHAKE_M        = 0xfff1, /* First Packet on the MetaSock */
-       P_HAND_SHAKE_S        = 0xfff2, /* First Packet on the Socket */
+       P_INITIAL_META        = 0xfff1, /* First Packet on the MetaSock */
+       P_INITIAL_DATA        = 0xfff2, /* First Packet on the Socket */
  
-       P_HAND_SHAKE          = 0xfffe  /* FIXED for the next century! */
+       P_CONNECTION_FEATURES = 0xfffe  /* FIXED for the next century! */
  };
  
- static inline const char *cmdname(enum drbd_packets cmd)
- {
-       /* THINK may need to become several global tables
-        * when we want to support more than
-        * one PRO_VERSION */
-       static const char *cmdnames[] = {
-               [P_DATA]                = "Data",
-               [P_DATA_REPLY]          = "DataReply",
-               [P_RS_DATA_REPLY]       = "RSDataReply",
-               [P_BARRIER]             = "Barrier",
-               [P_BITMAP]              = "ReportBitMap",
-               [P_BECOME_SYNC_TARGET]  = "BecomeSyncTarget",
-               [P_BECOME_SYNC_SOURCE]  = "BecomeSyncSource",
-               [P_UNPLUG_REMOTE]       = "UnplugRemote",
-               [P_DATA_REQUEST]        = "DataRequest",
-               [P_RS_DATA_REQUEST]     = "RSDataRequest",
-               [P_SYNC_PARAM]          = "SyncParam",
-               [P_SYNC_PARAM89]        = "SyncParam89",
-               [P_PROTOCOL]            = "ReportProtocol",
-               [P_UUIDS]               = "ReportUUIDs",
-               [P_SIZES]               = "ReportSizes",
-               [P_STATE]               = "ReportState",
-               [P_SYNC_UUID]           = "ReportSyncUUID",
-               [P_AUTH_CHALLENGE]      = "AuthChallenge",
-               [P_AUTH_RESPONSE]       = "AuthResponse",
-               [P_PING]                = "Ping",
-               [P_PING_ACK]            = "PingAck",
-               [P_RECV_ACK]            = "RecvAck",
-               [P_WRITE_ACK]           = "WriteAck",
-               [P_RS_WRITE_ACK]        = "RSWriteAck",
-               [P_DISCARD_ACK]         = "DiscardAck",
-               [P_NEG_ACK]             = "NegAck",
-               [P_NEG_DREPLY]          = "NegDReply",
-               [P_NEG_RS_DREPLY]       = "NegRSDReply",
-               [P_BARRIER_ACK]         = "BarrierAck",
-               [P_STATE_CHG_REQ]       = "StateChgRequest",
-               [P_STATE_CHG_REPLY]     = "StateChgReply",
-               [P_OV_REQUEST]          = "OVRequest",
-               [P_OV_REPLY]            = "OVReply",
-               [P_OV_RESULT]           = "OVResult",
-               [P_CSUM_RS_REQUEST]     = "CsumRSRequest",
-               [P_RS_IS_IN_SYNC]       = "CsumRSIsInSync",
-               [P_COMPRESSED_BITMAP]   = "CBitmap",
-               [P_DELAY_PROBE]         = "DelayProbe",
-               [P_OUT_OF_SYNC]         = "OutOfSync",
-               [P_MAX_CMD]             = NULL,
-       };
-       if (cmd == P_HAND_SHAKE_M)
-               return "HandShakeM";
-       if (cmd == P_HAND_SHAKE_S)
-               return "HandShakeS";
-       if (cmd == P_HAND_SHAKE)
-               return "HandShake";
-       if (cmd >= P_MAX_CMD)
-               return "Unknown";
-       return cmdnames[cmd];
- }
+ extern const char *cmdname(enum drbd_packet cmd);
  
  /* for sending/receiving the bitmap,
   * possibly in some encoding scheme */
@@@ -337,37 -294,24 +293,24 @@@ struct p_header80 
        u32       magic;
        u16       command;
        u16       length;       /* bytes of data after this header */
-       u8        payload[0];
  } __packed;
  
  /* Header for big packets, Used for data packets exceeding 64kB */
  struct p_header95 {
        u16       magic;        /* use DRBD_MAGIC_BIG here */
        u16       command;
-       u32       length;       /* Use only 24 bits of that. Ignore the highest 8 bit. */
-       u8        payload[0];
+       u32       length;
  } __packed;
  
- union p_header {
-       struct p_header80 h80;
-       struct p_header95 h95;
- };
- /*
-  * short commands, packets without payload, plain p_header:
-  *   P_PING
-  *   P_PING_ACK
-  *   P_BECOME_SYNC_TARGET
-  *   P_BECOME_SYNC_SOURCE
-  *   P_UNPLUG_REMOTE
-  */
+ struct p_header100 {
+       u32       magic;
+       u16       volume;
+       u16       command;
+       u32       length;
+       u32       pad;
+ } __packed;
  
- /*
-  * commands with out-of-struct payload:
-  *   P_BITMAP    (no additional fields)
-  *   P_DATA, P_DATA_REPLY (see p_data)
-  *   P_COMPRESSED_BITMAP (see receive_compressed_bitmap)
-  */
+ extern unsigned int drbd_header_size(struct drbd_tconn *tconn);
  
  /* these defines must not be changed without changing the protocol version */
  #define DP_HARDBARRIER              1 /* deprecated */
  #define DP_FUA               16 /* equals REQ_FUA     */
  #define DP_FLUSH             32 /* equals REQ_FLUSH   */
  #define DP_DISCARD           64 /* equals REQ_DISCARD */
+ #define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */
+ #define DP_SEND_WRITE_ACK   256 /* This is a proto C write request */
  
  struct p_data {
-       union p_header head;
        u64         sector;    /* 64 bits sector number */
        u64         block_id;  /* to identify the request in protocol B&C */
        u32         seq_num;
   * commands which share a struct:
   *  p_block_ack:
   *   P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
-  *   P_DISCARD_ACK (proto C, two-primaries conflict detection)
+  *   P_SUPERSEDED (proto C, two-primaries conflict detection)
   *  p_block_req:
   *   P_DATA_REQUEST, P_RS_DATA_REQUEST
   */
  struct p_block_ack {
-       struct p_header80 head;
        u64         sector;
        u64         block_id;
        u32         blksize;
        u32         seq_num;
  } __packed;
  
  struct p_block_req {
-       struct p_header80 head;
        u64 sector;
        u64 block_id;
        u32 blksize;
  
  /*
   * commands with their own struct for additional fields:
-  *   P_HAND_SHAKE
+  *   P_CONNECTION_FEATURES
   *   P_BARRIER
   *   P_BARRIER_ACK
   *   P_SYNC_PARAM
   *   ReportParams
   */
  
- struct p_handshake {
-       struct p_header80 head; /* 8 bytes */
+ struct p_connection_features {
        u32 protocol_min;
        u32 feature_flags;
        u32 protocol_max;
  
        /* should be more than enough for future enhancements
-        * for now, feature_flags and the reserverd array shall be zero.
+        * for now, feature_flags and the reserved array shall be zero.
         */
  
        u32 _pad;
-       u64 reserverd[7];
+       u64 reserved[7];
  } __packed;
- /* 80 bytes, FIXED for the next century */
  
  struct p_barrier {
-       struct p_header80 head;
        u32 barrier;    /* barrier number _handle_ only */
        u32 pad;        /* to multiple of 8 Byte */
  } __packed;
  
  struct p_barrier_ack {
-       struct p_header80 head;
        u32 barrier;
        u32 set_size;
  } __packed;
  
  struct p_rs_param {
-       struct p_header80 head;
-       u32 rate;
+       u32 resync_rate;
  
              /* Since protocol version 88 and higher. */
        char verify_alg[0];
  } __packed;
  
  struct p_rs_param_89 {
-       struct p_header80 head;
-       u32 rate;
+       u32 resync_rate;
          /* protocol version 89: */
        char verify_alg[SHARED_SECRET_MAX];
        char csums_alg[SHARED_SECRET_MAX];
  } __packed;
  
  struct p_rs_param_95 {
-       struct p_header80 head;
-       u32 rate;
+       u32 resync_rate;
        char verify_alg[SHARED_SECRET_MAX];
        char csums_alg[SHARED_SECRET_MAX];
        u32 c_plan_ahead;
  } __packed;
  
  enum drbd_conn_flags {
-       CF_WANT_LOSE = 1,
+       CF_DISCARD_MY_DATA = 1,
        CF_DRY_RUN = 2,
  };
  
  struct p_protocol {
-       struct p_header80 head;
        u32 protocol;
        u32 after_sb_0p;
        u32 after_sb_1p;
  } __packed;
  
  struct p_uuids {
-       struct p_header80 head;
        u64 uuid[UI_EXTENDED_SIZE];
  } __packed;
  
  struct p_rs_uuid {
-       struct p_header80 head;
        u64         uuid;
  } __packed;
  
  struct p_sizes {
-       struct p_header80 head;
        u64         d_size;  /* size of disk */
        u64         u_size;  /* user requested size */
        u64         c_size;  /* current exported size */
  } __packed;
  
  struct p_state {
-       struct p_header80 head;
        u32         state;
  } __packed;
  
  struct p_req_state {
-       struct p_header80 head;
        u32         mask;
        u32         val;
  } __packed;
  
  struct p_req_state_reply {
-       struct p_header80 head;
        u32         retcode;
  } __packed;
  
@@@ -539,15 -467,7 +466,7 @@@ struct p_drbd06_param 
        u32       bit_map_gen[5];
  } __packed;
  
- struct p_discard {
-       struct p_header80 head;
-       u64         block_id;
-       u32         seq_num;
-       u32         pad;
- } __packed;
  struct p_block_desc {
-       struct p_header80 head;
        u64 sector;
        u32 blksize;
        u32 pad;        /* to multiple of 8 Byte */
@@@ -563,7 -483,6 +482,6 @@@ enum drbd_bitmap_code 
  };
  
  struct p_compressed_bm {
-       struct p_header80 head;
        /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
         * (encoding & 0x80): polarity (set/unset) of first runlength
         * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
  } __packed;
  
  struct p_delay_probe93 {
-       struct p_header80 head;
        u32     seq_num; /* sequence number to match the two probe packets */
        u32     offset;  /* usecs the probe got sent after the reference time point */
  } __packed;
  
- /* DCBP: Drbd Compressed Bitmap Packet ... */
- static inline enum drbd_bitmap_code
- DCBP_get_code(struct p_compressed_bm *p)
- {
-       return (enum drbd_bitmap_code)(p->encoding & 0x0f);
- }
- static inline void
- DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
- {
-       BUG_ON(code & ~0xf);
-       p->encoding = (p->encoding & ~0xf) | code;
- }
- static inline int
- DCBP_get_start(struct p_compressed_bm *p)
- {
-       return (p->encoding & 0x80) != 0;
- }
- static inline void
- DCBP_set_start(struct p_compressed_bm *p, int set)
- {
-       p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
- }
- static inline int
- DCBP_get_pad_bits(struct p_compressed_bm *p)
- {
-       return (p->encoding >> 4) & 0x7;
- }
- static inline void
- DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
- {
-       BUG_ON(n & ~0x7);
-       p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
- }
- /* one bitmap packet, including the p_header,
-  * should fit within one _architecture independend_ page.
-  * so we need to use the fixed size 4KiB page size
-  * most architectures have used for a long time.
+ /*
+  * Bitmap packets need to fit within a single page on the sender and receiver,
+  * so we are limited to 4 KiB (and not to PAGE_SIZE, which can be bigger).
   */
- #define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80))
- #define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
- #define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
- #if (PAGE_SIZE < 4096)
- /* drbd_send_bitmap / receive_bitmap would break horribly */
- #error "PAGE_SIZE too small"
- #endif
- union p_polymorph {
-         union p_header           header;
-         struct p_handshake       handshake;
-         struct p_data            data;
-         struct p_block_ack       block_ack;
-         struct p_barrier         barrier;
-         struct p_barrier_ack     barrier_ack;
-         struct p_rs_param_89     rs_param_89;
-         struct p_rs_param_95     rs_param_95;
-         struct p_protocol        protocol;
-         struct p_sizes           sizes;
-         struct p_uuids           uuids;
-         struct p_state           state;
-         struct p_req_state       req_state;
-         struct p_req_state_reply req_state_reply;
-         struct p_block_req       block_req;
-       struct p_delay_probe93   delay_probe93;
-       struct p_rs_uuid         rs_uuid;
-       struct p_block_desc      block_desc;
- } __packed;
+ #define DRBD_SOCKET_BUFFER_SIZE 4096
  
  /**********************************************************************/
  enum drbd_thread_state {
-       None,
-       Running,
-       Exiting,
-       Restarting
+       NONE,
+       RUNNING,
+       EXITING,
+       RESTARTING
  };
  
  struct drbd_thread {
        struct completion stop;
        enum drbd_thread_state t_state;
        int (*function) (struct drbd_thread *);
-       struct drbd_conf *mdev;
+       struct drbd_tconn *tconn;
        int reset_cpu_mask;
+       char name[9];
  };
  
  static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
        return thi->t_state;
  }
  
- struct drbd_work;
- typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel);
  struct drbd_work {
        struct list_head list;
-       drbd_work_cb cb;
+       int (*cb)(struct drbd_work *, int cancel);
+       union {
+               struct drbd_conf *mdev;
+               struct drbd_tconn *tconn;
+       };
  };
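
/* The callback no longer receives the mdev as a parameter; the anonymous
 * union lets each work item carry its owner.  A hypothetical callback
 * under the new signature (real ones live in drbd_worker.c):
 *
 *	static int w_example(struct drbd_work *w, int cancel)
 *	{
 *		struct drbd_conf *mdev = w->mdev;
 *		if (cancel)
 *			return 0;
 *		dev_info(DEV, "example work item ran\n");
 *		return 0;
 *	}
 */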
  
- struct drbd_tl_epoch;
+ #include "drbd_interval.h"
+ extern int drbd_wait_misc(struct drbd_conf *, struct drbd_interval *);
  struct drbd_request {
        struct drbd_work w;
-       struct drbd_conf *mdev;
  
        /* if local IO is not allowed, will be NULL.
         * if local IO _is_ allowed, holds the locally submitted bio clone,
         * or, after local IO completion, the ERR_PTR(error).
-        * see drbd_endio_pri(). */
+        * see drbd_request_endio(). */
        struct bio *private_bio;
  
-       struct hlist_node collision;
-       sector_t sector;
-       unsigned int size;
-       unsigned int epoch; /* barrier_nr */
+       struct drbd_interval i;
  
-       /* barrier_nr: used to check on "completion" whether this req was in
+       /* epoch: used to check on "completion" whether this req was in
         * the current epoch, and we therefore have to close it,
-        * starting a new epoch...
+        * causing a p_barrier packet to be sent, starting a new epoch.
+        *
+        * This corresponds to "barrier" in struct p_barrier[_ack],
+        * and to "barrier_nr" in struct drbd_epoch (and various
+        * comments/function parameters/local variable names).
         */
+       unsigned int epoch;
  
        struct list_head tl_requests; /* ring list in the transfer log */
        struct bio *master_bio;       /* master bio pointer */
-       unsigned long rq_state; /* see comments above _req_mod() */
        unsigned long start_time;
- };
  
- struct drbd_tl_epoch {
-       struct drbd_work w;
-       struct list_head requests; /* requests before */
-       struct drbd_tl_epoch *next; /* pointer to the next barrier */
-       unsigned int br_number;  /* the barriers identifier. */
-       int n_writes;   /* number of requests attached before this barrier */
- };
+       /* once it hits 0, we may complete the master_bio */
+       atomic_t completion_ref;
+       /* once it hits 0, we may destroy this drbd_request object */
+       struct kref kref;
  
- struct drbd_request;
- /* These Tl_epoch_entries may be in one of 6 lists:
-    active_ee .. data packet being written
-    sync_ee   .. syncer block being written
-    done_ee   .. block written, need to send P_WRITE_ACK
-    read_ee   .. [RS]P_DATA_REQUEST being read
- */
+       unsigned rq_state; /* see comments above _req_mod() */
+ };
  
  struct drbd_epoch {
+       struct drbd_tconn *tconn;
        struct list_head list;
        unsigned int barrier_nr;
        atomic_t epoch_size; /* increased on every request added. */
@@@ -762,17 -610,14 +609,14 @@@ struct digest_info 
        void *digest;
  };
  
- struct drbd_epoch_entry {
+ struct drbd_peer_request {
        struct drbd_work w;
-       struct hlist_node collision;
        struct drbd_epoch *epoch; /* for writes */
-       struct drbd_conf *mdev;
        struct page *pages;
        atomic_t pending_bios;
-       unsigned int size;
+       struct drbd_interval i;
        /* see comments on ee flag bits below */
        unsigned long flags;
-       sector_t sector;
        union {
                u64 block_id;
                struct digest_info *digest;
@@@ -793,31 -638,37 +637,37 @@@ enum 
         * we need to resubmit without the barrier flag. */
        __EE_RESUBMITTED,
  
-       /* we may have several bios per epoch entry.
+       /* we may have several bios per peer request.
         * if any of those fail, we set this flag atomically
         * from the endio callback */
        __EE_WAS_ERROR,
  
        /* This ee has a pointer to a digest instead of a block id */
        __EE_HAS_DIGEST,
+       /* Conflicting local requests need to be restarted after this request */
+       __EE_RESTART_REQUESTS,
+       /* The peer wants a write ACK for this (wire proto C) */
+       __EE_SEND_WRITE_ACK,
+       /* Is set when net_conf had two_primaries set while creating this peer_req */
+       __EE_IN_INTERVAL_TREE,
  };
  #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
  #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
  #define       EE_RESUBMITTED         (1<<__EE_RESUBMITTED)
  #define EE_WAS_ERROR           (1<<__EE_WAS_ERROR)
  #define EE_HAS_DIGEST          (1<<__EE_HAS_DIGEST)
+ #define EE_RESTART_REQUESTS   (1<<__EE_RESTART_REQUESTS)
+ #define EE_SEND_WRITE_ACK     (1<<__EE_SEND_WRITE_ACK)
+ #define EE_IN_INTERVAL_TREE   (1<<__EE_IN_INTERVAL_TREE)
  
- /* global flag bits */
- enum drbd_flag {
-       CREATE_BARRIER,         /* next P_DATA is preceded by a P_BARRIER */
-       SIGNAL_ASENDER,         /* whether asender wants to be interrupted */
-       SEND_PING,              /* whether asender should send a ping asap */
+ /* flag bits per mdev */
+ enum {
        UNPLUG_REMOTE,          /* sending a "UnplugRemote" could help */
        MD_DIRTY,               /* current uuids and flags not yet on disk */
-       DISCARD_CONCURRENT,     /* Set on one node, cleared on the peer! */
        USE_DEGR_WFC_T,         /* degr-wfc-timeout instead of wfc-timeout. */
-       CLUSTER_ST_CHANGE,      /* Cluster wide state change going on... */
        CL_ST_CHG_SUCCESS,
        CL_ST_CHG_FAIL,
        CRASHED_PRIMARY,        /* This node was a crashed primary.
        WAS_READ_ERROR,         /* Local disk READ failed (set additionally to the above) */
        FORCE_DETACH,           /* Force-detach from local disk, aborting any pending local IO */
        RESYNC_AFTER_NEG,       /* Resync after online grow after the attach&negotiate finished. */
-       NET_CONGESTED,          /* The data socket is congested */
-       CONFIG_PENDING,         /* serialization of (re)configuration requests.
-                                * if set, also prevents the device from dying */
-       DEVICE_DYING,           /* device became unconfigured,
-                                * but worker thread is still handling the cleanup.
-                                * reconfiguring (nl_disk_conf, nl_net_conf) is dissalowed,
-                                * while this is set. */
        RESIZE_PENDING,         /* Size change detected locally, waiting for the response from
                                 * the peer, if it changed there as well. */
-       CONN_DRY_RUN,           /* Expect disconnect after resync handshake. */
-       GOT_PING_ACK,           /* set when we receive a ping_ack packet, misc wait gets woken */
        NEW_CUR_UUID,           /* Create new current UUID when thawing IO */
        AL_SUSPENDED,           /* Activity logging is currently suspended. */
        AHEAD_TO_SYNC_SOURCE,   /* Ahead -> SyncSource queued */
-       STATE_SENT,             /* Do not change state/UUIDs while this is set */
-       CALLBACK_PENDING,       /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
-                                * pending, from drbd worker context.
-                                * If set, bdi_write_congested() returns true,
-                                * so shrink_page_list() would not recurse into,
-                                * and potentially deadlock on, this drbd worker.
-                                */
-       DISCONNECT_SENT,        /* Currently the last bit in this 32bit word */
-       /* keep last */
-       DRBD_N_FLAGS,
+       B_RS_H_DONE,            /* Before resync handler done (already executed) */
+       DISCARD_MY_DATA,        /* discard_my_data flag per volume */
+       READ_BALANCE_RR,
  };
  
  struct drbd_bitmap; /* opaque for drbd_conf */
@@@ -899,18 -731,17 +730,17 @@@ enum bm_flag 
  
  struct drbd_work_queue {
        struct list_head q;
-       struct semaphore s; /* producers up it, worker down()s it */
        spinlock_t q_lock;  /* to protect the list. */
+       wait_queue_head_t q_wait;
  };
  
  struct drbd_socket {
-       struct drbd_work_queue work;
        struct mutex mutex;
        struct socket    *socket;
        /* this way we get our
         * send/receive buffers off the stack */
-       union p_polymorph sbuf;
-       union p_polymorph rbuf;
+       void *sbuf;
+       void *rbuf;
  };
  
  struct drbd_md {
        s32 bm_offset;  /* signed relative sector offset to bitmap */
  
        /* u32 al_nr_extents;      important for restoring the AL
-        * is stored into  sync_conf.al_extents, which in turn
+        * is stored into  ldev->dc.al_extents, which in turn
         * gets applied to act_log->nr_elements
         */
  };
  
- /* for sync_conf and other types... */
- #define NL_PACKET(name, number, fields) struct name { fields };
- #define NL_INTEGER(pn,pr,member) int member;
- #define NL_INT64(pn,pr,member) __u64 member;
- #define NL_BIT(pn,pr,member)   unsigned member:1;
- #define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len;
- #include <linux/drbd_nl.h>
  struct drbd_backing_dev {
        struct block_device *backing_bdev;
        struct block_device *md_bdev;
        struct drbd_md md;
-       struct disk_conf dc; /* The user provided config... */
+       struct disk_conf *disk_conf; /* RCU, for updates: mdev->tconn->conf_update */
        sector_t known_size; /* last known size of that backing device */
  };
  
@@@ -968,17 -791,116 +790,116 @@@ enum write_ordering_e 
  };
  
  struct fifo_buffer {
-       int *values;
        unsigned int head_index;
        unsigned int size;
+       int total; /* sum of all values */
+       int values[0];
+ };
+ extern struct fifo_buffer *fifo_alloc(int fifo_size);
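
/* A plausible allocator for the flexible-array fifo above (a sketch; the
 * in-tree fifo_alloc() lives elsewhere in this series): one allocation
 * covers both the header and the values[] payload. */
struct fifo_buffer *fifo_alloc(int fifo_size)
{
	struct fifo_buffer *fb;

	fb = kzalloc(sizeof(struct fifo_buffer) + fifo_size * sizeof(int),
		     GFP_NOIO);
	if (!fb)
		return NULL;
	fb->size = fifo_size;	/* head_index and total start at 0 via kzalloc() */
	return fb;
}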
+ /* flag bits per tconn */
+ enum {
+       NET_CONGESTED,          /* The data socket is congested */
+       RESOLVE_CONFLICTS,      /* Set on one node, cleared on the peer! */
+       SEND_PING,              /* whether asender should send a ping asap */
+       SIGNAL_ASENDER,         /* whether asender wants to be interrupted */
+       GOT_PING_ACK,           /* set when we receive a ping_ack packet, ping_wait gets woken */
+       CONN_WD_ST_CHG_REQ,     /* A cluster wide state change on the connection is active */
+       CONN_WD_ST_CHG_OKAY,
+       CONN_WD_ST_CHG_FAIL,
+       CONN_DRY_RUN,           /* Expect disconnect after resync handshake. */
+       CREATE_BARRIER,         /* next P_DATA is preceded by a P_BARRIER */
+       STATE_SENT,             /* Do not change state/UUIDs while this is set */
+       CALLBACK_PENDING,       /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
+                                * pending, from drbd worker context.
+                                * If set, bdi_write_congested() returns true,
+                                * so shrink_page_list() would not recurse into,
+                                * and potentially deadlock on, this drbd worker.
+                                */
+       DISCONNECT_SENT,
+ };
+ struct drbd_tconn {                   /* is a resource from the config file */
+       char *name;                     /* Resource name */
+       struct list_head all_tconn;     /* linked on global drbd_tconns */
+       struct kref kref;
+       struct idr volumes;             /* <tconn, vnr> to mdev mapping */
+       enum drbd_conns cstate;         /* Only C_STANDALONE to C_WF_REPORT_PARAMS */
+       unsigned susp:1;                /* IO suspended by user */
+       unsigned susp_nod:1;            /* IO suspended because no data */
+       unsigned susp_fen:1;            /* IO suspended because fence peer handler runs */
+       struct mutex cstate_mutex;      /* Protects graceful disconnects */
+       unsigned long flags;
+       struct net_conf *net_conf;      /* content protected by rcu */
+       struct mutex conf_update;       /* mutex for ready-copy-update of net_conf and disk_conf */
+       wait_queue_head_t ping_wait;    /* Woken upon reception of a ping, and a state change */
+       struct res_opts res_opts;
+       struct sockaddr_storage my_addr;
+       int my_addr_len;
+       struct sockaddr_storage peer_addr;
+       int peer_addr_len;
+       struct drbd_socket data;        /* data/barrier/cstate/parameter packets */
+       struct drbd_socket meta;        /* ping/ack (metadata) packets */
+       int agreed_pro_version;         /* actually used protocol version */
+       unsigned long last_received;    /* in jiffies, either socket */
+       unsigned int ko_count;
+       spinlock_t req_lock;
+       struct list_head transfer_log;  /* all requests not yet fully processed */
+       struct crypto_hash *cram_hmac_tfm;
+       struct crypto_hash *integrity_tfm;  /* checksums we compute, updates protected by tconn->data->mutex */
+       struct crypto_hash *peer_integrity_tfm;  /* checksums we verify, only accessed from receiver thread  */
+       struct crypto_hash *csums_tfm;
+       struct crypto_hash *verify_tfm;
+       void *int_dig_in;
+       void *int_dig_vv;
+       /* receiver side */
+       struct drbd_epoch *current_epoch;
+       spinlock_t epoch_lock;
+       unsigned int epochs;
+       enum write_ordering_e write_ordering;
+       atomic_t current_tle_nr;        /* transfer log epoch number */
+       unsigned current_tle_writes;    /* writes seen within this tl epoch */
+       unsigned long last_reconnect_jif;
+       struct drbd_thread receiver;
+       struct drbd_thread worker;
+       struct drbd_thread asender;
+       cpumask_var_t cpu_mask;
+       /* sender side */
+       struct drbd_work_queue sender_work;
+       struct {
+               /* whether this sender thread
+                * has processed a single write yet. */
+               bool seen_any_write_yet;
+               /* Which barrier number to send with the next P_BARRIER */
+               int current_epoch_nr;
+               /* how many write requests have been sent
+                * with req->epoch == current_epoch_nr.
+                * If none, no P_BARRIER will be sent. */
+               unsigned current_epoch_writes;
+       } send;
  };
  
  struct drbd_conf {
-       unsigned long drbd_flags[(DRBD_N_FLAGS + BITS_PER_LONG -1)/BITS_PER_LONG];
+       struct drbd_tconn *tconn;
+       int vnr;                        /* volume number within the connection */
+       struct kref kref;
+       /* things that are stored as / read from meta data on disk */
+       unsigned long flags;
  
        /* configured by drbdsetup */
-       struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */
-       struct syncer_conf sync_conf;
        struct drbd_backing_dev *ldev __protected_by(local);
  
        sector_t p_size;     /* partner's disk size */
        struct block_device *this_bdev;
        struct gendisk      *vdisk;
  
-       struct drbd_socket data; /* data/barrier/cstate/parameter packets */
-       struct drbd_socket meta; /* ping/ack (metadata) packets */
-       int agreed_pro_version;  /* actually used protocol version */
-       unsigned long last_received; /* in jiffies, either socket */
-       unsigned int ko_count;
+       unsigned long last_reattach_jif;
        struct drbd_work  resync_work,
                          unplug_work,
                          go_diskless,
        /* Used after attach while negotiating new disk state. */
        union drbd_state new_state_tmp;
  
-       union drbd_state state;
+       union drbd_dev_state state;
        wait_queue_head_t misc_wait;
        wait_queue_head_t state_wait;  /* upon each state change. */
-       wait_queue_head_t net_cnt_wait;
        unsigned int send_cnt;
        unsigned int recv_cnt;
        unsigned int read_cnt;
        atomic_t ap_bio_cnt;     /* Requests we need to complete */
        atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
        atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
-       atomic_t unacked_cnt;    /* Need to send replys for */
+       atomic_t unacked_cnt;    /* Need to send replies for */
        atomic_t local_cnt;      /* Waiting for local completion */
-       atomic_t net_cnt;        /* Users of net_conf */
-       spinlock_t req_lock;
-       struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */
-       struct drbd_tl_epoch *newest_tle;
-       struct drbd_tl_epoch *oldest_tle;
-       struct list_head out_of_sequence_requests;
-       struct list_head barrier_acked_requests;
-       struct hlist_head *tl_hash;
-       unsigned int tl_hash_s;
+       /* Interval tree of pending local requests */
+       struct rb_root read_requests;
+       struct rb_root write_requests;
  
        /* blocks to resync in this run [unit BM_BLOCK_SIZE] */
        unsigned long rs_total;
        unsigned long rs_mark_time[DRBD_SYNC_MARKS];
        /* current index into rs_mark_{left,time} */
        int rs_last_mark;
+       unsigned long rs_last_bcast; /* [unit jiffies] */
  
        /* where does the admin want us to start? (sector) */
        sector_t ov_start_sector;
        /* size of out-of-sync range in sectors. */
        sector_t ov_last_oos_size;
        unsigned long ov_left; /* in bits */
-       struct crypto_hash *csums_tfm;
-       struct crypto_hash *verify_tfm;
  
-       unsigned long last_reattach_jif;
-       unsigned long last_reconnect_jif;
-       struct drbd_thread receiver;
-       struct drbd_thread worker;
-       struct drbd_thread asender;
        struct drbd_bitmap *bitmap;
        unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */
  
  
        int open_cnt;
        u64 *p_uuid;
-       struct drbd_epoch *current_epoch;
-       spinlock_t epoch_lock;
-       unsigned int epochs;
-       enum write_ordering_e write_ordering;
        struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
        struct list_head sync_ee;   /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
-       struct list_head done_ee;   /* send ack */
-       struct list_head read_ee;   /* IO in progress (any read) */
+       struct list_head done_ee;   /* need to send P_WRITE_ACK */
+       struct list_head read_ee;   /* [RS]P_DATA_REQUEST being read */
        struct list_head net_ee;    /* zero-copy network send in progress */
-       struct hlist_head *ee_hash; /* is protected by req_lock! */
-       unsigned int ee_hash_s;
-       /* this one is protected by ee_lock, single thread */
-       struct drbd_epoch_entry *last_write_w_barrier;
  
        int next_barrier_nr;
-       struct hlist_head *app_reads_hash; /* is protected by req_lock */
        struct list_head resync_reads;
        atomic_t pp_in_use;             /* allocated from page pool */
        atomic_t pp_in_use_by_net;      /* sendpage()d, still referenced by tcp */
        wait_queue_head_t ee_wait;
        struct page *md_io_page;        /* one page buffer for md_io */
-       struct page *md_io_tmpp;        /* for logical_block_size != 512 */
        struct drbd_md_io md_io;
        atomic_t md_io_in_use;          /* protects the md_io and md_io_page */
        spinlock_t al_lock;
        unsigned int al_tr_number;
        int al_tr_cycle;
        int al_tr_pos;   /* position of the next transaction in the journal */
-       struct crypto_hash *cram_hmac_tfm;
-       struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */
-       struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */
-       void *int_dig_out;
-       void *int_dig_in;
-       void *int_dig_vv;
        wait_queue_head_t seq_wait;
        atomic_t packet_seq;
        unsigned int peer_seq;
        spinlock_t peer_seq_lock;
        unsigned int minor;
        unsigned long comm_bm_set; /* communicated number of set bits. */
-       cpumask_var_t cpu_mask;
        struct bm_io_work bm_io_work;
        u64 ed_uuid; /* UUID of the exposed data */
-       struct mutex state_mutex;
+       struct mutex own_state_mutex;
+       struct mutex *state_mutex; /* either own_state_mutex or mdev->tconn->cstate_mutex */
        char congestion_reason;  /* Why we were congested... */
        atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
        atomic_t rs_sect_ev; /* for submitted resync data rate, both */
        int rs_last_events;  /* counter of read or write "events" (unit sectors)
                              * on the lower level device when we last looked. */
        int c_sync_rate; /* current resync rate after syncer throttle magic */
-       struct fifo_buffer rs_plan_s; /* correction values of resync planner */
+       struct fifo_buffer *rs_plan_s; /* correction values of resync planner (RCU, tconn->conf_update) */
        int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
-       int rs_planed;    /* resync sectors already planned */
        atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
 -      int peer_max_bio_size;
 -      int local_max_bio_size;
 +      unsigned int peer_max_bio_size;
 +      unsigned int local_max_bio_size;
  };
  
- static inline void drbd_set_flag(struct drbd_conf *mdev, enum drbd_flag f)
- {
-       set_bit(f, &mdev->drbd_flags[0]);
- }
- static inline void drbd_clear_flag(struct drbd_conf *mdev, enum drbd_flag f)
- {
-       clear_bit(f, &mdev->drbd_flags[0]);
- }
- static inline int drbd_test_flag(struct drbd_conf *mdev, enum drbd_flag f)
- {
-       return test_bit(f, &mdev->drbd_flags[0]);
- }
- static inline int drbd_test_and_set_flag(struct drbd_conf *mdev, enum drbd_flag f)
- {
-       return test_and_set_bit(f, &mdev->drbd_flags[0]);
- }
- static inline int drbd_test_and_clear_flag(struct drbd_conf *mdev, enum drbd_flag f)
- {
-       return test_and_clear_bit(f, &mdev->drbd_flags[0]);
- }
  static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
  {
-       struct drbd_conf *mdev;
-       mdev = minor < minor_count ? minor_table[minor] : NULL;
-       return mdev;
+       return (struct drbd_conf *)idr_find(&minors, minor);
  }
  
  static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
        return mdev->minor;
  }
  
- /* returns 1 if it was successful,
-  * returns 0 if there was no data socket.
-  * so wherever you are going to use the data.socket, e.g. do
-  * if (!drbd_get_data_sock(mdev))
-  *    return 0;
-  *    CODE();
-  * drbd_put_data_sock(mdev);
-  */
- static inline int drbd_get_data_sock(struct drbd_conf *mdev)
- {
-       mutex_lock(&mdev->data.mutex);
-       /* drbd_disconnect() could have called drbd_free_sock()
-        * while we were waiting in down()... */
-       if (unlikely(mdev->data.socket == NULL)) {
-               mutex_unlock(&mdev->data.mutex);
-               return 0;
-       }
-       return 1;
- }
- static inline void drbd_put_data_sock(struct drbd_conf *mdev)
+ static inline struct drbd_conf *vnr_to_mdev(struct drbd_tconn *tconn, int vnr)
  {
-       mutex_unlock(&mdev->data.mutex);
+       return (struct drbd_conf *)idr_find(&tconn->volumes, vnr);
  }
  
  /*
  
  /* drbd_main.c */
  
- enum chg_state_flags {
-       CS_HARD = 1,
-       CS_VERBOSE = 2,
-       CS_WAIT_COMPLETE = 4,
-       CS_SERIALIZE    = 8,
-       CS_ORDERED      = CS_WAIT_COMPLETE + CS_SERIALIZE,
- };
  enum dds_flags {
        DDSF_FORCED    = 1,
        DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */
  };
  
  extern void drbd_init_set_defaults(struct drbd_conf *mdev);
- extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev,
-                                           enum chg_state_flags f,
-                                           union drbd_state mask,
-                                           union drbd_state val);
- extern void drbd_force_state(struct drbd_conf *, union drbd_state,
-                       union drbd_state);
- extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *,
-                                             union drbd_state,
-                                             union drbd_state,
-                                             enum chg_state_flags);
- extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state,
-                                          enum chg_state_flags,
-                                          struct completion *done);
- extern void print_st_err(struct drbd_conf *, union drbd_state,
-                       union drbd_state, int);
  extern int  drbd_thread_start(struct drbd_thread *thi);
  extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
+ extern char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task);
  #ifdef CONFIG_SMP
- extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev);
- extern void drbd_calc_cpu_mask(struct drbd_conf *mdev);
+ extern void drbd_thread_current_set_cpu(struct drbd_thread *thi);
+ extern void drbd_calc_cpu_mask(struct drbd_tconn *tconn);
  #else
  #define drbd_thread_current_set_cpu(A) ({})
  #define drbd_calc_cpu_mask(A) ({})
  #endif
- extern void drbd_free_resources(struct drbd_conf *mdev);
- extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
+ extern void tl_release(struct drbd_tconn *, unsigned int barrier_nr,
                       unsigned int set_size);
- extern void tl_clear(struct drbd_conf *mdev);
- extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
- extern void drbd_free_sock(struct drbd_conf *mdev);
- extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
-                       void *buf, size_t size, unsigned msg_flags);
- extern int drbd_send_protocol(struct drbd_conf *mdev);
+ extern void tl_clear(struct drbd_tconn *);
+ extern void drbd_free_sock(struct drbd_tconn *tconn);
+ extern int drbd_send(struct drbd_tconn *tconn, struct socket *sock,
+                    void *buf, size_t size, unsigned msg_flags);
+ extern int drbd_send_all(struct drbd_tconn *, struct socket *, void *, size_t,
+                        unsigned);
+ extern int __drbd_send_protocol(struct drbd_tconn *tconn, enum drbd_packet cmd);
+ extern int drbd_send_protocol(struct drbd_tconn *tconn);
  extern int drbd_send_uuids(struct drbd_conf *mdev);
  extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
- extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev);
+ extern void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev);
  extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
  extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s);
  extern int drbd_send_current_state(struct drbd_conf *mdev);
- extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
-                       enum drbd_packets cmd, struct p_header80 *h,
-                       size_t size, unsigned msg_flags);
- #define USE_DATA_SOCKET 1
- #define USE_META_SOCKET 0
- extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
-                       enum drbd_packets cmd, struct p_header80 *h,
-                       size_t size);
- extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
-                       char *data, size_t size);
- extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc);
- extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr,
-                       u32 set_size);
- extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
-                       struct drbd_epoch_entry *e);
- extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
-                       struct p_block_req *rp);
- extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
-                       struct p_data *dp, int data_size);
- extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
+ extern int drbd_send_sync_param(struct drbd_conf *mdev);
+ extern void drbd_send_b_ack(struct drbd_tconn *tconn, u32 barrier_nr,
+                           u32 set_size);
+ extern int drbd_send_ack(struct drbd_conf *, enum drbd_packet,
+                        struct drbd_peer_request *);
+ extern void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd,
+                            struct p_block_req *rp);
+ extern void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd,
+                            struct p_data *dp, int data_size);
+ extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd,
                            sector_t sector, int blksize, u64 block_id);
- extern int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req);
- extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
-                          struct drbd_epoch_entry *e);
+ extern int drbd_send_out_of_sync(struct drbd_conf *, struct drbd_request *);
+ extern int drbd_send_block(struct drbd_conf *, enum drbd_packet,
+                          struct drbd_peer_request *);
  extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req);
  extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
                              sector_t sector, int size, u64 block_id);
- extern int drbd_send_drequest_csum(struct drbd_conf *mdev,
-                                  sector_t sector,int size,
-                                  void *digest, int digest_size,
-                                  enum drbd_packets cmd);
+ extern int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector,
+                                  int size, void *digest, int digest_size,
+                                  enum drbd_packet cmd);
   extern int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size);
  
  extern int drbd_send_bitmap(struct drbd_conf *mdev);
- extern int _drbd_send_bitmap(struct drbd_conf *mdev);
- extern int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode);
+ extern void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode);
+ extern void conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode);
  extern void drbd_free_bc(struct drbd_backing_dev *ldev);
  extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
  void drbd_print_uuids(struct drbd_conf *mdev, const char *text);
  
+ extern void conn_md_sync(struct drbd_tconn *tconn);
  extern void drbd_md_sync(struct drbd_conf *mdev);
  extern int  drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
  extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
@@@ -1334,33 -1144,52 +1143,52 @@@ extern void drbd_queue_bitmap_io(struc
  extern int drbd_bitmap_io(struct drbd_conf *mdev,
                int (*io_fn)(struct drbd_conf *),
                char *why, enum bm_flag flags);
+ extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
+               int (*io_fn)(struct drbd_conf *),
+               char *why, enum bm_flag flags);
  extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
  extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
  extern void drbd_go_diskless(struct drbd_conf *mdev);
  extern void drbd_ldev_destroy(struct drbd_conf *mdev);
  
  /* Meta data layout
      We reserve a 128 MB block (4k aligned)
     * either at the end of the backing device
     * or on a separate meta data device. */
  
- #define MD_RESERVED_SECT (128LU << 11)  /* 128 MB, unit sectors */
  /* The following numbers are sectors */
- #define MD_AL_OFFSET 8            /* 8 Sectors after start of meta area */
- #define MD_AL_MAX_SIZE 64   /* = 32 kb LOG  ~ 3776 extents ~ 14 GB Storage */
- /* Allows up to about 3.8TB */
- #define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE)
- /* Since the smallest IO unit is usually 512 bytes */
- #define MD_SECTOR_SHIFT        9
- #define MD_SECTOR_SIZE         (1<<MD_SECTOR_SHIFT)
- /* activity log */
- #define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */
- #define AL_EXTENT_SHIFT 22             /* One extent represents 4M Storage */
+ /* Allows up to about 3.8TB, so if you want more,
+  * you need to use the "flexible" meta data format. */
+ #define MD_RESERVED_SECT (128LU << 11)  /* 128 MB, unit sectors */
+ #define MD_AL_OFFSET  8    /* 8 Sectors after start of meta area */
+ #define MD_AL_SECTORS 64   /* = 32 kB on disk activity log ring buffer */
+ #define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS)
+ /* we do all meta data IO in 4k blocks */
+ #define MD_BLOCK_SHIFT        12
+ #define MD_BLOCK_SIZE (1<<MD_BLOCK_SHIFT)
+ /* One activity log extent represents 4M of storage */
+ #define AL_EXTENT_SHIFT 22
  #define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
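
Spelled out, the defines above imply the following sector layout inside the reserved meta data area (a sketch; the superblock occupying the first 8 sectors is inferred from MD_AL_OFFSET, not stated by these macros):

/*
 *   sector  0 ..  7   meta data "super block" (inferred)
 *   sector  8 .. 71   activity log: 64 sectors = 32 kB
 *                     (MD_AL_OFFSET / MD_AL_SECTORS)
 *   sector 72 ..      bitmap, starting at MD_BM_OFFSET (= 8 + 64)
 */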
  
+ /* We could make these currently hardcoded constants configurable
+  * variables at create-md time (or even re-configurable at runtime?).
+  * Which will require some more changes to the DRBD "super block"
+  * and attach code.
+  *
+  * updates per transaction:
+  *   This many changes to the active set can be logged with one transaction.
+  *   This number is arbitrary.
+  * context per transaction:
+  *   This many context extent numbers are logged with each transaction.
+  *   This number results from the transaction block size (4k), the layout
+  *   of the transaction header, and the number of updates per transaction.
+  *   See drbd_actlog.c:struct al_transaction_on_disk
+  */
+ #define AL_UPDATES_PER_TRANSACTION     64     // arbitrary
+ #define AL_CONTEXT_PER_TRANSACTION    919     // (4096 - 36 - 6*64)/4
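
The 919 can be cross-checked at compile time. A sketch, assuming (from the formula in the comment above) a 36-byte on-disk transaction header and 6 bytes per update slot, i.e. a 2-byte slot number plus a 4-byte extent number, as laid out in struct al_transaction_on_disk; BUILD_BUG_ON comes from <linux/bug.h>:

/* Assumed sizes, taken from the "(4096 - 36 - 6*64)/4" comment above. */
#define AL_TR_HEADER_BYTES_SKETCH	36
#define AL_TR_BYTES_PER_UPDATE_SKETCH	 6

static inline void al_transaction_layout_check_sketch(void)
{
	/* (4096 - 36 - 6*64) / 4 = 3676 / 4 = 919 */
	BUILD_BUG_ON((4096 - AL_TR_HEADER_BYTES_SKETCH
		      - AL_TR_BYTES_PER_UPDATE_SKETCH * AL_UPDATES_PER_TRANSACTION) / 4
		     != AL_CONTEXT_PER_TRANSACTION);
}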
  #if BITS_PER_LONG == 32
  #define LN2_BPL 5
  #define cpu_to_lel(A) cpu_to_le32(A)
@@@ -1396,11 -1225,14 +1224,14 @@@ struct bm_extent 
  
  #define SLEEP_TIME (HZ/10)
  
- #define BM_BLOCK_SHIFT  12                     /* 4k per bit */
+ /* We do bitmap IO in units of 4k blocks.
+  * We also still have a hardcoded 4k per bit relation. */
+ #define BM_BLOCK_SHIFT        12                       /* 4k per bit */
  #define BM_BLOCK_SIZE  (1<<BM_BLOCK_SHIFT)
- /* (9+3) : 512 bytes @ 8 bits; representing 16M storage
-  * per sector of on disk bitmap */
- #define BM_EXT_SHIFT   (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3)  /* = 24 */
+ /* Mostly arbitrarily, set the represented size of one bitmap extent,
+  * aka resync extent, to 16 MiB (which is also 512 bytes worth of bitmap
+  * at 4k per bit resolution). */
+ #define BM_EXT_SHIFT   24     /* 16 MiB per resync extent */
  #define BM_EXT_SIZE    (1<<BM_EXT_SHIFT)
  
  #if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
  #endif
  #endif
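
In concrete numbers, derived purely from the defines above:

/*
 *   1 bit of bitmap covers BM_BLOCK_SIZE  =  4 KiB of storage
 *   1 byte of bitmap covers 8 * 4 KiB     = 32 KiB
 *   1 resync extent (BM_EXT_SIZE)         = 16 MiB
 *       = 4096 bits = 512 bytes of bitmap
 *   => 1 MiB of bitmap describes 32 GiB of storage
 */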
  
- /* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
-  * With a value of 8 all IO in one 128K block make it to the same slot of the
-  * hash table. */
- #define HT_SHIFT 8
- #define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT))
+ /* BIO_MAX_SIZE is 256 * PAGE_CACHE_SIZE,
+  * so for a typical PAGE_CACHE_SIZE of 4k, that is (1<<20) bytes.
+  * Since we may live in a mixed-platform cluster,
+  * we limit ourselves to a platform-agnostic constant here for now.
+  * A follow-up commit may allow even bigger BIO sizes,
+  * once we have thought that through. */
 -#define DRBD_MAX_BIO_SIZE (1 << 20)
++#define DRBD_MAX_BIO_SIZE (1U << 20)
+ #if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+ #error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+ #endif
 -#define DRBD_MAX_BIO_SIZE_SAFE (1 << 12)       /* Works always = 4k */
 +#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12)       /* Works always = 4k */
  
- #define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* The old header only allows packets up to 32Kib data */
- /* Number of elements in the app_reads_hash */
- #define APP_R_HSIZE 15
 -#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* Header 80 only allows packets up to 32KiB data */
 -#define DRBD_MAX_BIO_SIZE_P95    (1 << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
++#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
++#define DRBD_MAX_BIO_SIZE_P95    (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
  
  extern int  drbd_bm_init(struct drbd_conf *mdev);
  extern int  drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits);
@@@ -1500,11 -1335,11 +1334,11 @@@ extern int  drbd_bm_test_bit(struct drb
  extern int  drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
  extern int  drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
  extern int  drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
+ extern void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr);
  extern int  drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
+ extern int  drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local);
  extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local);
  extern int  drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local);
- extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
-               unsigned long al_enr);
  extern size_t      drbd_bm_words(struct drbd_conf *mdev);
  extern unsigned long drbd_bm_bits(struct drbd_conf *mdev);
  extern sector_t      drbd_bm_capacity(struct drbd_conf *mdev);
@@@ -1529,7 -1364,7 +1363,7 @@@ extern void drbd_bm_unlock(struct drbd_
  /* drbd_main.c */
  
  extern struct kmem_cache *drbd_request_cache;
- extern struct kmem_cache *drbd_ee_cache;      /* epoch entries */
+ extern struct kmem_cache *drbd_ee_cache;      /* peer requests */
  extern struct kmem_cache *drbd_bm_ext_cache;  /* bitmap extents */
  extern struct kmem_cache *drbd_al_ext_cache;  /* activity log extents */
  extern mempool_t *drbd_request_mempool;
@@@ -1569,23 -1404,34 +1403,34 @@@ extern struct bio *bio_alloc_drbd(gfp_
  
  extern rwlock_t global_state_lock;
  
- extern struct drbd_conf *drbd_new_device(unsigned int minor);
- extern void drbd_free_mdev(struct drbd_conf *mdev);
+ extern int conn_lowest_minor(struct drbd_tconn *tconn);
+ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr);
+ extern void drbd_minor_destroy(struct kref *kref);
+ extern int set_resource_options(struct drbd_tconn *tconn, struct res_opts *res_opts);
+ extern struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts);
+ extern void conn_destroy(struct kref *kref);
+ struct drbd_tconn *conn_get_by_name(const char *name);
+ extern struct drbd_tconn *conn_get_by_addrs(void *my_addr, int my_addr_len,
+                                           void *peer_addr, int peer_addr_len);
+ extern void conn_free_crypto(struct drbd_tconn *tconn);
  
  extern int proc_details;
  
  /* drbd_req */
 -extern int drbd_make_request(struct request_queue *q, struct bio *bio);
+ extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long);
 +extern void drbd_make_request(struct request_queue *q, struct bio *bio);
  extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
  extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
  extern int is_valid_ar_handle(struct drbd_request *, sector_t);
  
  
  /* drbd_nl.c */
+ extern int drbd_msg_put_info(const char *info);
  extern void drbd_suspend_io(struct drbd_conf *mdev);
  extern void drbd_resume_io(struct drbd_conf *mdev);
  extern char *ppsize(char *buf, unsigned long long size);
- extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int);
+ extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int);
  enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
  extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
  extern void resync_after_online_grow(struct drbd_conf *);
@@@ -1593,13 -1439,14 +1438,14 @@@ extern void drbd_reconsider_max_bio_siz
  extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev,
                                        enum drbd_role new_role,
                                        int force);
- extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
- extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev);
+ extern bool conn_try_outdate_peer(struct drbd_tconn *tconn);
+ extern void conn_try_outdate_peer_async(struct drbd_tconn *tconn);
  extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
  
  /* drbd_worker.c */
  extern int drbd_worker(struct drbd_thread *thi);
- extern int drbd_alter_sa(struct drbd_conf *mdev, int na);
+ enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor);
+ void drbd_resync_after_changed(struct drbd_conf *mdev);
  extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side);
  extern void resume_next_sg(struct drbd_conf *mdev);
  extern void suspend_other_sg(struct drbd_conf *mdev);
@@@ -1608,13 -1455,13 +1454,13 @@@ extern int drbd_resync_finished(struct 
  extern void *drbd_md_get_buffer(struct drbd_conf *mdev);
  extern void drbd_md_put_buffer(struct drbd_conf *mdev);
  extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
-                               struct drbd_backing_dev *bdev, sector_t sector, int rw);
+               struct drbd_backing_dev *bdev, sector_t sector, int rw);
+ extern void drbd_ov_out_of_sync_found(struct drbd_conf *, sector_t, int);
  extern void wait_until_done_or_force_detached(struct drbd_conf *mdev,
                struct drbd_backing_dev *bdev, unsigned int *done);
- extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
  extern void drbd_rs_controller_reset(struct drbd_conf *mdev);
  
- static inline void ov_oos_print(struct drbd_conf *mdev)
+ static inline void ov_out_of_sync_print(struct drbd_conf *mdev)
  {
        if (mdev->ov_last_oos_size) {
                dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n",
  
  
  extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
- extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, struct drbd_epoch_entry *, void *);
+ extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *,
+                        struct drbd_peer_request *, void *);
  /* worker callbacks */
- extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
- extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
- extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int);
- extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int);
- extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int);
- extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int);
- extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
- extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
- extern int w_resync_timer(struct drbd_conf *, struct drbd_work *, int);
- extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
- extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
- extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
- extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
- extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
- extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
- extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
- extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int);
- extern int w_send_oos(struct drbd_conf *, struct drbd_work *, int);
- extern int w_start_resync(struct drbd_conf *, struct drbd_work *, int);
+ extern int w_e_end_data_req(struct drbd_work *, int);
+ extern int w_e_end_rsdata_req(struct drbd_work *, int);
+ extern int w_e_end_csum_rs_req(struct drbd_work *, int);
+ extern int w_e_end_ov_reply(struct drbd_work *, int);
+ extern int w_e_end_ov_req(struct drbd_work *, int);
+ extern int w_ov_finished(struct drbd_work *, int);
+ extern int w_resync_timer(struct drbd_work *, int);
+ extern int w_send_write_hint(struct drbd_work *, int);
+ extern int w_make_resync_request(struct drbd_work *, int);
+ extern int w_send_dblock(struct drbd_work *, int);
+ extern int w_send_read_req(struct drbd_work *, int);
+ extern int w_prev_work_done(struct drbd_work *, int);
+ extern int w_e_reissue(struct drbd_work *, int);
+ extern int w_restart_disk_io(struct drbd_work *, int);
+ extern int w_send_out_of_sync(struct drbd_work *, int);
+ extern int w_start_resync(struct drbd_work *, int);
  
  extern void resync_timer_fn(unsigned long data);
  extern void start_resync_timer_fn(unsigned long data);
  
  /* drbd_receiver.c */
  extern int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector);
- extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
-               const unsigned rw, const int fault_type);
- extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
- extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
-                                           u64 id,
-                                           sector_t sector,
-                                           unsigned int data_size,
-                                           gfp_t gfp_mask) __must_hold(local);
- extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
-               int is_net);
- #define drbd_free_ee(m,e)     drbd_free_some_ee(m, e, 0)
- #define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1)
- extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
-               struct list_head *head);
- extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
-               struct list_head *head);
+ extern int drbd_submit_peer_request(struct drbd_conf *,
+                                   struct drbd_peer_request *, const unsigned,
+                                   const int);
+ extern int drbd_free_peer_reqs(struct drbd_conf *, struct list_head *);
+ extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_conf *, u64,
+                                                    sector_t, unsigned int,
+                                                    gfp_t) __must_hold(local);
+ extern void __drbd_free_peer_req(struct drbd_conf *, struct drbd_peer_request *,
+                                int);
+ #define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0)
+ #define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1)
+ extern struct page *drbd_alloc_pages(struct drbd_conf *, unsigned int, bool);
  extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
  extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
- extern void drbd_flush_workqueue(struct drbd_conf *mdev);
- extern void drbd_free_tl_hash(struct drbd_conf *mdev);
+ extern void conn_flush_workqueue(struct drbd_tconn *tconn);
+ extern int drbd_connected(struct drbd_conf *mdev);
+ static inline void drbd_flush_workqueue(struct drbd_conf *mdev)
+ {
+       conn_flush_workqueue(mdev->tconn);
+ }
  
- /* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
-  * mess with get_fs/set_fs, we know we are KERNEL_DS always. */
+ /* Yes, there is kernel_setsockopt, but only since 2.6.18.
+  * So we have our own copy of it here. */
  static inline int drbd_setsockopt(struct socket *sock, int level, int optname,
-                       char __user *optval, int optlen)
+                                 char *optval, int optlen)
  {
+       mm_segment_t oldfs = get_fs();
+       char __user *uoptval;
        int err;
+       uoptval = (char __user __force *)optval;
+       set_fs(KERNEL_DS);
        if (level == SOL_SOCKET)
-               err = sock_setsockopt(sock, level, optname, optval, optlen);
+               err = sock_setsockopt(sock, level, optname, uoptval, optlen);
        else
-               err = sock->ops->setsockopt(sock, level, optname, optval,
+               err = sock->ops->setsockopt(sock, level, optname, uoptval,
                                            optlen);
+       set_fs(oldfs);
        return err;
  }
  
  static inline void drbd_tcp_cork(struct socket *sock)
  {
-       int __user val = 1;
+       int val = 1;
        (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
-                       (char __user *)&val, sizeof(val));
+                       (char *)&val, sizeof(val));
  }
  
  static inline void drbd_tcp_uncork(struct socket *sock)
  {
-       int __user val = 0;
+       int val = 0;
        (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
-                       (char __user *)&val, sizeof(val));
+                       (char *)&val, sizeof(val));
  }
  
  static inline void drbd_tcp_nodelay(struct socket *sock)
  {
-       int __user val = 1;
+       int val = 1;
        (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY,
-                       (char __user *)&val, sizeof(val));
+                       (char *)&val, sizeof(val));
  }
  
  static inline void drbd_tcp_quickack(struct socket *sock)
  {
-       int __user val = 2;
+       int val = 2;
        (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
-                       (char __user *)&val, sizeof(val));
+                       (char *)&val, sizeof(val));
  }
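
A sketch of the intended cork/uncork pattern around a batch of small sends; the helper below is hypothetical, while drbd_send_all() is the function declared earlier in this header (assuming its usual 0-on-success return):

static int send_two_packets_sketch(struct drbd_tconn *tconn,
				   void *p1, size_t s1, void *p2, size_t s2)
{
	struct socket *sock = tconn->data.socket;
	int err;

	/* TCP_CORK: let small writes coalesce into full segments. */
	drbd_tcp_cork(sock);
	err = drbd_send_all(tconn, sock, p1, s1, 0);
	if (!err)
		err = drbd_send_all(tconn, sock, p2, s2, 0);
	/* Push out whatever is still queued. */
	drbd_tcp_uncork(sock);
	return err;
}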
  
- void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo);
+ void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo);
  
  /* drbd_proc.c */
  extern struct proc_dir_entry *drbd_proc;
@@@ -1725,8 -1577,8 +1576,8 @@@ extern const char *drbd_conn_str(enum d
  extern const char *drbd_role_str(enum drbd_role s);
  
  /* drbd_actlog.c */
- extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector);
- extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector);
+ extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i);
+ extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i);
  extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
  extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
  extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
@@@ -1734,7 -1586,6 +1585,6 @@@ extern void drbd_rs_cancel_all(struct d
  extern int drbd_rs_del_all(struct drbd_conf *mdev);
  extern void drbd_rs_failed_io(struct drbd_conf *mdev,
                sector_t sector, int size);
- extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *);
  extern void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go);
  extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector,
                int size, const char *file, const unsigned int line);
@@@ -1744,73 -1595,24 +1594,24 @@@ extern int __drbd_set_out_of_sync(struc
                int size, const char *file, const unsigned int line);
  #define drbd_set_out_of_sync(mdev, sector, size) \
        __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
- extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
  extern void drbd_al_shrink(struct drbd_conf *mdev);
  
  /* drbd_nl.c */
- void drbd_nl_cleanup(void);
- int __init drbd_nl_init(void);
- void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state);
- void drbd_bcast_sync_progress(struct drbd_conf *mdev);
- void drbd_bcast_ee(struct drbd_conf *mdev,
-               const char *reason, const int dgs,
-               const char* seen_hash, const char* calc_hash,
-               const struct drbd_epoch_entry* e);
- /**
-  * DOC: DRBD State macros
-  *
-  * These macros are used to express state changes in easily readable form.
-  *
-  * The NS macros expand to a mask and a value, that can be bit ored onto the
-  * current state as soon as the spinlock (req_lock) was taken.
-  *
-  * The _NS macros are used for state functions that get called with the
-  * spinlock. These macros expand directly to the new state value.
-  *
-  * Besides the basic forms NS() and _NS() additional _?NS[23] are defined
-  * to express state changes that affect more than one aspect of the state.
-  *
-  * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
-  * Means that the network connection was established and that the peer
-  * is in secondary role.
-  */
- #define role_MASK R_MASK
- #define peer_MASK R_MASK
- #define disk_MASK D_MASK
- #define pdsk_MASK D_MASK
- #define conn_MASK C_MASK
- #define susp_MASK 1
- #define user_isp_MASK 1
- #define aftr_isp_MASK 1
- #define susp_nod_MASK 1
- #define susp_fen_MASK 1
- #define NS(T, S) \
-       ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
-       ({ union drbd_state val; val.i = 0; val.T = (S); val; })
- #define NS2(T1, S1, T2, S2) \
-       ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
-         mask.T2 = T2##_MASK; mask; }), \
-       ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
-         val.T2 = (S2); val; })
- #define NS3(T1, S1, T2, S2, T3, S3) \
-       ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
-         mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
-       ({ union drbd_state val;  val.i = 0; val.T1 = (S1); \
-         val.T2 = (S2); val.T3 = (S3); val; })
- #define _NS(D, T, S) \
-       D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T = (S); __ns; })
- #define _NS2(D, T1, S1, T2, S2) \
-       D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
-       __ns.T2 = (S2); __ns; })
- #define _NS3(D, T1, S1, T2, S2, T3, S3) \
-       D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
-       __ns.T2 = (S2); __ns.T3 = (S3); __ns; })
+ /* state info broadcast */
+ struct sib_info {
+       enum drbd_state_info_bcast_reason sib_reason;
+       union {
+               struct {
+                       char *helper_name;
+                       unsigned helper_exit_code;
+               };
+               struct {
+                       union drbd_state os;
+                       union drbd_state ns;
+               };
+       };
+ };
+ void drbd_bcast_event(struct drbd_conf *mdev, const struct sib_info *sib);
  
  /*
   * inline helper functions
@@@ -1827,9 -1629,10 +1628,10 @@@ static inline struct page *page_chain_n
  #define page_chain_for_each_safe(page, n) \
        for (; page && ({ n = page_chain_next(page); 1; }); page = n)
  
- static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
+ static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req)
  {
-       struct page *page = e->pages;
+       struct page *page = peer_req->pages;
        page_chain_for_each(page) {
                if (page_count(page) > 1)
                        return 1;
        return 0;
  }
  
- static inline void drbd_state_lock(struct drbd_conf *mdev)
- {
-       wait_event(mdev->misc_wait,
-                  !drbd_test_and_set_flag(mdev, CLUSTER_ST_CHANGE));
- }
- static inline void drbd_state_unlock(struct drbd_conf *mdev)
- {
-       drbd_clear_flag(mdev, CLUSTER_ST_CHANGE);
-       wake_up(&mdev->misc_wait);
- }
  static inline enum drbd_state_rv
  _drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
                enum chg_state_flags flags, struct completion *done)
        return rv;
  }
  
- /**
-  * drbd_request_state() - Request a state change
-  * @mdev:     DRBD device.
-  * @mask:     mask of state bits to change.
-  * @val:      value of new state bits.
-  *
-  * This is the most graceful way of requesting a state change. It is
-  * quite verbose in case the state change is not possible, and all those
-  * state changes are globally serialized.
-  */
- static inline int drbd_request_state(struct drbd_conf *mdev,
-                                    union drbd_state mask,
-                                    union drbd_state val)
+ static inline union drbd_state drbd_read_state(struct drbd_conf *mdev)
  {
-       return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
+       union drbd_state rv;
+       rv.i = mdev->state.i;
+       rv.susp = mdev->tconn->susp;
+       rv.susp_nod = mdev->tconn->susp_nod;
+       rv.susp_fen = mdev->tconn->susp_fen;
+       return rv;
  }
  
  enum drbd_force_detach_flags {
@@@ -1891,8 -1677,13 +1676,13 @@@ static inline void __drbd_chk_io_error_
                enum drbd_force_detach_flags df,
                const char *where)
  {
-       switch (mdev->ldev->dc.on_io_error) {
-       case EP_PASS_ON:
+       enum drbd_io_error_p ep;
+       rcu_read_lock();
+       ep = rcu_dereference(mdev->ldev->disk_conf)->on_io_error;
+       rcu_read_unlock();
+       switch (ep) {
+       case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */
                if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) {
                        if (__ratelimit(&drbd_ratelimit_state))
                                dev_err(DEV, "Local IO failed in %s.\n", where);
                 * we read meta data only once during attach,
                 * which will fail in case of errors.
                 */
-               drbd_set_flag(mdev, WAS_IO_ERROR);
+               set_bit(WAS_IO_ERROR, &mdev->flags);
                if (df == DRBD_READ_ERROR)
-                       drbd_set_flag(mdev, WAS_READ_ERROR);
+                       set_bit(WAS_READ_ERROR, &mdev->flags);
                if (df == DRBD_FORCE_DETACH)
-                       drbd_set_flag(mdev, FORCE_DETACH);
+                       set_bit(FORCE_DETACH, &mdev->flags);
                if (mdev->state.disk > D_FAILED) {
                        _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
                        dev_err(DEV,
@@@ -1951,9 -1742,9 +1741,9 @@@ static inline void drbd_chk_io_error_(s
  {
        if (error) {
                unsigned long flags;
-               spin_lock_irqsave(&mdev->req_lock, flags);
+               spin_lock_irqsave(&mdev->tconn->req_lock, flags);
                __drbd_chk_io_error_(mdev, forcedetach, where);
-               spin_unlock_irqrestore(&mdev->req_lock, flags);
+               spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
        }
  }
  
   * BTW, for internal meta data, this happens to be the maximum capacity
   * we could agree upon with our peer node.
   */
- static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
+ static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev)
  {
-       switch (bdev->dc.meta_dev_idx) {
+       switch (meta_dev_idx) {
        case DRBD_MD_INDEX_INTERNAL:
        case DRBD_MD_INDEX_FLEX_INT:
                return bdev->md.md_offset + bdev->md.bm_offset;
        }
  }
  
+ static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
+ {
+       int meta_dev_idx;
+       rcu_read_lock();
+       meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
+       rcu_read_unlock();
+       return _drbd_md_first_sector(meta_dev_idx, bdev);
+ }
  /**
   * drbd_md_last_sector() - Return the last sector number of the meta data area
   * @bdev:     Meta data block device.
   */
  static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
  {
-       switch (bdev->dc.meta_dev_idx) {
+       int meta_dev_idx;
+       rcu_read_lock();
+       meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
+       rcu_read_unlock();
+       switch (meta_dev_idx) {
        case DRBD_MD_INDEX_INTERNAL:
        case DRBD_MD_INDEX_FLEX_INT:
                return bdev->md.md_offset + MD_AL_OFFSET - 1;
@@@ -2011,12 -1819,18 +1818,18 @@@ static inline sector_t drbd_get_capacit
  static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
  {
        sector_t s;
-       switch (bdev->dc.meta_dev_idx) {
+       int meta_dev_idx;
+       rcu_read_lock();
+       meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
+       rcu_read_unlock();
+       switch (meta_dev_idx) {
        case DRBD_MD_INDEX_INTERNAL:
        case DRBD_MD_INDEX_FLEX_INT:
                s = drbd_get_capacity(bdev->backing_bdev)
                        ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
-                                       drbd_md_first_sector(bdev))
+                               _drbd_md_first_sector(meta_dev_idx, bdev))
                        : 0;
                break;
        case DRBD_MD_INDEX_FLEX_EXT:
  static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
                                    struct drbd_backing_dev *bdev)
  {
-       switch (bdev->dc.meta_dev_idx) {
+       int meta_dev_idx;
+       rcu_read_lock();
+       meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
+       rcu_read_unlock();
+       switch (meta_dev_idx) {
        default: /* external, some index */
-               return MD_RESERVED_SECT * bdev->dc.meta_dev_idx;
+               return MD_RESERVED_SECT * meta_dev_idx;
        case DRBD_MD_INDEX_INTERNAL:
                /* with drbd08, internal meta data is always "flexible" */
        case DRBD_MD_INDEX_FLEX_INT:
@@@ -2070,9 -1890,8 +1889,8 @@@ drbd_queue_work_front(struct drbd_work_
        unsigned long flags;
        spin_lock_irqsave(&q->q_lock, flags);
        list_add(&w->list, &q->q);
-       up(&q->s); /* within the spinlock,
-                     see comment near end of drbd_worker() */
        spin_unlock_irqrestore(&q->q_lock, flags);
+       wake_up(&q->q_wait);
  }
  
  static inline void
@@@ -2081,41 -1900,35 +1899,35 @@@ drbd_queue_work(struct drbd_work_queue 
        unsigned long flags;
        spin_lock_irqsave(&q->q_lock, flags);
        list_add_tail(&w->list, &q->q);
-       up(&q->s); /* within the spinlock,
-                     see comment near end of drbd_worker() */
        spin_unlock_irqrestore(&q->q_lock, flags);
+       wake_up(&q->q_wait);
  }
  
- static inline void wake_asender(struct drbd_conf *mdev)
- {
-       if (drbd_test_flag(mdev, SIGNAL_ASENDER))
-               force_sig(DRBD_SIG, mdev->asender.task);
- }
- static inline void request_ping(struct drbd_conf *mdev)
+ static inline void wake_asender(struct drbd_tconn *tconn)
  {
-       drbd_set_flag(mdev, SEND_PING);
-       wake_asender(mdev);
+       if (test_bit(SIGNAL_ASENDER, &tconn->flags))
+               force_sig(DRBD_SIG, tconn->asender.task);
  }
  
- static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
-       enum drbd_packets cmd)
+ static inline void request_ping(struct drbd_tconn *tconn)
  {
-       struct p_header80 h;
-       return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
+       set_bit(SEND_PING, &tconn->flags);
+       wake_asender(tconn);
  }
  
- static inline int drbd_send_ping(struct drbd_conf *mdev)
- {
-       struct p_header80 h;
-       return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
- }
+ extern void *conn_prepare_command(struct drbd_tconn *, struct drbd_socket *);
+ extern void *drbd_prepare_command(struct drbd_conf *, struct drbd_socket *);
+ extern int conn_send_command(struct drbd_tconn *, struct drbd_socket *,
+                            enum drbd_packet, unsigned int, void *,
+                            unsigned int);
+ extern int drbd_send_command(struct drbd_conf *, struct drbd_socket *,
+                            enum drbd_packet, unsigned int, void *,
+                            unsigned int);
  
- static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
- {
-       struct p_header80 h;
-       return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
- }
+ extern int drbd_send_ping(struct drbd_tconn *tconn);
+ extern int drbd_send_ping_ack(struct drbd_tconn *tconn);
+ extern int drbd_send_state_req(struct drbd_conf *, union drbd_state, union drbd_state);
+ extern int conn_send_state_req(struct drbd_tconn *, union drbd_state, union drbd_state);
  
  static inline void drbd_thread_stop(struct drbd_thread *thi)
  {
@@@ -2137,21 -1950,21 +1949,21 @@@ static inline void drbd_thread_restart_
   * or implicit barrier packets as necessary.
   * increased:
   *  w_send_barrier
-  *  _req_mod(req, queue_for_net_write or queue_for_net_read);
+  *  _req_mod(req, QUEUE_FOR_NET_WRITE or QUEUE_FOR_NET_READ);
   *    it is much easier and equally valid to count what we queue for the
   *    worker, even before it actually was queued or sent.
   *    (drbd_make_request_common; recovery path on read io-error)
   * decreased:
   *  got_BarrierAck (respective tl_clear, tl_clear_barrier)
-  *  _req_mod(req, data_received)
+  *  _req_mod(req, DATA_RECEIVED)
   *     [from receive_DataReply]
-  *  _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked)
+  *  _req_mod(req, WRITE_ACKED_BY_PEER or RECV_ACKED_BY_PEER or NEG_ACKED)
   *     [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)]
   *     for some reason it is NOT decreased in got_NegAck,
   *     but in the resulting cleanup code from report_params.
   *     we should try to remember the reason for that...
-  *  _req_mod(req, send_failed or send_canceled)
-  *  _req_mod(req, connection_lost_while_pending)
+  *  _req_mod(req, SEND_FAILED or SEND_CANCELED)
+  *  _req_mod(req, CONNECTION_LOST_WHILE_PENDING)
   *     [from tl_clear_barrier]
   */
  static inline void inc_ap_pending(struct drbd_conf *mdev)
        atomic_inc(&mdev->ap_pending_cnt);
  }
  
- #define ERR_IF_CNT_IS_NEGATIVE(which)                         \
-       if (atomic_read(&mdev->which) < 0)                      \
+ #define ERR_IF_CNT_IS_NEGATIVE(which, func, line)                     \
+       if (atomic_read(&mdev->which) < 0)                              \
                dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n",       \
-                   __func__ , __LINE__ ,                       \
-                   atomic_read(&mdev->which))
+                       func, line,                                     \
+                       atomic_read(&mdev->which))
  
- #define dec_ap_pending(mdev)  do {                            \
-       typecheck(struct drbd_conf *, mdev);                    \
-       if (atomic_dec_and_test(&mdev->ap_pending_cnt))         \
-               wake_up(&mdev->misc_wait);                      \
-       ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0)
+ #define dec_ap_pending(mdev) _dec_ap_pending(mdev, __FUNCTION__, __LINE__)
+ static inline void _dec_ap_pending(struct drbd_conf *mdev, const char *func, int line)
+ {
+       if (atomic_dec_and_test(&mdev->ap_pending_cnt))
+               wake_up(&mdev->misc_wait);
+       ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt, func, line);
+ }
  
  /* counts how many resync-related answers we still expect from the peer
   *                 increase                   decrease
@@@ -2182,10 -1997,12 +1996,12 @@@ static inline void inc_rs_pending(struc
        atomic_inc(&mdev->rs_pending_cnt);
  }
  
- #define dec_rs_pending(mdev)  do {                            \
-       typecheck(struct drbd_conf *, mdev);                    \
-       atomic_dec(&mdev->rs_pending_cnt);                      \
-       ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0)
+ #define dec_rs_pending(mdev) _dec_rs_pending(mdev, __FUNCTION__, __LINE__)
+ static inline void _dec_rs_pending(struct drbd_conf *mdev, const char *func, int line)
+ {
+       atomic_dec(&mdev->rs_pending_cnt);
+       ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt, func, line);
+ }
  
  /* counts how many answers we still need to send to the peer.
   * increased on
@@@ -2201,38 -2018,18 +2017,18 @@@ static inline void inc_unacked(struct d
        atomic_inc(&mdev->unacked_cnt);
  }
  
- #define dec_unacked(mdev)     do {                            \
-       typecheck(struct drbd_conf *, mdev);                    \
-       atomic_dec(&mdev->unacked_cnt);                         \
-       ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
- #define sub_unacked(mdev, n)  do {                            \
-       typecheck(struct drbd_conf *, mdev);                    \
-       atomic_sub(n, &mdev->unacked_cnt);                      \
-       ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
- static inline void put_net_conf(struct drbd_conf *mdev)
+ #define dec_unacked(mdev) _dec_unacked(mdev, __FUNCTION__, __LINE__)
+ static inline void _dec_unacked(struct drbd_conf *mdev, const char *func, int line)
  {
-       if (atomic_dec_and_test(&mdev->net_cnt))
-               wake_up(&mdev->net_cnt_wait);
+       atomic_dec(&mdev->unacked_cnt);
+       ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
  }
  
- /**
-  * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there
-  * @mdev:     DRBD device.
-  *
-  * You have to call put_net_conf() when finished working with mdev->net_conf.
-  */
- static inline int get_net_conf(struct drbd_conf *mdev)
+ #define sub_unacked(mdev, n) _sub_unacked(mdev, n, __FUNCTION__, __LINE__)
+ static inline void _sub_unacked(struct drbd_conf *mdev, int n, const char *func, int line)
  {
-       int have_net_conf;
-       atomic_inc(&mdev->net_cnt);
-       have_net_conf = mdev->state.conn >= C_UNCONNECTED;
-       if (!have_net_conf)
-               put_net_conf(mdev);
-       return have_net_conf;
+       atomic_sub(n, &mdev->unacked_cnt);
+       ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
  }
  
  /**
@@@ -2336,17 -2133,20 +2132,20 @@@ static inline void drbd_get_syncer_prog
   * maybe re-implement using semaphores? */
  static inline int drbd_get_max_buffers(struct drbd_conf *mdev)
  {
-       int mxb = 1000000; /* arbitrary limit on open requests */
-       if (get_net_conf(mdev)) {
-               mxb = mdev->net_conf->max_buffers;
-               put_net_conf(mdev);
-       }
+       struct net_conf *nc;
+       int mxb;
+       rcu_read_lock();
+       nc = rcu_dereference(mdev->tconn->net_conf);
+       mxb = nc ? nc->max_buffers : 1000000;  /* arbitrary limit on open requests */
+       rcu_read_unlock();
        return mxb;
  }
  
  static inline int drbd_state_is_stable(struct drbd_conf *mdev)
  {
-       union drbd_state s = mdev->state;
+       union drbd_dev_state s = mdev->state;
  
        /* DO NOT add a default clause, we want the compiler to warn us
         * for any newly introduced state we may have forgotten to add here */
  
                /* Allow IO in BM exchange states with new protocols */
        case C_WF_BITMAP_S:
-               if (mdev->agreed_pro_version < 96)
+               if (mdev->tconn->agreed_pro_version < 96)
                        return 0;
                break;
  
                /* disk state is stable as well. */
                break;
  
-       /* no new io accepted during tansitional states */
+       /* no new io accepted during transitional states */
        case D_ATTACHING:
        case D_NEGOTIATING:
        case D_UNKNOWN:
        return 1;
  }
  
- static inline int is_susp(union drbd_state s)
+ static inline int drbd_suspended(struct drbd_conf *mdev)
  {
-       return s.susp || s.susp_nod || s.susp_fen;
+       struct drbd_tconn *tconn = mdev->tconn;
+       return tconn->susp || tconn->susp_fen || tconn->susp_nod;
  }
  
  static inline bool may_inc_ap_bio(struct drbd_conf *mdev)
  {
        int mxb = drbd_get_max_buffers(mdev);
  
-       if (is_susp(mdev->state))
+       if (drbd_suspended(mdev))
                return false;
-       if (drbd_test_flag(mdev, SUSPEND_IO))
+       if (test_bit(SUSPEND_IO, &mdev->flags))
                return false;
  
        /* to avoid potential deadlock or bitmap corruption,
         * and we are within the spinlock anyways, we have this workaround.  */
        if (atomic_read(&mdev->ap_bio_cnt) > mxb)
                return false;
-       if (drbd_test_flag(mdev, BITMAP_IO))
+       if (test_bit(BITMAP_IO, &mdev->flags))
                return false;
        return true;
  }
  
- static inline bool inc_ap_bio_cond(struct drbd_conf *mdev, int count)
+ static inline bool inc_ap_bio_cond(struct drbd_conf *mdev)
  {
        bool rv = false;
  
-       spin_lock_irq(&mdev->req_lock);
+       spin_lock_irq(&mdev->tconn->req_lock);
        rv = may_inc_ap_bio(mdev);
        if (rv)
-               atomic_add(count, &mdev->ap_bio_cnt);
-       spin_unlock_irq(&mdev->req_lock);
+               atomic_inc(&mdev->ap_bio_cnt);
+       spin_unlock_irq(&mdev->tconn->req_lock);
  
        return rv;
  }
  
- static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
+ static inline void inc_ap_bio(struct drbd_conf *mdev)
  {
        /* we wait here
         *    as long as the device is suspended
         *    until the bitmap is no longer on the fly during connection
-        *    handshake as long as we would exeed the max_buffer limit.
+        *    handshake as long as we would exceed the max_buffer limit.
         *
         * to avoid races with the reconnect code,
         * we need to atomic_inc within the spinlock. */
  
-       wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev, count));
+       wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev));
  }
  
  static inline void dec_ap_bio(struct drbd_conf *mdev)
  
        D_ASSERT(ap_bio >= 0);
  
-       if (ap_bio == 0 && drbd_test_flag(mdev, BITMAP_IO)) {
-               if (!drbd_test_and_set_flag(mdev, BITMAP_IO_QUEUED))
-                       drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
+       if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
+               if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
+                       drbd_queue_work(&mdev->tconn->sender_work, &mdev->bm_io_work.w);
        }
  
        /* this currently does wake_up for every dec_ap_bio!
                wake_up(&mdev->misc_wait);
  }
  
+ static inline bool verify_can_do_stop_sector(struct drbd_conf *mdev)
+ {
+       return mdev->tconn->agreed_pro_version >= 97 &&
+               mdev->tconn->agreed_pro_version != 100;
+ }
  static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
  {
        int changed = mdev->ed_uuid != val;
        return changed;
  }
  
- static inline int seq_cmp(u32 a, u32 b)
- {
-       /* we assume wrap around at 32bit.
-        * for wrap around at 24bit (old atomic_t),
-        * we'd have to
-        *  a <<= 8; b <<= 8;
-        */
-       return (s32)(a) - (s32)(b);
- }
- #define seq_lt(a, b) (seq_cmp((a), (b)) < 0)
- #define seq_gt(a, b) (seq_cmp((a), (b)) > 0)
- #define seq_ge(a, b) (seq_cmp((a), (b)) >= 0)
- #define seq_le(a, b) (seq_cmp((a), (b)) <= 0)
- /* CAUTION: please no side effects in arguments! */
- #define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b)))
- static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq)
- {
-       unsigned int m;
-       spin_lock(&mdev->peer_seq_lock);
-       m = seq_max(mdev->peer_seq, new_seq);
-       mdev->peer_seq = m;
-       spin_unlock(&mdev->peer_seq_lock);
-       if (m == new_seq)
-               wake_up(&mdev->seq_wait);
- }
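
[Editor's note: the seq_cmp() being removed above relies on the usual serial-number trick: subtracting in 32 bits and interpreting the result as signed makes a "slightly newer" sequence number compare as greater even across the 2^32 wrap. For example, (s32)5 - (s32)0xfffffffe = 5 - (-2) = 7 > 0, so seq_gt(5, 0xfffffffe) holds although 5 is numerically smaller. A self-contained check of that property:]

    #include <assert.h>
    #include <stdint.h>

    static int32_t seq_cmp(uint32_t a, uint32_t b)
    {
            return (int32_t)a - (int32_t)b; /* wrap around at 32 bit, as above */
    }

    int main(void)
    {
            assert(seq_cmp(100, 50) > 0);           /* ordinary case */
            assert(seq_cmp(5, 0xfffffffeu) > 0);    /* 5 is "ahead" across the wrap */
            assert(seq_cmp(0xfffffffeu, 5) < 0);
            return 0;
    }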
- static inline void drbd_update_congested(struct drbd_conf *mdev)
- {
-       struct sock *sk = mdev->data.socket->sk;
-       if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
-               drbd_set_flag(mdev, NET_CONGESTED);
- }
  static inline int drbd_queue_order_type(struct drbd_conf *mdev)
  {
        /* sorry, we currently have no working implementation
@@@ -2545,15 -2319,46 +2318,46 @@@ static inline void drbd_md_flush(struc
  {
        int r;
  
-       if (drbd_test_flag(mdev, MD_NO_FUA))
+       if (mdev->ldev == NULL) {
+               dev_warn(DEV, "mdev->ldev == NULL in drbd_md_flush\n");
+               return;
+       }
+       if (test_bit(MD_NO_FUA, &mdev->flags))
                return;
  
        r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_NOIO, NULL);
        if (r) {
-               drbd_set_flag(mdev, MD_NO_FUA);
+               set_bit(MD_NO_FUA, &mdev->flags);
                dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
        }
  }
  
  #endif
+ /* This is defined in drivers/md/md.h as well. Should go into wait.h */
+ #define __wait_event_lock_irq(wq, condition, lock, cmd)               \
+ do {                                                                  \
+       wait_queue_t __wait;                                            \
+       init_waitqueue_entry(&__wait, current);                         \
+                                                                       \
+       add_wait_queue(&wq, &__wait);                                   \
+       for (;;) {                                                      \
+               set_current_state(TASK_UNINTERRUPTIBLE);                \
+               if (condition)                                          \
+                       break;                                          \
+               spin_unlock_irq(&lock);                                 \
+               cmd;                                                    \
+               schedule();                                             \
+               spin_lock_irq(&lock);                                   \
+       }                                                               \
+       current->state = TASK_RUNNING;                                  \
+       remove_wait_queue(&wq, &__wait);                                \
+ } while (0)
+ #define wait_event_lock_irq(wq, condition, lock, cmd)                         \
+ do {                                                                  \
+       if (condition)                                                  \
+               break;                                                  \
+       __wait_event_lock_irq(wq, condition, lock, cmd);                \
+ } while (0)
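
[Editor's note: wait_event_lock_irq() above sleeps until condition becomes true while the caller holds a spinlock with interrupts disabled; the lock is dropped around cmd and schedule() and retaken before the condition is re-checked. The userspace shape of the same idea is a condition variable; a minimal analogue, with a pthread mutex standing in for the irq-disabled spinlock and no equivalent of cmd:]

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t wq = PTHREAD_COND_INITIALIZER;
    static bool condition;

    static void wait_for_condition(void)
    {
            pthread_mutex_lock(&lock);
            while (!condition)                      /* re-check after every wakeup */
                    pthread_cond_wait(&wq, &lock);  /* drops the lock while asleep */
            pthread_mutex_unlock(&lock);
    }

    static void make_condition_true(void)
    {
            pthread_mutex_lock(&lock);
            condition = true;
            pthread_cond_broadcast(&wq);
            pthread_mutex_unlock(&lock);
    }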
index 0000000000000000000000000000000000000000,0e53f102e68ab33574bf4f8c9304f404a809707a..89c497c630b4ee60839571aabf59c9ad553fb229
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,177 +1,207 @@@
 - * update_interval_end  -  recompute end of @node
++#include <asm/bug.h>
++#include <linux/rbtree_augmented.h>
+ #include "drbd_interval.h"
+ /**
+  * interval_end  -  return end of @node
+  */
+ static inline
+ sector_t interval_end(struct rb_node *node)
+ {
+       struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb);
+       return this->end;
+ }
+ /**
 -static void
 -update_interval_end(struct rb_node *node, void *__unused)
++ * compute_subtree_last  -  compute end of @node
+  *
+  * The end of an interval is the highest (start + (size >> 9)) value of this
+  * node and of its children.  Called for @node and its parents whenever the end
+  * may have changed.
+  */
 -      struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb);
 -      sector_t end;
++static inline sector_t
++compute_subtree_last(struct drbd_interval *node)
+ {
 -      end = this->sector + (this->size >> 9);
 -      if (node->rb_left) {
 -              sector_t left = interval_end(node->rb_left);
 -              if (left > end)
 -                      end = left;
++      sector_t max = node->sector + (node->size >> 9);
 -      if (node->rb_right) {
 -              sector_t right = interval_end(node->rb_right);
 -              if (right > end)
 -                      end = right;
++      if (node->rb.rb_left) {
++              sector_t left = interval_end(node->rb.rb_left);
++              if (left > max)
++                      max = left;
++      }
++      if (node->rb.rb_right) {
++              sector_t right = interval_end(node->rb.rb_right);
++              if (right > max)
++                      max = right;
+       }
 -      this->end = end;
++      return max;
++}
++
++static void augment_propagate(struct rb_node *rb, struct rb_node *stop)
++{
++      while (rb != stop) {
++              struct drbd_interval *node = rb_entry(rb, struct drbd_interval, rb);
++              sector_t subtree_last = compute_subtree_last(node);
++              if (node->end == subtree_last)
++                      break;
++              node->end = subtree_last;
++              rb = rb_parent(&node->rb);
+       }
 -      rb_insert_color(&this->rb, root);
 -      rb_augment_insert(&this->rb, update_interval_end, NULL);
+ }
++static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new)
++{
++      struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb);
++      struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb);
++
++      new->end = old->end;
++}
++
++static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new)
++{
++      struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb);
++      struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb);
++
++      new->end = old->end;
++      old->end = compute_subtree_last(old);
++}
++
++static const struct rb_augment_callbacks augment_callbacks = {
++      augment_propagate,
++      augment_copy,
++      augment_rotate,
++};
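
[Editor's note: the three callbacks above maintain one invariant: every drbd_interval caches in ->end the maximum interval end anywhere in its subtree, so a search can prune whole subtrees (used by drbd_find_overlap() below). propagate recomputes the cache upward after an insert or erase; copy and rotate patch it when the rbtree core moves nodes during rebalancing. A toy version of the invariant on a plain, unbalanced binary tree; names here are illustrative only:]

    typedef unsigned long long sector_t;

    struct ival {
            sector_t sector, nr_sectors;    /* covers [sector, sector + nr_sectors) */
            sector_t end;                   /* max interval end in this subtree */
            struct ival *left, *right;
    };

    static sector_t subtree_end(const struct ival *n)
    {
            return n ? n->end : 0;
    }

    /* the same quantity compute_subtree_last() derives above: the node's
     * own end, maxed with the cached ends of both children */
    static void update_end(struct ival *n)
    {
            sector_t e = n->sector + n->nr_sectors;

            if (subtree_end(n->left) > e)
                    e = subtree_end(n->left);
            if (subtree_end(n->right) > e)
                    e = subtree_end(n->right);
            n->end = e;
    }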
++
+ /**
+  * drbd_insert_interval  -  insert a new interval into a tree
+  */
+ bool
+ drbd_insert_interval(struct rb_root *root, struct drbd_interval *this)
+ {
+       struct rb_node **new = &root->rb_node, *parent = NULL;
+       BUG_ON(!IS_ALIGNED(this->size, 512));
+       while (*new) {
+               struct drbd_interval *here =
+                       rb_entry(*new, struct drbd_interval, rb);
+               parent = *new;
+               if (this->sector < here->sector)
+                       new = &(*new)->rb_left;
+               else if (this->sector > here->sector)
+                       new = &(*new)->rb_right;
+               else if (this < here)
+                       new = &(*new)->rb_left;
+               else if (this > here)
+                       new = &(*new)->rb_right;
+               else
+                       return false;
+       }
+       rb_link_node(&this->rb, parent, new);
 -      struct rb_node *deepest;
 -
 -      deepest = rb_augment_erase_begin(&this->rb);
 -      rb_erase(&this->rb, root);
 -      rb_augment_erase_end(deepest, update_interval_end, NULL);
++      rb_insert_augmented(&this->rb, root, &augment_callbacks);
+       return true;
+ }
+ /**
+  * drbd_contains_interval  -  check if a tree contains a given interval
+  * @sector:   start sector of @interval
+  * @interval: may not be a valid pointer
+  *
+  * Returns whether the tree contains the node @interval with start sector @sector.
+  * Does not dereference @interval until @interval is known to be a valid object
+  * in @root.  Returns %false if @interval is in the tree but with a different
+  * sector number.
+  */
+ bool
+ drbd_contains_interval(struct rb_root *root, sector_t sector,
+                      struct drbd_interval *interval)
+ {
+       struct rb_node *node = root->rb_node;
+       while (node) {
+               struct drbd_interval *here =
+                       rb_entry(node, struct drbd_interval, rb);
+               if (sector < here->sector)
+                       node = node->rb_left;
+               else if (sector > here->sector)
+                       node = node->rb_right;
+               else if (interval < here)
+                       node = node->rb_left;
+               else if (interval > here)
+                       node = node->rb_right;
+               else
+                       return true;
+       }
+       return false;
+ }
+ /**
+  * drbd_remove_interval  -  remove an interval from a tree
+  */
+ void
+ drbd_remove_interval(struct rb_root *root, struct drbd_interval *this)
+ {
++      rb_erase_augmented(&this->rb, root, &augment_callbacks);
+ }
+ /**
+  * drbd_find_overlap  - search for an interval overlapping with [sector, sector + size)
+  * @sector:   start sector
+  * @size:     size, aligned to 512 bytes
+  *
+  * Returns an interval overlapping with [sector, sector + size), or NULL if
+  * there is none.  When there is more than one overlapping interval in the
+  * tree, the interval with the lowest start sector is returned, and all other
+  * overlapping intervals will be on the right side of the tree, reachable with
+  * rb_next().
+  */
+ struct drbd_interval *
+ drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size)
+ {
+       struct rb_node *node = root->rb_node;
+       struct drbd_interval *overlap = NULL;
+       sector_t end = sector + (size >> 9);
+       BUG_ON(!IS_ALIGNED(size, 512));
+       while (node) {
+               struct drbd_interval *here =
+                       rb_entry(node, struct drbd_interval, rb);
+               if (node->rb_left &&
+                   sector < interval_end(node->rb_left)) {
+                       /* Overlap if any must be on left side */
+                       node = node->rb_left;
+               } else if (here->sector < end &&
+                          sector < here->sector + (here->size >> 9)) {
+                       overlap = here;
+                       break;
+               } else if (sector >= here->sector) {
+                       /* Overlap if any must be on right side */
+                       node = node->rb_right;
+               } else
+                       break;
+       }
+       return overlap;
+ }
+ struct drbd_interval *
+ drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size)
+ {
+       sector_t end = sector + (size >> 9);
+       struct rb_node *node;
+       for (;;) {
+               node = rb_next(&i->rb);
+               if (!node)
+                       return NULL;
+               i = rb_entry(node, struct drbd_interval, rb);
+               if (i->sector >= end)
+                       return NULL;
+               if (sector < i->sector + (i->size >> 9))
+                       return i;
+       }
+ }
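
[Editor's note: taken together, drbd_find_overlap() returns the overlapping interval with the lowest start sector and leaves every other overlap reachable via rb_next(), while drbd_next_overlap() stops as soon as start sectors pass the end of the queried range. A hypothetical caller iterating all overlaps; this is a sketch against the declarations in drbd_interval.h, not code from this commit:]

    static void for_each_overlap(struct rb_root *root,
                                 sector_t sector, unsigned int size)
    {
            struct drbd_interval *i;

            for (i = drbd_find_overlap(root, sector, size); i;
                 i = drbd_next_overlap(i, sector, size)) {
                    /* i overlaps [sector, sector + (size >> 9)) here */
            }
    }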
index 9b833e0fb4409d4dd03ad25bbb7495aae7a928d0,be4f58277124071c950d9b596b4ad9fa0b893ec2..52de26daa1f6c4dceed2115dac702aa431d2d289
  
  #include "drbd_vli.h"
  
- struct after_state_chg_work {
-       struct drbd_work w;
-       union drbd_state os;
-       union drbd_state ns;
-       enum chg_state_flags flags;
-       struct completion *done;
- };
  static DEFINE_MUTEX(drbd_main_mutex);
  int drbdd_init(struct drbd_thread *);
  int drbd_worker(struct drbd_thread *);
@@@ -72,21 -64,17 +64,17 @@@ int drbd_asender(struct drbd_thread *)
  int drbd_init(void);
  static int drbd_open(struct block_device *bdev, fmode_t mode);
  static int drbd_release(struct gendisk *gd, fmode_t mode);
- static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
- static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
-                          union drbd_state ns, enum chg_state_flags flags);
- static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+ static int w_md_sync(struct drbd_work *w, int unused);
  static void md_sync_timer_fn(unsigned long data);
- static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
- static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
- static void _tl_clear(struct drbd_conf *mdev);
+ static int w_bitmap_io(struct drbd_work *w, int unused);
+ static int w_go_diskless(struct drbd_work *w, int unused);
  
  MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
              "Lars Ellenberg <lars@linbit.com>");
  MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
  MODULE_VERSION(REL_VERSION);
  MODULE_LICENSE("GPL");
- MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
+ MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices ("
                 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
  MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
  
@@@ -98,7 -86,6 +86,6 @@@ MODULE_PARM_DESC(allow_oos, "DONT USE!"
  module_param(minor_count, uint, 0444);
  module_param(disable_sendpage, bool, 0644);
  module_param(allow_oos, bool, 0);
- module_param(cn_idx, uint, 0444);
  module_param(proc_details, int, 0644);
  
  #ifdef CONFIG_DRBD_FAULT_INJECTION
@@@ -118,9 -105,8 +105,8 @@@ module_param(fault_devs, int, 0644)
  
  /* module parameter, defined */
  unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
 -int disable_sendpage;
 -int allow_oos;
 +bool disable_sendpage;
 +bool allow_oos;
- unsigned int cn_idx = CN_IDX_DRBD;
  int proc_details;       /* Detail level in /proc/drbd */
  
  /* Module parameter for setting the user mode helper program
@@@ -132,10 -118,11 +118,11 @@@ module_param_string(usermode_helper, us
  /* in 2.6.x, our device mapping and config info contains our virtual gendisks
   * as member "struct gendisk *vdisk;"
   */
- struct drbd_conf **minor_table;
+ struct idr minors;
+ struct list_head drbd_tconns;  /* list of struct drbd_tconn */
  
  struct kmem_cache *drbd_request_cache;
- struct kmem_cache *drbd_ee_cache;     /* epoch entries */
+ struct kmem_cache *drbd_ee_cache;     /* peer requests */
  struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
  struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
  mempool_t *drbd_request_mempool;
@@@ -158,1717 -145,295 +145,295 @@@ DEFINE_RATELIMIT_STATE(drbd_ratelimit_s
  
  static const struct block_device_operations drbd_ops = {
        .owner =   THIS_MODULE,
-       .open =    drbd_open,
-       .release = drbd_release,
- };
- struct bio *bio_alloc_drbd(gfp_t gfp_mask)
- {
-       if (!drbd_md_io_bio_set)
-               return bio_alloc(gfp_mask, 1);
-       return bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
- }
- #ifdef __CHECKER__
- /* When checking with sparse, and this is an inline function, sparse will
-    give tons of false positives. When this is a real functions sparse works.
-  */
- int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
- {
-       int io_allowed;
-       atomic_inc(&mdev->local_cnt);
-       io_allowed = (mdev->state.disk >= mins);
-       if (!io_allowed) {
-               if (atomic_dec_and_test(&mdev->local_cnt))
-                       wake_up(&mdev->misc_wait);
-       }
-       return io_allowed;
- }
- #endif
- /**
-  * DOC: The transfer log
-  *
-  * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
-  * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
-  * of the list. There is always at least one &struct drbd_tl_epoch object.
-  *
-  * Each &struct drbd_tl_epoch has a circular double linked list of requests
-  * attached.
-  */
- static int tl_init(struct drbd_conf *mdev)
- {
-       struct drbd_tl_epoch *b;
-       /* during device minor initialization, we may well use GFP_KERNEL */
-       b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
-       if (!b)
-               return 0;
-       INIT_LIST_HEAD(&b->requests);
-       INIT_LIST_HEAD(&b->w.list);
-       b->next = NULL;
-       b->br_number = 4711;
-       b->n_writes = 0;
-       b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
-       mdev->oldest_tle = b;
-       mdev->newest_tle = b;
-       INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
-       INIT_LIST_HEAD(&mdev->barrier_acked_requests);
-       mdev->tl_hash = NULL;
-       mdev->tl_hash_s = 0;
-       return 1;
- }
- static void tl_cleanup(struct drbd_conf *mdev)
- {
-       D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
-       D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
-       kfree(mdev->oldest_tle);
-       mdev->oldest_tle = NULL;
-       kfree(mdev->unused_spare_tle);
-       mdev->unused_spare_tle = NULL;
-       kfree(mdev->tl_hash);
-       mdev->tl_hash = NULL;
-       mdev->tl_hash_s = 0;
- }
- /**
-  * _tl_add_barrier() - Adds a barrier to the transfer log
-  * @mdev:     DRBD device.
-  * @new:      Barrier to be added before the current head of the TL.
-  *
-  * The caller must hold the req_lock.
-  */
- void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
- {
-       struct drbd_tl_epoch *newest_before;
-       INIT_LIST_HEAD(&new->requests);
-       INIT_LIST_HEAD(&new->w.list);
-       new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
-       new->next = NULL;
-       new->n_writes = 0;
-       newest_before = mdev->newest_tle;
-       new->br_number = newest_before->br_number+1;
-       if (mdev->newest_tle != new) {
-               mdev->newest_tle->next = new;
-               mdev->newest_tle = new;
-       }
- }
- /**
-  * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
-  * @mdev:     DRBD device.
-  * @barrier_nr:       Expected identifier of the DRBD write barrier packet.
-  * @set_size: Expected number of requests before that barrier.
-  *
-  * In case the passed barrier_nr or set_size does not match the oldest
-  * &struct drbd_tl_epoch objects this function will cause a termination
-  * of the connection.
-  */
- void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
-                      unsigned int set_size)
- {
-       struct drbd_tl_epoch *b, *nob; /* next old barrier */
-       struct list_head *le, *tle;
-       struct drbd_request *r;
-       spin_lock_irq(&mdev->req_lock);
-       b = mdev->oldest_tle;
-       /* first some paranoia code */
-       if (b == NULL) {
-               dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
-                       barrier_nr);
-               goto bail;
-       }
-       if (b->br_number != barrier_nr) {
-               dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
-                       barrier_nr, b->br_number);
-               goto bail;
-       }
-       if (b->n_writes != set_size) {
-               dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
-                       barrier_nr, set_size, b->n_writes);
-               goto bail;
-       }
-       /* Clean up list of requests processed during current epoch */
-       list_for_each_safe(le, tle, &b->requests) {
-               r = list_entry(le, struct drbd_request, tl_requests);
-               _req_mod(r, barrier_acked);
-       }
-       /* There could be requests on the list waiting for completion
-          of the write to the local disk. To avoid corruption of
-          slab's data structures we have to remove the list's head.
-          Also there could have been a barrier ack out of sequence, overtaking
-          the write acks - which would be a bug and would violate write ordering.
-          To avoid deadlock in case we lose connection while such requests are
-          still pending, we need some way to find them for the
-          _req_mod(connection_lost_while_pending).
-          These have been list_move'd to the out_of_sequence_requests list in
-          _req_mod(, barrier_acked) above.
-          */
-       list_splice_init(&b->requests, &mdev->barrier_acked_requests);
-       nob = b->next;
-       if (drbd_test_and_clear_flag(mdev, CREATE_BARRIER)) {
-               _tl_add_barrier(mdev, b);
-               if (nob)
-                       mdev->oldest_tle = nob;
-               /* if nob == NULL b was the only barrier, and becomes the new
-                  barrier. Therefore mdev->oldest_tle points already to b */
-       } else {
-               D_ASSERT(nob != NULL);
-               mdev->oldest_tle = nob;
-               kfree(b);
-       }
-       spin_unlock_irq(&mdev->req_lock);
-       dec_ap_pending(mdev);
-       return;
- bail:
-       spin_unlock_irq(&mdev->req_lock);
-       drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
- }
- /**
-  * _tl_restart() - Walks the transfer log, and applies an action to all requests
-  * @mdev:     DRBD device.
-  * @what:       The action/event to perform with all request objects
-  *
-  * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
-  * restart_frozen_disk_io.
-  */
- static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
- {
-       struct drbd_tl_epoch *b, *tmp, **pn;
-       struct list_head *le, *tle, carry_reads;
-       struct drbd_request *req;
-       int rv, n_writes, n_reads;
-       b = mdev->oldest_tle;
-       pn = &mdev->oldest_tle;
-       while (b) {
-               n_writes = 0;
-               n_reads = 0;
-               INIT_LIST_HEAD(&carry_reads);
-               list_for_each_safe(le, tle, &b->requests) {
-                       req = list_entry(le, struct drbd_request, tl_requests);
-                       rv = _req_mod(req, what);
-                       n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
-                       n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
-               }
-               tmp = b->next;
-               if (n_writes) {
-                       if (what == resend) {
-                               b->n_writes = n_writes;
-                               if (b->w.cb == NULL) {
-                                       b->w.cb = w_send_barrier;
-                                       inc_ap_pending(mdev);
-                                       drbd_set_flag(mdev, CREATE_BARRIER);
-                               }
-                               drbd_queue_work(&mdev->data.work, &b->w);
-                       }
-                       pn = &b->next;
-               } else {
-                       if (n_reads)
-                               list_add(&carry_reads, &b->requests);
-                       /* there could still be requests on that ring list,
-                        * in case local io is still pending */
-                       list_del(&b->requests);
-                       /* dec_ap_pending corresponding to queue_barrier.
-                        * the newest barrier may not have been queued yet,
-                        * in which case w.cb is still NULL. */
-                       if (b->w.cb != NULL)
-                               dec_ap_pending(mdev);
-                       if (b == mdev->newest_tle) {
-                               /* recycle, but reinit! */
-                               D_ASSERT(tmp == NULL);
-                               INIT_LIST_HEAD(&b->requests);
-                               list_splice(&carry_reads, &b->requests);
-                               INIT_LIST_HEAD(&b->w.list);
-                               b->w.cb = NULL;
-                               b->br_number = net_random();
-                               b->n_writes = 0;
-                               *pn = b;
-                               break;
-                       }
-                       *pn = tmp;
-                       kfree(b);
-               }
-               b = tmp;
-               list_splice(&carry_reads, &b->requests);
-       }
-       /* Actions operating on the disk state, also want to work on
-          requests that got barrier acked. */
-       list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
-               req = list_entry(le, struct drbd_request, tl_requests);
-               _req_mod(req, what);
-       }
- }
- /**
-  * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
-  * @mdev:     DRBD device.
-  *
-  * This is called after the connection to the peer was lost. The storage covered
-  * by the requests on the transfer log gets marked as out of sync. Called from the
-  * receiver thread and the worker thread.
-  */
- void tl_clear(struct drbd_conf *mdev)
- {
-       spin_lock_irq(&mdev->req_lock);
-       _tl_clear(mdev);
-       spin_unlock_irq(&mdev->req_lock);
- }
- static void _tl_clear(struct drbd_conf *mdev)
- {
-       struct list_head *le, *tle;
-       struct drbd_request *r;
-       _tl_restart(mdev, connection_lost_while_pending);
-       /* we expect this list to be empty. */
-       D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
-       /* but just in case, clean it up anyways! */
-       list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
-               r = list_entry(le, struct drbd_request, tl_requests);
-               /* It would be nice to complete outside of spinlock.
-                * But this is easier for now. */
-               _req_mod(r, connection_lost_while_pending);
-       }
-       /* ensure bit indicating barrier is required is clear */
-       drbd_clear_flag(mdev, CREATE_BARRIER);
-       memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
- }
- void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
- {
-       spin_lock_irq(&mdev->req_lock);
-       _tl_restart(mdev, what);
-       spin_unlock_irq(&mdev->req_lock);
- }
- /**
-  * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
-  * @mdev:     DRBD device.
-  */
- void tl_abort_disk_io(struct drbd_conf *mdev)
- {
-       struct drbd_tl_epoch *b;
-       struct list_head *le, *tle;
-       struct drbd_request *req;
-       spin_lock_irq(&mdev->req_lock);
-       b = mdev->oldest_tle;
-       while (b) {
-               list_for_each_safe(le, tle, &b->requests) {
-                       req = list_entry(le, struct drbd_request, tl_requests);
-                       if (!(req->rq_state & RQ_LOCAL_PENDING))
-                               continue;
-                       _req_mod(req, abort_disk_io);
-               }
-               b = b->next;
-       }
-       list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
-               req = list_entry(le, struct drbd_request, tl_requests);
-               if (!(req->rq_state & RQ_LOCAL_PENDING))
-                       continue;
-               _req_mod(req, abort_disk_io);
-       }
-       spin_unlock_irq(&mdev->req_lock);
- }
- /**
-  * cl_wide_st_chg() - true if the state change is a cluster wide one
-  * @mdev:     DRBD device.
-  * @os:               old (current) state.
-  * @ns:               new (wanted) state.
-  */
- static int cl_wide_st_chg(struct drbd_conf *mdev,
-                         union drbd_state os, union drbd_state ns)
- {
-       return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
-                ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
-                 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
-                 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
-                 (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
-               (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
-               (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
- }
- enum drbd_state_rv
- drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
-                 union drbd_state mask, union drbd_state val)
- {
-       unsigned long flags;
-       union drbd_state os, ns;
-       enum drbd_state_rv rv;
-       spin_lock_irqsave(&mdev->req_lock, flags);
-       os = mdev->state;
-       ns.i = (os.i & ~mask.i) | val.i;
-       rv = _drbd_set_state(mdev, ns, f, NULL);
-       ns = mdev->state;
-       spin_unlock_irqrestore(&mdev->req_lock, flags);
-       return rv;
- }
- /**
-  * drbd_force_state() - Impose a change which happens outside our control on our state
-  * @mdev:     DRBD device.
-  * @mask:     mask of state bits to change.
-  * @val:      value of new state bits.
-  */
- void drbd_force_state(struct drbd_conf *mdev,
-       union drbd_state mask, union drbd_state val)
- {
-       drbd_change_state(mdev, CS_HARD, mask, val);
- }
- static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
- static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
-                                                   union drbd_state,
-                                                   union drbd_state);
- enum sanitize_state_warnings {
-       NO_WARNING,
-       ABORTED_ONLINE_VERIFY,
-       ABORTED_RESYNC,
-       CONNECTION_LOST_NEGOTIATING,
-       IMPLICITLY_UPGRADED_DISK,
-       IMPLICITLY_UPGRADED_PDSK,
- };
- static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
-                                      union drbd_state ns, enum sanitize_state_warnings *warn);
- int drbd_send_state_req(struct drbd_conf *,
-                       union drbd_state, union drbd_state);
- static enum drbd_state_rv
- _req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
-            union drbd_state val)
- {
-       union drbd_state os, ns;
-       unsigned long flags;
-       enum drbd_state_rv rv;
-       if (drbd_test_and_clear_flag(mdev, CL_ST_CHG_SUCCESS))
-               return SS_CW_SUCCESS;
-       if (drbd_test_and_clear_flag(mdev, CL_ST_CHG_FAIL))
-               return SS_CW_FAILED_BY_PEER;
-       rv = 0;
-       spin_lock_irqsave(&mdev->req_lock, flags);
-       os = mdev->state;
-       ns.i = (os.i & ~mask.i) | val.i;
-       ns = sanitize_state(mdev, os, ns, NULL);
-       if (!cl_wide_st_chg(mdev, os, ns))
-               rv = SS_CW_NO_NEED;
-       if (!rv) {
-               rv = is_valid_state(mdev, ns);
-               if (rv == SS_SUCCESS) {
-                       rv = is_valid_state_transition(mdev, ns, os);
-                       if (rv == SS_SUCCESS)
-                               rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
-               }
-       }
-       spin_unlock_irqrestore(&mdev->req_lock, flags);
-       return rv;
- }
- /**
-  * drbd_req_state() - Perform an eventually cluster wide state change
-  * @mdev:     DRBD device.
-  * @mask:     mask of state bits to change.
-  * @val:      value of new state bits.
-  * @f:                flags
-  *
-  * Should not be called directly, use drbd_request_state() or
-  * _drbd_request_state().
-  */
- static enum drbd_state_rv
- drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
-              union drbd_state val, enum chg_state_flags f)
- {
-       struct completion done;
-       unsigned long flags;
-       union drbd_state os, ns;
-       enum drbd_state_rv rv;
-       init_completion(&done);
-       if (f & CS_SERIALIZE)
-               mutex_lock(&mdev->state_mutex);
-       spin_lock_irqsave(&mdev->req_lock, flags);
-       os = mdev->state;
-       ns.i = (os.i & ~mask.i) | val.i;
-       ns = sanitize_state(mdev, os, ns, NULL);
-       if (cl_wide_st_chg(mdev, os, ns)) {
-               rv = is_valid_state(mdev, ns);
-               if (rv == SS_SUCCESS)
-                       rv = is_valid_state_transition(mdev, ns, os);
-               spin_unlock_irqrestore(&mdev->req_lock, flags);
-               if (rv < SS_SUCCESS) {
-                       if (f & CS_VERBOSE)
-                               print_st_err(mdev, os, ns, rv);
-                       goto abort;
-               }
-               drbd_state_lock(mdev);
-               if (!drbd_send_state_req(mdev, mask, val)) {
-                       drbd_state_unlock(mdev);
-                       rv = SS_CW_FAILED_BY_PEER;
-                       if (f & CS_VERBOSE)
-                               print_st_err(mdev, os, ns, rv);
-                       goto abort;
-               }
-               if (mask.conn == C_MASK && val.conn == C_DISCONNECTING)
-                       drbd_set_flag(mdev, DISCONNECT_SENT);
-               wait_event(mdev->state_wait,
-                       (rv = _req_st_cond(mdev, mask, val)));
-               if (rv < SS_SUCCESS) {
-                       drbd_state_unlock(mdev);
-                       if (f & CS_VERBOSE)
-                               print_st_err(mdev, os, ns, rv);
-                       goto abort;
-               }
-               spin_lock_irqsave(&mdev->req_lock, flags);
-               os = mdev->state;
-               ns.i = (os.i & ~mask.i) | val.i;
-               rv = _drbd_set_state(mdev, ns, f, &done);
-               drbd_state_unlock(mdev);
-       } else {
-               rv = _drbd_set_state(mdev, ns, f, &done);
-       }
-       spin_unlock_irqrestore(&mdev->req_lock, flags);
-       if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
-               D_ASSERT(current != mdev->worker.task);
-               wait_for_completion(&done);
-       }
- abort:
-       if (f & CS_SERIALIZE)
-               mutex_unlock(&mdev->state_mutex);
-       return rv;
- }
- /**
-  * _drbd_request_state() - Request a state change (with flags)
-  * @mdev:     DRBD device.
-  * @mask:     mask of state bits to change.
-  * @val:      value of new state bits.
-  * @f:                flags
-  *
-  * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
-  * flag, or when logging of failed state change requests is not desired.
-  */
- enum drbd_state_rv
- _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
-                   union drbd_state val, enum chg_state_flags f)
- {
-       enum drbd_state_rv rv;
-       wait_event(mdev->state_wait,
-                  (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
-       return rv;
- }
- static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
- {
-       dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
-           name,
-           drbd_conn_str(ns.conn),
-           drbd_role_str(ns.role),
-           drbd_role_str(ns.peer),
-           drbd_disk_str(ns.disk),
-           drbd_disk_str(ns.pdsk),
-           is_susp(ns) ? 's' : 'r',
-           ns.aftr_isp ? 'a' : '-',
-           ns.peer_isp ? 'p' : '-',
-           ns.user_isp ? 'u' : '-'
-           );
- }
- void print_st_err(struct drbd_conf *mdev, union drbd_state os,
-                 union drbd_state ns, enum drbd_state_rv err)
- {
-       if (err == SS_IN_TRANSIENT_STATE)
-               return;
-       dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
-       print_st(mdev, " state", os);
-       print_st(mdev, "wanted", ns);
- }
- /**
-  * is_valid_state() - Returns an SS_ error code if ns is not valid
-  * @mdev:     DRBD device.
-  * @ns:               State to consider.
-  */
- static enum drbd_state_rv
- is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
- {
-       /* See drbd_state_sw_errors in drbd_strings.c */
-       enum drbd_fencing_p fp;
-       enum drbd_state_rv rv = SS_SUCCESS;
-       fp = FP_DONT_CARE;
-       if (get_ldev(mdev)) {
-               fp = mdev->ldev->dc.fencing;
-               put_ldev(mdev);
-       }
-       if (get_net_conf(mdev)) {
-               if (!mdev->net_conf->two_primaries &&
-                   ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
-                       rv = SS_TWO_PRIMARIES;
-               put_net_conf(mdev);
-       }
-       if (rv <= 0)
-               /* already found a reason to abort */;
-       else if (ns.role == R_SECONDARY && mdev->open_cnt)
-               rv = SS_DEVICE_IN_USE;
-       else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
-               rv = SS_NO_UP_TO_DATE_DISK;
-       else if (fp >= FP_RESOURCE &&
-                ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
-               rv = SS_PRIMARY_NOP;
-       else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
-               rv = SS_NO_UP_TO_DATE_DISK;
-       else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
-               rv = SS_NO_LOCAL_DISK;
-       else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
-               rv = SS_NO_REMOTE_DISK;
-       else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
-               rv = SS_NO_UP_TO_DATE_DISK;
-       else if ((ns.conn == C_CONNECTED ||
-                 ns.conn == C_WF_BITMAP_S ||
-                 ns.conn == C_SYNC_SOURCE ||
-                 ns.conn == C_PAUSED_SYNC_S) &&
-                 ns.disk == D_OUTDATED)
-               rv = SS_CONNECTED_OUTDATES;
-       else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
-                (mdev->sync_conf.verify_alg[0] == 0))
-               rv = SS_NO_VERIFY_ALG;
-       else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
-                 mdev->agreed_pro_version < 88)
-               rv = SS_NOT_SUPPORTED;
-       else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
-               rv = SS_CONNECTED_OUTDATES;
-       return rv;
- }
- /**
-  * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
-  * @mdev:     DRBD device.
-  * @ns:               new state.
-  * @os:               old state.
-  */
- static enum drbd_state_rv
- is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
-                         union drbd_state os)
- {
-       enum drbd_state_rv rv = SS_SUCCESS;
-       if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
-           os.conn > C_CONNECTED)
-               rv = SS_RESYNC_RUNNING;
-       if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
-               rv = SS_ALREADY_STANDALONE;
-       if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
-               rv = SS_IS_DISKLESS;
-       if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
-               rv = SS_NO_NET_CONFIG;
-       if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
-               rv = SS_LOWER_THAN_OUTDATED;
-       if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
-               rv = SS_IN_TRANSIENT_STATE;
-       if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
-               rv = SS_IN_TRANSIENT_STATE;
-       /* While establishing a connection only allow cstate to change.
-          Delay/refuse role changes, detach attach etc... */
-       if (drbd_test_flag(mdev, STATE_SENT) &&
-           !(os.conn == C_WF_REPORT_PARAMS ||
-             (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
-               rv = SS_IN_TRANSIENT_STATE;
-       if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
-               rv = SS_NEED_CONNECTION;
-       if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
-           ns.conn != os.conn && os.conn > C_CONNECTED)
-               rv = SS_RESYNC_RUNNING;
-       if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
-           os.conn < C_CONNECTED)
-               rv = SS_NEED_CONNECTION;
-       if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
-           && os.conn < C_WF_REPORT_PARAMS)
-               rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
-       return rv;
- }
- static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
- {
-       static const char *msg_table[] = {
-               [NO_WARNING] = "",
-               [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
-               [ABORTED_RESYNC] = "Resync aborted.",
-               [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
-               [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
-               [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
-       };
-       if (warn != NO_WARNING)
-               dev_warn(DEV, "%s\n", msg_table[warn]);
- }
- /**
-  * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
-  * @mdev:     DRBD device.
-  * @os:               old state.
-  * @ns:               new state.
-  * @warn: where an optional sanitize warning is stored
-  *
-  * When we lose connection, we have to set the state of the peer's disk (pdsk)
-  * to D_UNKNOWN. This rule and many more along those lines are in this function.
-  */
- static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
-                                      union drbd_state ns, enum sanitize_state_warnings *warn)
- {
-       enum drbd_fencing_p fp;
-       enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
-       if (warn)
-               *warn = NO_WARNING;
-       fp = FP_DONT_CARE;
-       if (get_ldev(mdev)) {
-               fp = mdev->ldev->dc.fencing;
-               put_ldev(mdev);
-       }
-       /* Disallow Network errors to configure a device's network part */
-       if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
-           os.conn <= C_DISCONNECTING)
-               ns.conn = os.conn;
-       /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
-        * If you try to go into some Sync* state, that shall fail (elsewhere). */
-       if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
-           ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
-               ns.conn = os.conn;
-       /* we cannot fail (again) if we already detached */
-       if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
-               ns.disk = D_DISKLESS;
-       /* After C_DISCONNECTING only C_STANDALONE may follow */
-       if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
-               ns.conn = os.conn;
-       if (ns.conn < C_CONNECTED) {
-               ns.peer_isp = 0;
-               ns.peer = R_UNKNOWN;
-               if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
-                       ns.pdsk = D_UNKNOWN;
-       }
-       /* Clear the aftr_isp when becoming unconfigured */
-       if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
-               ns.aftr_isp = 0;
-       /* Abort resync if a disk fails/detaches */
-       if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
-           (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
-               if (warn)
-                       *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
-                               ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
-               ns.conn = C_CONNECTED;
-       }
-       /* Connection breaks down before we finished "Negotiating" */
-       if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
-           get_ldev_if_state(mdev, D_NEGOTIATING)) {
-               if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
-                       ns.disk = mdev->new_state_tmp.disk;
-                       ns.pdsk = mdev->new_state_tmp.pdsk;
-               } else {
-                       if (warn)
-                               *warn = CONNECTION_LOST_NEGOTIATING;
-                       ns.disk = D_DISKLESS;
-                       ns.pdsk = D_UNKNOWN;
-               }
-               put_ldev(mdev);
-       }
-       /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
-       if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
-               if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
-                       ns.disk = D_UP_TO_DATE;
-               if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
-                       ns.pdsk = D_UP_TO_DATE;
-       }
-       /* Implications of the connection stat on the disk states */
-       disk_min = D_DISKLESS;
-       disk_max = D_UP_TO_DATE;
-       pdsk_min = D_INCONSISTENT;
-       pdsk_max = D_UNKNOWN;
-       switch ((enum drbd_conns)ns.conn) {
-       case C_WF_BITMAP_T:
-       case C_PAUSED_SYNC_T:
-       case C_STARTING_SYNC_T:
-       case C_WF_SYNC_UUID:
-       case C_BEHIND:
-               disk_min = D_INCONSISTENT;
-               disk_max = D_OUTDATED;
-               pdsk_min = D_UP_TO_DATE;
-               pdsk_max = D_UP_TO_DATE;
-               break;
-       case C_VERIFY_S:
-       case C_VERIFY_T:
-               disk_min = D_UP_TO_DATE;
-               disk_max = D_UP_TO_DATE;
-               pdsk_min = D_UP_TO_DATE;
-               pdsk_max = D_UP_TO_DATE;
-               break;
-       case C_CONNECTED:
-               disk_min = D_DISKLESS;
-               disk_max = D_UP_TO_DATE;
-               pdsk_min = D_DISKLESS;
-               pdsk_max = D_UP_TO_DATE;
-               break;
-       case C_WF_BITMAP_S:
-       case C_PAUSED_SYNC_S:
-       case C_STARTING_SYNC_S:
-       case C_AHEAD:
-               disk_min = D_UP_TO_DATE;
-               disk_max = D_UP_TO_DATE;
-               pdsk_min = D_INCONSISTENT;
-               pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
-               break;
-       case C_SYNC_TARGET:
-               disk_min = D_INCONSISTENT;
-               disk_max = D_INCONSISTENT;
-               pdsk_min = D_UP_TO_DATE;
-               pdsk_max = D_UP_TO_DATE;
-               break;
-       case C_SYNC_SOURCE:
-               disk_min = D_UP_TO_DATE;
-               disk_max = D_UP_TO_DATE;
-               pdsk_min = D_INCONSISTENT;
-               pdsk_max = D_INCONSISTENT;
-               break;
-       case C_STANDALONE:
-       case C_DISCONNECTING:
-       case C_UNCONNECTED:
-       case C_TIMEOUT:
-       case C_BROKEN_PIPE:
-       case C_NETWORK_FAILURE:
-       case C_PROTOCOL_ERROR:
-       case C_TEAR_DOWN:
-       case C_WF_CONNECTION:
-       case C_WF_REPORT_PARAMS:
-       case C_MASK:
-               break;
-       }
-       if (ns.disk > disk_max)
-               ns.disk = disk_max;
-       if (ns.disk < disk_min) {
-               if (warn)
-                       *warn = IMPLICITLY_UPGRADED_DISK;
-               ns.disk = disk_min;
-       }
-       if (ns.pdsk > pdsk_max)
-               ns.pdsk = pdsk_max;
-       if (ns.pdsk < pdsk_min) {
-               if (warn)
-                       *warn = IMPLICITLY_UPGRADED_PDSK;
-               ns.pdsk = pdsk_min;
-       }
-       if (fp == FP_STONITH &&
-           (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
-           !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
-               ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
-       if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
-           (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
-           !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
-               ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data) */
-       if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
-               if (ns.conn == C_SYNC_SOURCE)
-                       ns.conn = C_PAUSED_SYNC_S;
-               if (ns.conn == C_SYNC_TARGET)
-                       ns.conn = C_PAUSED_SYNC_T;
-       } else {
-               if (ns.conn == C_PAUSED_SYNC_S)
-                       ns.conn = C_SYNC_SOURCE;
-               if (ns.conn == C_PAUSED_SYNC_T)
-                       ns.conn = C_SYNC_TARGET;
-       }
-       return ns;
- }
- /* helper for __drbd_set_state */
- static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
- {
-       if (mdev->agreed_pro_version < 90)
-               mdev->ov_start_sector = 0;
-       mdev->rs_total = drbd_bm_bits(mdev);
-       mdev->ov_position = 0;
-       if (cs == C_VERIFY_T) {
-               /* starting online verify from an arbitrary position
-                * does not fit well into the existing protocol.
-                * on C_VERIFY_T, we initialize ov_left and friends
-                * implicitly in receive_DataRequest once the
-                * first P_OV_REQUEST is received */
-               mdev->ov_start_sector = ~(sector_t)0;
-       } else {
-               unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
-               if (bit >= mdev->rs_total) {
-                       mdev->ov_start_sector =
-                               BM_BIT_TO_SECT(mdev->rs_total - 1);
-                       mdev->rs_total = 1;
-               } else
-                       mdev->rs_total -= bit;
-               mdev->ov_position = mdev->ov_start_sector;
-       }
-       mdev->ov_left = mdev->rs_total;
- }
- static void drbd_resume_al(struct drbd_conf *mdev)
- {
-       if (drbd_test_and_clear_flag(mdev, AL_SUSPENDED))
-               dev_info(DEV, "Resumed AL updates\n");
- }
- /**
-  * __drbd_set_state() - Set a new DRBD state
-  * @mdev:     DRBD device.
-  * @ns:               new state.
-  * @flags:    Flags
-  * @done:     Optional completion that will be completed after after_state_ch() has finished
-  *
-  * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
-  */
- enum drbd_state_rv
- __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
-                enum chg_state_flags flags, struct completion *done)
- {
-       union drbd_state os;
-       enum drbd_state_rv rv = SS_SUCCESS;
-       enum sanitize_state_warnings ssw;
-       struct after_state_chg_work *ascw;
-       os = mdev->state;
-       ns = sanitize_state(mdev, os, ns, &ssw);
-       if (ns.i == os.i)
-               return SS_NOTHING_TO_DO;
-       if (!(flags & CS_HARD)) {
-               /*  pre-state-change checks ; only look at ns  */
-               /* See drbd_state_sw_errors in drbd_strings.c */
-               rv = is_valid_state(mdev, ns);
-               if (rv < SS_SUCCESS) {
-                       /* If the old state was illegal as well, then let
-                          this happen...*/
-                       if (is_valid_state(mdev, os) == rv)
-                               rv = is_valid_state_transition(mdev, ns, os);
-               } else
-                       rv = is_valid_state_transition(mdev, ns, os);
-       }
-       if (rv < SS_SUCCESS) {
-               if (flags & CS_VERBOSE)
-                       print_st_err(mdev, os, ns, rv);
-               return rv;
-       }
-       print_sanitize_warnings(mdev, ssw);
-       {
-       char *pbp, pb[300];
-       pbp = pb;
-       *pbp = 0;
-       if (ns.role != os.role)
-               pbp += sprintf(pbp, "role( %s -> %s ) ",
-                              drbd_role_str(os.role),
-                              drbd_role_str(ns.role));
-       if (ns.peer != os.peer)
-               pbp += sprintf(pbp, "peer( %s -> %s ) ",
-                              drbd_role_str(os.peer),
-                              drbd_role_str(ns.peer));
-       if (ns.conn != os.conn)
-               pbp += sprintf(pbp, "conn( %s -> %s ) ",
-                              drbd_conn_str(os.conn),
-                              drbd_conn_str(ns.conn));
-       if (ns.disk != os.disk)
-               pbp += sprintf(pbp, "disk( %s -> %s ) ",
-                              drbd_disk_str(os.disk),
-                              drbd_disk_str(ns.disk));
-       if (ns.pdsk != os.pdsk)
-               pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
-                              drbd_disk_str(os.pdsk),
-                              drbd_disk_str(ns.pdsk));
-       if (is_susp(ns) != is_susp(os))
-               pbp += sprintf(pbp, "susp( %d -> %d ) ",
-                              is_susp(os),
-                              is_susp(ns));
-       if (ns.aftr_isp != os.aftr_isp)
-               pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
-                              os.aftr_isp,
-                              ns.aftr_isp);
-       if (ns.peer_isp != os.peer_isp)
-               pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
-                              os.peer_isp,
-                              ns.peer_isp);
-       if (ns.user_isp != os.user_isp)
-               pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
-                              os.user_isp,
-                              ns.user_isp);
-       dev_info(DEV, "%s\n", pb);
-       }
-       /* solve the race between becoming unconfigured,
-        * worker doing the cleanup, and
-        * admin reconfiguring us:
-        * on (re)configure, first set CONFIG_PENDING,
-        * then wait for a potentially exiting worker,
-        * start the worker, and schedule one no_op.
-        * then proceed with configuration.
-        */
-       if (ns.disk == D_DISKLESS &&
-           ns.conn == C_STANDALONE &&
-           ns.role == R_SECONDARY &&
-           !drbd_test_and_set_flag(mdev, CONFIG_PENDING))
-               drbd_set_flag(mdev, DEVICE_DYING);
-       /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
-        * on the ldev here, to be sure the transition -> D_DISKLESS resp.
-        * drbd_ldev_destroy() won't happen before our corresponding
-        * after_state_ch works run, where we put_ldev again. */
-       if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
-           (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
-               atomic_inc(&mdev->local_cnt);
-       mdev->state = ns;
-       if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
-               drbd_print_uuids(mdev, "attached to UUIDs");
-       wake_up(&mdev->misc_wait);
-       wake_up(&mdev->state_wait);
-       /* Aborted verify run, or we reached the stop sector.
-        * Log the last position, unless end-of-device. */
-       if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
-           ns.conn <= C_CONNECTED) {
-               mdev->ov_start_sector =
-                       BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
-               if (mdev->ov_left)
-                       dev_info(DEV, "Online Verify reached sector %llu\n",
-                               (unsigned long long)mdev->ov_start_sector);
-       }
-       if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
-           (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
-               dev_info(DEV, "Syncer continues.\n");
-               mdev->rs_paused += (long)jiffies
-                                 -(long)mdev->rs_mark_time[mdev->rs_last_mark];
-               if (ns.conn == C_SYNC_TARGET)
-                       mod_timer(&mdev->resync_timer, jiffies);
-       }
-       if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
-           (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
-               dev_info(DEV, "Resync suspended\n");
-               mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
-       }
-       if (os.conn == C_CONNECTED &&
-           (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
-               unsigned long now = jiffies;
-               int i;
-               set_ov_position(mdev, ns.conn);
-               mdev->rs_start = now;
-               mdev->rs_last_events = 0;
-               mdev->rs_last_sect_ev = 0;
-               mdev->ov_last_oos_size = 0;
-               mdev->ov_last_oos_start = 0;
-               for (i = 0; i < DRBD_SYNC_MARKS; i++) {
-                       mdev->rs_mark_left[i] = mdev->ov_left;
-                       mdev->rs_mark_time[i] = now;
-               }
-               drbd_rs_controller_reset(mdev);
-               if (ns.conn == C_VERIFY_S) {
-                       dev_info(DEV, "Starting Online Verify from sector %llu\n",
-                                       (unsigned long long)mdev->ov_position);
-                       mod_timer(&mdev->resync_timer, jiffies);
-               }
-       }
-       if (get_ldev(mdev)) {
-               u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
-                                                MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
-                                                MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
-               if (drbd_test_flag(mdev, CRASHED_PRIMARY))
-                       mdf |= MDF_CRASHED_PRIMARY;
-               if (mdev->state.role == R_PRIMARY ||
-                   (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
-                       mdf |= MDF_PRIMARY_IND;
-               if (mdev->state.conn > C_WF_REPORT_PARAMS)
-                       mdf |= MDF_CONNECTED_IND;
-               if (mdev->state.disk > D_INCONSISTENT)
-                       mdf |= MDF_CONSISTENT;
-               if (mdev->state.disk > D_OUTDATED)
-                       mdf |= MDF_WAS_UP_TO_DATE;
-               if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
-                       mdf |= MDF_PEER_OUT_DATED;
-               if (mdf != mdev->ldev->md.flags) {
-                       mdev->ldev->md.flags = mdf;
-                       drbd_md_mark_dirty(mdev);
-               }
-               if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
-                       drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
-               put_ldev(mdev);
-       }
-       /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
-       if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
-           os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
-               drbd_set_flag(mdev, CONSIDER_RESYNC);
-       /* Receiver should clean up itself */
-       if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
-               drbd_thread_stop_nowait(&mdev->receiver);
-       /* Now the receiver finished cleaning up itself, it should die */
-       if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
-               drbd_thread_stop_nowait(&mdev->receiver);
-       /* Upon network failure, we need to restart the receiver. */
-       if (os.conn > C_WF_CONNECTION &&
-           ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
-               drbd_thread_restart_nowait(&mdev->receiver);
-       /* Resume AL writing if we get a connection */
-       if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
-               drbd_resume_al(mdev);
-       /* remember last connect and attach times so request_timer_fn() won't
-        * kill newly established sessions while we are still trying to thaw
-        * previously frozen IO */
-       if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
-               mdev->last_reconnect_jif = jiffies;
-       if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
-           ns.disk > D_NEGOTIATING)
-               mdev->last_reattach_jif = jiffies;
-       ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
-       if (ascw) {
-               ascw->os = os;
-               ascw->ns = ns;
-               ascw->flags = flags;
-               ascw->w.cb = w_after_state_ch;
-               ascw->done = done;
-               drbd_queue_work(&mdev->data.work, &ascw->w);
-       } else {
-               dev_warn(DEV, "Could not kmalloc an ascw\n");
-       }
-       return rv;
- }
- static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
- {
-       struct after_state_chg_work *ascw =
-               container_of(w, struct after_state_chg_work, w);
-       after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
-       if (ascw->flags & CS_WAIT_COMPLETE) {
-               D_ASSERT(ascw->done != NULL);
-               complete(ascw->done);
-       }
-       kfree(ascw);
-       return 1;
- }
- static void abw_start_sync(struct drbd_conf *mdev, int rv)
- {
-       if (rv) {
-               dev_err(DEV, "Writing the bitmap failed, not starting resync.\n");
-               _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
-               return;
-       }
-       switch (mdev->state.conn) {
-       case C_STARTING_SYNC_T:
-               _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
-               break;
-       case C_STARTING_SYNC_S:
-               drbd_start_resync(mdev, C_SYNC_SOURCE);
-               break;
-       }
- }
- int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
-               int (*io_fn)(struct drbd_conf *),
-               char *why, enum bm_flag flags)
- {
-       int rv;
-       D_ASSERT(current == mdev->worker.task);
-       /* open coded non-blocking drbd_suspend_io(mdev); */
-       drbd_set_flag(mdev, SUSPEND_IO);
+       .open =    drbd_open,
+       .release = drbd_release,
+ };
  
-       drbd_bm_lock(mdev, why, flags);
-       rv = io_fn(mdev);
-       drbd_bm_unlock(mdev);
 -static void bio_destructor_drbd(struct bio *bio)
 -{
 -      bio_free(bio, drbd_md_io_bio_set);
 -}
 -
+ struct bio *bio_alloc_drbd(gfp_t gfp_mask)
+ {
+       struct bio *bio;
  
-       drbd_resume_io(mdev);
+       if (!drbd_md_io_bio_set)
+               return bio_alloc(gfp_mask, 1);
  
-       return rv;
+       bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+       if (!bio)
+               return NULL;
 -      bio->bi_destructor = bio_destructor_drbd;
+       return bio;
  }
  
- /**
-  * after_state_ch() - Perform after state change actions that may sleep
-  * @mdev:     DRBD device.
-  * @os:               old state.
-  * @ns:               new state.
-  * @flags:    Flags
+ #ifdef __CHECKER__
+ /* When checking with sparse, and this is an inline function, sparse will
+    give tons of false positives. When this is a real function, sparse works.
   */
- static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
-                          union drbd_state ns, enum chg_state_flags flags)
+ int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
  {
-       enum drbd_fencing_p fp;
-       enum drbd_req_event what = nothing;
-       union drbd_state nsm = (union drbd_state){ .i = -1 };
-       if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
-               drbd_clear_flag(mdev, CRASHED_PRIMARY);
-               if (mdev->p_uuid)
-                       mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
-       }
+       int io_allowed;
  
-       fp = FP_DONT_CARE;
-       if (get_ldev(mdev)) {
-               fp = mdev->ldev->dc.fencing;
-               put_ldev(mdev);
+       atomic_inc(&mdev->local_cnt);
+       io_allowed = (mdev->state.disk >= mins);
+       if (!io_allowed) {
+               if (atomic_dec_and_test(&mdev->local_cnt))
+                       wake_up(&mdev->misc_wait);
        }
+       return io_allowed;
+ }
  
-       /* Inform userspace about the change... */
-       drbd_bcast_state(mdev, ns);
-       if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
-           (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
-               drbd_khelper(mdev, "pri-on-incon-degr");
-       /* Here we have the actions that are performed after a
-          state change. This function might sleep */
-       if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
-               mod_timer(&mdev->request_timer, jiffies + HZ);
-       nsm.i = -1;
-       if (ns.susp_nod) {
-               if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
-                       what = resend;
-               if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
-                   ns.disk > D_NEGOTIATING)
-                       what = restart_frozen_disk_io;
-               if (what != nothing)
-                       nsm.susp_nod = 0;
-       }
+ #endif
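
For orientation, a minimal usage sketch of the reference counting this helper implements; get_ldev_if_state()/put_ldev() are the wrappers used throughout this series, the body is illustrative only:

	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
		/* local_cnt is elevated and the disk state was at least
		 * D_NEGOTIATING, so mdev->ldev can be dereferenced safely */
		u64 cur = mdev->ldev->md.uuid[UI_CURRENT];
		(void)cur;
		put_ldev(mdev);	/* drops local_cnt; the last drop wakes misc_wait */
	}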
  
-       if (ns.susp_fen) {
-               /* case1: The outdate peer handler is successful: */
-               if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
-                       if (drbd_test_flag(mdev, NEW_CUR_UUID)) {
-                               drbd_uuid_new_current(mdev);
-                               drbd_clear_flag(mdev, NEW_CUR_UUID);
-                       }
-                       spin_lock_irq(&mdev->req_lock);
-                       _tl_clear(mdev);
-                       _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
-                       spin_unlock_irq(&mdev->req_lock);
-               }
-               /* case2: The connection was established again: */
-               if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
-                       drbd_clear_flag(mdev, NEW_CUR_UUID);
-                       what = resend;
-                       nsm.susp_fen = 0;
+ /**
+  * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch
+  * @tconn:    DRBD connection.
+  * @barrier_nr:       Expected identifier of the DRBD write barrier packet.
+  * @set_size: Expected number of requests before that barrier.
+  *
+  * In case the passed barrier_nr or set_size does not match the oldest
+  * epoch of not yet barrier-acked requests, this function will cause a
+  * termination of the connection.
+  */
+ void tl_release(struct drbd_tconn *tconn, unsigned int barrier_nr,
+               unsigned int set_size)
+ {
+       struct drbd_request *r;
+       struct drbd_request *req = NULL;
+       int expect_epoch = 0;
+       int expect_size = 0;
+       spin_lock_irq(&tconn->req_lock);
 -      /* find latest not yet barrier-acked write request,
++      /* find oldest not yet barrier-acked write request,
+        * count writes in its epoch. */
+       list_for_each_entry(r, &tconn->transfer_log, tl_requests) {
+               const unsigned s = r->rq_state;
+               if (!req) {
+                       if (!(s & RQ_WRITE))
+                               continue;
+                       if (!(s & RQ_NET_MASK))
+                               continue;
+                       if (s & RQ_NET_DONE)
+                               continue;
+                       req = r;
+                       expect_epoch = req->epoch;
+                       expect_size++;
+               } else {
+                       if (r->epoch != expect_epoch)
+                               break;
+                       if (!(s & RQ_WRITE))
+                               continue;
+                       /* if (s & RQ_DONE): not expected */
+                       /* if (!(s & RQ_NET_MASK)): not expected */
+                       expect_size++;
                }
        }
  
-       if (what != nothing) {
-               spin_lock_irq(&mdev->req_lock);
-               _tl_restart(mdev, what);
-               nsm.i &= mdev->state.i;
-               _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
-               spin_unlock_irq(&mdev->req_lock);
+       /* first some paranoia code */
+       if (req == NULL) {
+               conn_err(tconn, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
+                        barrier_nr);
+               goto bail;
        }
-       /* Became sync source.  With protocol >= 96, we still need to send out
-        * the sync uuid now. Need to do that before any drbd_send_state, or
-        * the other side may go "paused sync" before receiving the sync uuids,
-        * which is unexpected. */
-       if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
-           (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
-           mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
-               drbd_gen_and_send_sync_uuid(mdev);
-               put_ldev(mdev);
+       if (expect_epoch != barrier_nr) {
+               conn_err(tconn, "BAD! BarrierAck #%u received, expected #%u!\n",
+                        barrier_nr, expect_epoch);
+               goto bail;
        }
  
-       /* Do not change the order of the if above and the two below... */
-       if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
-               /* we probably will start a resync soon.
-                * make sure those things are properly reset. */
-               mdev->rs_total = 0;
-               mdev->rs_failed = 0;
-               atomic_set(&mdev->rs_pending_cnt, 0);
-               drbd_rs_cancel_all(mdev);
-               drbd_send_uuids(mdev);
-               drbd_send_state(mdev, ns);
-       }
-       /* No point in queuing send_bitmap if we don't have a connection
-        * anymore, so check also the _current_ state, not only the new state
-        * at the time this work was queued. */
-       if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
-           mdev->state.conn == C_WF_BITMAP_S)
-               drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
-                               "send_bitmap (WFBitMapS)",
-                               BM_LOCKED_TEST_ALLOWED);
-       /* Lost contact to peer's copy of the data */
-       if ((os.pdsk >= D_INCONSISTENT &&
-            os.pdsk != D_UNKNOWN &&
-            os.pdsk != D_OUTDATED)
-       &&  (ns.pdsk < D_INCONSISTENT ||
-            ns.pdsk == D_UNKNOWN ||
-            ns.pdsk == D_OUTDATED)) {
-               if (get_ldev(mdev)) {
-                       if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
-                           mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
-                               if (is_susp(mdev->state)) {
-                                       drbd_set_flag(mdev, NEW_CUR_UUID);
-                               } else {
-                                       drbd_uuid_new_current(mdev);
-                                       drbd_send_uuids(mdev);
-                               }
-                       }
-                       put_ldev(mdev);
-               }
+       if (expect_size != set_size) {
+               conn_err(tconn, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
+                        barrier_nr, set_size, expect_size);
+               goto bail;
        }
  
-       if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
-               if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
-                   mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
-                       drbd_uuid_new_current(mdev);
-                       drbd_send_uuids(mdev);
-               }
-               /* D_DISKLESS Peer becomes secondary */
-               if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
-                       /* We may still be Primary ourselves.
-                        * No harm done if the bitmap still changes,
-                        * redirtied pages will follow later. */
-                       drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
-                               "demote diskless peer", BM_LOCKED_SET_ALLOWED);
-               put_ldev(mdev);
 -      /* Clean up list of requests processed during current epoch */
 -      list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) {
++      /* Clean up list of requests processed during current epoch. */
++      /* this extra list walk restart is paranoia,
++       * to catch requests being barrier-acked "unexpectedly".
++       * It usually should find the same req again, or some READ preceding it. */
++      list_for_each_entry(req, &tconn->transfer_log, tl_requests)
++              if (req->epoch == expect_epoch)
++                      break;
++      list_for_each_entry_safe_from(req, r, &tconn->transfer_log, tl_requests) {
+               if (req->epoch != expect_epoch)
+                       break;
+               _req_mod(req, BARRIER_ACKED);
        }
+       spin_unlock_irq(&tconn->req_lock);
  
-       /* Write out all changed bits on demote.
-        * Though, no need to do that just yet
-        * if there is a resync going on still */
-       if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
-               mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
-               /* No changes to the bitmap expected this time, so assert that,
-                * even though no harm was done if it did change. */
-               drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
-                               "demote", BM_LOCKED_TEST_ALLOWED);
-               put_ldev(mdev);
-       }
+       return;
  
-       /* Last part of the attaching process ... */
-       if (ns.conn >= C_CONNECTED &&
-           os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
-               drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
-               drbd_send_uuids(mdev);
-               drbd_send_state(mdev, ns);
-       }
+ bail:
+       spin_unlock_irq(&tconn->req_lock);
+       conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
+ }
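
For context, tl_release() is driven by the asender when a P_BARRIER_ACK arrives; a hedged sketch of that call site (struct p_barrier_ack field names assumed, not shown in this hunk):

	static int got_BarrierAck_sketch(struct drbd_tconn *tconn, struct p_barrier_ack *p)
	{
		/* barrier number and write count as reported by the peer; any
		 * mismatch with the oldest unacked epoch takes the bail: path
		 * above and forces the connection into C_PROTOCOL_ERROR */
		tl_release(tconn, p->barrier, be32_to_cpu(p->set_size));
		return 0;
	}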
  
-       /* We want to pause/continue resync, tell peer. */
-       if (ns.conn >= C_CONNECTED &&
-            ((os.aftr_isp != ns.aftr_isp) ||
-             (os.user_isp != ns.user_isp)))
-               drbd_send_state(mdev, ns);
-       /* In case one of the isp bits got set, suspend other devices. */
-       if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
-           (ns.aftr_isp || ns.peer_isp || ns.user_isp))
-               suspend_other_sg(mdev);
-       /* Make sure the peer gets informed about eventual state
-          changes (ISP bits) while we were in WFReportParams. */
-       if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
-               drbd_send_state(mdev, ns);
-       if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
-               drbd_send_state(mdev, ns);
-       /* We are in the progress to start a full sync... */
-       if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
-           (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
-               /* no other bitmap changes expected during this phase */
-               drbd_queue_bitmap_io(mdev,
-                       &drbd_bmio_set_n_write, &abw_start_sync,
-                       "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
-       /* We are invalidating our self... */
-       if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
-           os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
-               /* other bitmap operation expected during this phase */
-               drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
-                       "set_n_write from invalidate", BM_LOCKED_MASK);
-       /* first half of local IO error, failure to attach,
-        * or administrative detach */
-       if (os.disk != D_FAILED && ns.disk == D_FAILED) {
-               /* corresponding get_ldev was in __drbd_set_state, to serialize
-                * our cleanup here with the transition to D_DISKLESS.
-                * But it is still not safe to dereference ldev here, we may end
-                * up here from a failed attach, before ldev was even set.  */
-               if (mdev->ldev) {
-                       enum drbd_io_error_p eh = mdev->ldev->dc.on_io_error;
-                       /* In some setups, this handler triggers a suicide,
-                        * basically mapping IO error to node failure, to
-                        * reduce the number of different failure scenarios.
-                        *
-                        * This handler intentionally runs before we abort IO,
-                        * notify the peer, or try to update our meta data. */
-                       if (eh == EP_CALL_HELPER && drbd_test_flag(mdev, WAS_IO_ERROR))
-                               drbd_khelper(mdev, "local-io-error");
-                       /* Immediately allow completion of all application IO,
-                        * that waits for completion from the local disk,
-                        * if this was a force-detach due to disk_timeout
-                        * or administrator request (drbdsetup detach --force).
-                        * Do NOT abort otherwise.
-                        * Aborting local requests may cause serious problems,
-                        * if requests are completed to upper layers already,
-                        * and then later the already submitted local bio completes.
-                        * This can cause DMA into former bio pages that meanwhile
-                        * have been re-used for other things.
-                        * So aborting local requests may cause crashes,
-                        * or even worse, silent data corruption.
-                        */
-                       if (drbd_test_flag(mdev, FORCE_DETACH))
-                               tl_abort_disk_io(mdev);
-                       /* current state still has to be D_FAILED,
-                        * there is only one way out: to D_DISKLESS,
-                        * and that may only happen after our put_ldev below. */
-                       if (mdev->state.disk != D_FAILED)
-                               dev_err(DEV,
-                                       "ASSERT FAILED: disk is %s during detach\n",
-                                       drbd_disk_str(mdev->state.disk));
-                       if (ns.conn >= C_CONNECTED)
-                               drbd_send_state(mdev, ns);
-                       drbd_rs_cancel_all(mdev);
-                       /* In case we want to get something to stable storage still,
-                        * this may be the last chance.
-                        * Following put_ldev may transition to D_DISKLESS. */
-                       drbd_md_sync(mdev);
-               }
-               put_ldev(mdev);
-       }
  
-         /* second half of local IO error, failure to attach,
-          * or administrative detach,
-          * after local_cnt references have reached zero again */
-         if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
-                 /* We must still be diskless,
-                  * re-attach has to be serialized with this! */
-                 if (mdev->state.disk != D_DISKLESS)
-                         dev_err(DEV,
-                                 "ASSERT FAILED: disk is %s while going diskless\n",
-                                 drbd_disk_str(mdev->state.disk));
-               if (ns.conn >= C_CONNECTED)
-                       drbd_send_state(mdev, ns);
-               /* corresponding get_ldev in __drbd_set_state
-                * this may finally trigger drbd_ldev_destroy. */
-               put_ldev(mdev);
-       }
+ /**
+  * _tl_restart() - Walks the transfer log and applies an action to all requests
+  * @tconn:    DRBD connection.
+  * @what:       The action/event to perform with all request objects
+  *
+  * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
+  * RESTART_FROZEN_DISK_IO.
+  */
+ /* must hold resource->req_lock */
+ void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
+ {
+       struct drbd_request *req, *r;
  
-       /* Notify peer that I had a local IO error, and did not detach. */
-       if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
-               drbd_send_state(mdev, ns);
+       list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests)
+               _req_mod(req, what);
+ }
  
-       /* Disks got bigger while they were detached */
-       if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
-           drbd_test_and_clear_flag(mdev, RESYNC_AFTER_NEG)) {
-               if (ns.conn == C_CONNECTED)
-                       resync_after_online_grow(mdev);
      }
+ void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
+ {
+       spin_lock_irq(&tconn->req_lock);
+       _tl_restart(tconn, what);
+       spin_unlock_irq(&tconn->req_lock);
+ }
  
-       /* A resync finished or aborted, wake paused devices... */
-       if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
-           (os.peer_isp && !ns.peer_isp) ||
-           (os.user_isp && !ns.user_isp))
-               resume_next_sg(mdev);
-       /* sync target done with resync.  Explicitly notify peer, even though
-        * it should (at least for non-empty resyncs) already know itself. */
-       if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
-               drbd_send_state(mdev, ns);
-       /* Verify finished, or reached stop sector.  Peer did not know about
-        * the stop sector, and we may even have changed the stop sector during
-        * verify to interrupt/stop early.  Send the new state. */
-       if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
-       && mdev->agreed_pro_version >= 97)
-               drbd_send_state(mdev, ns);
-       /* Wake up role changes, that were delayed because of connection establishing */
-       if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
-               drbd_clear_flag(mdev, STATE_SENT);
-               wake_up(&mdev->state_wait);
-       }
+ /**
+  * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
+  * @tconn:    DRBD connection.
+  *
+  * This is called after the connection to the peer was lost. The storage covered
+  * by the requests on the transfer log gets marked as out of sync. Called from the
+  * receiver thread and the worker thread.
+  */
+ void tl_clear(struct drbd_tconn *tconn)
+ {
+       tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING);
+ }
  
-       /* This triggers bitmap writeout of potentially still unwritten pages
-        * if the resync finished cleanly, or aborted because of peer disk
-        * failure, or because of connection loss.
-        * For resync aborted because of local disk failure, we cannot do
-        * any bitmap writeout anymore.
-        * No harm done if some bits change during this phase.
-        */
-       if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
-               drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
-                       "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
-               put_ldev(mdev);
-       }
+ /**
+  * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
+  * @mdev:     DRBD device.
+  */
+ void tl_abort_disk_io(struct drbd_conf *mdev)
+ {
+       struct drbd_tconn *tconn = mdev->tconn;
+       struct drbd_request *req, *r;
  
-       /* free tl_hash if we Got thawed and are C_STANDALONE */
-       if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
-               drbd_free_tl_hash(mdev);
-       /* Upon network connection, we need to start the receiver */
-       if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
-               drbd_thread_start(&mdev->receiver);
-       /* Terminate worker thread if we are unconfigured - it will be
-          restarted as needed... */
-       if (ns.disk == D_DISKLESS &&
-           ns.conn == C_STANDALONE &&
-           ns.role == R_SECONDARY) {
-               if (os.aftr_isp != ns.aftr_isp)
-                       resume_next_sg(mdev);
-               /* set in __drbd_set_state, unless CONFIG_PENDING was set */
-               if (drbd_test_flag(mdev, DEVICE_DYING))
-                       drbd_thread_stop_nowait(&mdev->worker);
+       spin_lock_irq(&tconn->req_lock);
+       list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) {
+               if (!(req->rq_state & RQ_LOCAL_PENDING))
+                       continue;
+               if (req->w.mdev != mdev)
+                       continue;
+               _req_mod(req, ABORT_DISK_IO);
        }
-       drbd_md_sync(mdev);
+       spin_unlock_irq(&tconn->req_lock);
  }
  
  static int drbd_thread_setup(void *arg)
  {
        struct drbd_thread *thi = (struct drbd_thread *) arg;
-       struct drbd_conf *mdev = thi->mdev;
+       struct drbd_tconn *tconn = thi->tconn;
        unsigned long flags;
        int retval;
  
+       snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s",
+                thi->name[0], thi->tconn->name);
  restart:
        retval = thi->function(thi);
  
        spin_lock_irqsave(&thi->t_lock, flags);
  
-       /* if the receiver has been "Exiting", the last thing it did
+       /* if the receiver has been "EXITING", the last thing it did
         * was set the conn state to "StandAlone",
         * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
         * and receiver thread will be "started".
-        * drbd_thread_start needs to set "Restarting" in that case.
+        * drbd_thread_start needs to set "RESTARTING" in that case.
         * t_state check and assignment needs to be within the same spinlock,
-        * so either thread_start sees Exiting, and can remap to Restarting,
-        * or thread_start see None, and can proceed as normal.
+        * so either thread_start sees EXITING, and can remap to RESTARTING,
+        * or thread_start sees NONE, and can proceed as normal.
         */
  
-       if (thi->t_state == Restarting) {
-               dev_info(DEV, "Restarting %s\n", current->comm);
-               thi->t_state = Running;
+       if (thi->t_state == RESTARTING) {
+               conn_info(tconn, "Restarting %s thread\n", thi->name);
+               thi->t_state = RUNNING;
                spin_unlock_irqrestore(&thi->t_lock, flags);
                goto restart;
        }
  
        thi->task = NULL;
-       thi->t_state = None;
+       thi->t_state = NONE;
        smp_mb();
-       complete(&thi->stop);
+       complete_all(&thi->stop);
        spin_unlock_irqrestore(&thi->t_lock, flags);
  
-       dev_info(DEV, "Terminating %s\n", current->comm);
+       conn_info(tconn, "Terminating %s\n", current->comm);
  
        /* Release mod reference taken when thread was started */
+       kref_put(&tconn->kref, &conn_destroy);
        module_put(THIS_MODULE);
        return retval;
  }
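
The EXITING/RESTARTING handshake spelled out (a summary of the locking comment above, nothing new):

	state seen under t_lock            effect
	---------------------------------  ---------------------------------------
	thread exits, t_state RESTARTING   jump to restart:, re-run thi->function()
	thread exits, anything else        task = NULL, t_state = NONE,
	                                   complete_all(&thi->stop)
	drbd_thread_start() sees EXITING   flips t_state to RESTARTING instead of
	                                   spawning a new task (see below)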
  
- static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
-                     int (*func) (struct drbd_thread *))
+ static void drbd_thread_init(struct drbd_tconn *tconn, struct drbd_thread *thi,
+                            int (*func) (struct drbd_thread *), char *name)
  {
        spin_lock_init(&thi->t_lock);
        thi->task    = NULL;
-       thi->t_state = None;
+       thi->t_state = NONE;
        thi->function = func;
-       thi->mdev = mdev;
+       thi->tconn = tconn;
+       strncpy(thi->name, name, ARRAY_SIZE(thi->name));
  }
  
  int drbd_thread_start(struct drbd_thread *thi)
  {
-       struct drbd_conf *mdev = thi->mdev;
+       struct drbd_tconn *tconn = thi->tconn;
        struct task_struct *nt;
        unsigned long flags;
  
-       const char *me =
-               thi == &mdev->receiver ? "receiver" :
-               thi == &mdev->asender  ? "asender"  :
-               thi == &mdev->worker   ? "worker"   : "NONSENSE";
        /* is used from state engine doing drbd_thread_stop_nowait,
         * while holding the req lock irqsave */
        spin_lock_irqsave(&thi->t_lock, flags);
  
        switch (thi->t_state) {
-       case None:
-               dev_info(DEV, "Starting %s thread (from %s [%d])\n",
-                               me, current->comm, current->pid);
+       case NONE:
+               conn_info(tconn, "Starting %s thread (from %s [%d])\n",
+                        thi->name, current->comm, current->pid);
  
                /* Get ref on module for thread - this is released when thread exits */
                if (!try_module_get(THIS_MODULE)) {
-                       dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
+                       conn_err(tconn, "Failed to get module reference in drbd_thread_start\n");
                        spin_unlock_irqrestore(&thi->t_lock, flags);
                        return false;
                }
  
+               kref_get(&thi->tconn->kref);
                init_completion(&thi->stop);
-               D_ASSERT(thi->task == NULL);
                thi->reset_cpu_mask = 1;
-               thi->t_state = Running;
+               thi->t_state = RUNNING;
                spin_unlock_irqrestore(&thi->t_lock, flags);
                flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
  
                nt = kthread_create(drbd_thread_setup, (void *) thi,
-                                   "drbd%d_%s", mdev_to_minor(mdev), me);
+                                   "drbd_%c_%s", thi->name[0], thi->tconn->name);
  
                if (IS_ERR(nt)) {
-                       dev_err(DEV, "Couldn't start thread\n");
+                       conn_err(tconn, "Couldn't start thread\n");
  
+                       kref_put(&tconn->kref, &conn_destroy);
                        module_put(THIS_MODULE);
                        return false;
                }
                spin_lock_irqsave(&thi->t_lock, flags);
                thi->task = nt;
-               thi->t_state = Running;
+               thi->t_state = RUNNING;
                spin_unlock_irqrestore(&thi->t_lock, flags);
                wake_up_process(nt);
                break;
-       case Exiting:
-               thi->t_state = Restarting;
-               dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
-                               me, current->comm, current->pid);
+       case EXITING:
+               thi->t_state = RESTARTING;
+               conn_info(tconn, "Restarting %s thread (from %s [%d])\n",
+                               thi->name, current->comm, current->pid);
                /* fall through */
-       case Running:
-       case Restarting:
+       case RUNNING:
+       case RESTARTING:
        default:
                spin_unlock_irqrestore(&thi->t_lock, flags);
                break;
@@@ -1882,12 -447,12 +447,12 @@@ void _drbd_thread_stop(struct drbd_thre
  {
        unsigned long flags;
  
-       enum drbd_thread_state ns = restart ? Restarting : Exiting;
+       enum drbd_thread_state ns = restart ? RESTARTING : EXITING;
  
        /* may be called from state engine, holding the req lock irqsave */
        spin_lock_irqsave(&thi->t_lock, flags);
  
-       if (thi->t_state == None) {
+       if (thi->t_state == NONE) {
                spin_unlock_irqrestore(&thi->t_lock, flags);
                if (restart)
                        drbd_thread_start(thi);
                init_completion(&thi->stop);
                if (thi->task != current)
                        force_sig(DRBD_SIGKILL, thi->task);
        }
  
        spin_unlock_irqrestore(&thi->t_lock, flags);
        if (wait)
                wait_for_completion(&thi->stop);
  }
  
+ static struct drbd_thread *drbd_task_to_thread(struct drbd_tconn *tconn, struct task_struct *task)
+ {
+       struct drbd_thread *thi =
+               task == tconn->receiver.task ? &tconn->receiver :
+               task == tconn->asender.task  ? &tconn->asender :
+               task == tconn->worker.task   ? &tconn->worker : NULL;
+       return thi;
+ }
+ char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task)
+ {
+       struct drbd_thread *thi = drbd_task_to_thread(tconn, task);
+       return thi ? thi->name : task->comm;
+ }
+ int conn_lowest_minor(struct drbd_tconn *tconn)
+ {
+       struct drbd_conf *mdev;
+       int vnr = 0, m;
+       rcu_read_lock();
+       mdev = idr_get_next(&tconn->volumes, &vnr);
+       m = mdev ? mdev_to_minor(mdev) : -1;
+       rcu_read_unlock();
+       return m;
+ }
  #ifdef CONFIG_SMP
  /**
   * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
   * Forces all threads of a device onto the same CPU. This is beneficial for
   * DRBD's performance. May be overwritten by user's configuration.
   */
- void drbd_calc_cpu_mask(struct drbd_conf *mdev)
+ void drbd_calc_cpu_mask(struct drbd_tconn *tconn)
  {
        int ord, cpu;
  
        /* user override. */
-       if (cpumask_weight(mdev->cpu_mask))
+       if (cpumask_weight(tconn->cpu_mask))
                return;
  
-       ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
+       ord = conn_lowest_minor(tconn) % cpumask_weight(cpu_online_mask);
        for_each_online_cpu(cpu) {
                if (ord-- == 0) {
-                       cpumask_set_cpu(cpu, mdev->cpu_mask);
+                       cpumask_set_cpu(cpu, tconn->cpu_mask);
                        return;
                }
        }
        /* should not be reached */
-       cpumask_setall(mdev->cpu_mask);
+       cpumask_setall(tconn->cpu_mask);
  }
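
Worked example of the spreading rule: with four CPUs online, a connection whose lowest minor is 6 gets ord = 6 % 4 = 2, so the loop pins it to the third online CPU; each of its threads then moves there via drbd_thread_current_set_cpu() below.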
  
  /**
   * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
   * @mdev:     DRBD device.
+  * @thi:      drbd_thread object
   *
   * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
   * prematurely.
   */
- void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
+ void drbd_thread_current_set_cpu(struct drbd_thread *thi)
  {
        struct task_struct *p = current;
-       struct drbd_thread *thi =
-               p == mdev->asender.task  ? &mdev->asender  :
-               p == mdev->receiver.task ? &mdev->receiver :
-               p == mdev->worker.task   ? &mdev->worker   :
-               NULL;
-       ERR_IF(thi == NULL)
-               return;
        if (!thi->reset_cpu_mask)
                return;
        thi->reset_cpu_mask = 0;
-       set_cpus_allowed_ptr(p, mdev->cpu_mask);
+       set_cpus_allowed_ptr(p, thi->tconn->cpu_mask);
  }
  #endif
  
- /* the appropriate socket mutex must be held already */
- int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
-                         enum drbd_packets cmd, struct p_header80 *h,
-                         size_t size, unsigned msg_flags)
+ /**
+  * drbd_header_size  -  size of a packet header
+  *
+  * The header size is a multiple of 8, so any payload following the header is
+  * word aligned on 64-bit architectures.  (The bitmap send and receive code
+  * relies on this.)
+  */
+ unsigned int drbd_header_size(struct drbd_tconn *tconn)
  {
-       int sent, ok;
+       if (tconn->agreed_pro_version >= 100) {
+               BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8));
+               return sizeof(struct p_header100);
+       } else {
+               BUILD_BUG_ON(sizeof(struct p_header80) !=
+                            sizeof(struct p_header95));
+               BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8));
+               return sizeof(struct p_header80);
+       }
+ }
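
A sketch of what this buys, mirroring __conn_prepare_command() further down (the assertion is for illustration only):

	void *payload = sock->sbuf + drbd_header_size(tconn);
	/* both header sizes are multiples of 8 (see the BUILD_BUG_ONs above),
	 * so the payload is 8-byte aligned whenever sock->sbuf is */
	BUG_ON(((unsigned long)payload - (unsigned long)sock->sbuf) % 8);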
  
-       ERR_IF(!h) return false;
-       ERR_IF(!size) return false;
+ static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size)
+ {
+       h->magic   = cpu_to_be32(DRBD_MAGIC);
+       h->command = cpu_to_be16(cmd);
+       h->length  = cpu_to_be16(size);
+       return sizeof(struct p_header80);
+ }
  
-       h->magic   = BE_DRBD_MAGIC;
+ static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size)
+ {
+       h->magic   = cpu_to_be16(DRBD_MAGIC_BIG);
        h->command = cpu_to_be16(cmd);
-       h->length  = cpu_to_be16(size-sizeof(struct p_header80));
+       h->length = cpu_to_be32(size);
+       return sizeof(struct p_header95);
+ }
  
-       sent = drbd_send(mdev, sock, h, size, msg_flags);
+ static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd,
+                                     int size, int vnr)
+ {
+       h->magic = cpu_to_be32(DRBD_MAGIC_100);
+       h->volume = cpu_to_be16(vnr);
+       h->command = cpu_to_be16(cmd);
+       h->length = cpu_to_be32(size);
+       h->pad = 0;
+       return sizeof(struct p_header100);
+ }
  
-       ok = (sent == size);
-       if (!ok && !signal_pending(current))
-               dev_warn(DEV, "short sent %s size=%d sent=%d\n",
-                   cmdname(cmd), (int)size, sent);
-       return ok;
+ static unsigned int prepare_header(struct drbd_tconn *tconn, int vnr,
+                                  void *buffer, enum drbd_packet cmd, int size)
+ {
+       if (tconn->agreed_pro_version >= 100)
+               return prepare_header100(buffer, cmd, size, vnr);
+       else if (tconn->agreed_pro_version >= 95 &&
+                size > DRBD_MAX_SIZE_H80_PACKET)
+               return prepare_header95(buffer, cmd, size);
+       else
+               return prepare_header80(buffer, cmd, size);
  }
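
In other words, the header format is chosen per packet (a restatement of the dispatch above):

	agreed_pro_version >= 100                  -> p_header100 (carries the volume number, 32-bit length)
	95..99 and size > DRBD_MAX_SIZE_H80_PACKET -> p_header95  (32-bit length)
	everything else                            -> p_header80  (16-bit length)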
  
- /* don't pass the socket. we may only look at it
-  * when we hold the appropriate socket mutex.
-  */
- int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
-                 enum drbd_packets cmd, struct p_header80 *h, size_t size)
+ static void *__conn_prepare_command(struct drbd_tconn *tconn,
+                                   struct drbd_socket *sock)
  {
-       int ok = 0;
-       struct socket *sock;
+       if (!sock->socket)
+               return NULL;
+       return sock->sbuf + drbd_header_size(tconn);
+ }
  
-       if (use_data_socket) {
-               mutex_lock(&mdev->data.mutex);
-               sock = mdev->data.socket;
-       } else {
-               mutex_lock(&mdev->meta.mutex);
-               sock = mdev->meta.socket;
-       }
+ void *conn_prepare_command(struct drbd_tconn *tconn, struct drbd_socket *sock)
+ {
+       void *p;
  
-       /* drbd_disconnect() could have called drbd_free_sock()
-        * while we were waiting in down()... */
-       if (likely(sock != NULL))
-               ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
+       mutex_lock(&sock->mutex);
+       p = __conn_prepare_command(tconn, sock);
+       if (!p)
+               mutex_unlock(&sock->mutex);
  
-       if (use_data_socket)
-               mutex_unlock(&mdev->data.mutex);
-       else
-               mutex_unlock(&mdev->meta.mutex);
-       return ok;
+       return p;
  }
  
- int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
-                  size_t size)
+ void *drbd_prepare_command(struct drbd_conf *mdev, struct drbd_socket *sock)
  {
-       struct p_header80 h;
-       int ok;
+       return conn_prepare_command(mdev->tconn, sock);
+ }
  
-       h.magic   = BE_DRBD_MAGIC;
-       h.command = cpu_to_be16(cmd);
-       h.length  = cpu_to_be16(size);
+ static int __send_command(struct drbd_tconn *tconn, int vnr,
+                         struct drbd_socket *sock, enum drbd_packet cmd,
+                         unsigned int header_size, void *data,
+                         unsigned int size)
+ {
+       int msg_flags;
+       int err;
  
-       if (!drbd_get_data_sock(mdev))
-               return 0;
+       /*
+        * Called with @data == NULL and the size of the data blocks in @size
+        * for commands that send data blocks.  For those commands, omit the
+        * MSG_MORE flag: this will increase the likelihood that data blocks
+        * which are page aligned on the sender will end up page aligned on the
+        * receiver.
+        */
+       msg_flags = data ? MSG_MORE : 0;
+       header_size += prepare_header(tconn, vnr, sock->sbuf, cmd,
+                                     header_size + size);
+       err = drbd_send_all(tconn, sock->socket, sock->sbuf, header_size,
+                           msg_flags);
+       if (data && !err)
+               err = drbd_send_all(tconn, sock->socket, data, size, 0);
+       return err;
+ }
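
Two hedged call patterns for __send_command(); apart from the function itself and P_PING, the names are illustrative:

	/* header-only packet, or payload already serialized into sock->sbuf:
	 * data == NULL, so the single send goes out without MSG_MORE */
	err = __send_command(tconn, vnr, sock, P_PING, 0, NULL, 0);

	/* trailing data block passed in: the header goes out with MSG_MORE and
	 * a length covering hdr_extra + blk_len, then the block itself follows */
	err = __send_command(tconn, vnr, sock, cmd, hdr_extra, blk, blk_len);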
+ static int __conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock,
+                              enum drbd_packet cmd, unsigned int header_size,
+                              void *data, unsigned int size)
+ {
+       return __send_command(tconn, 0, sock, cmd, header_size, data, size);
+ }
+ int conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock,
+                     enum drbd_packet cmd, unsigned int header_size,
+                     void *data, unsigned int size)
+ {
+       int err;
  
-       ok = (sizeof(h) ==
-               drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
-       ok = ok && (size ==
-               drbd_send(mdev, mdev->data.socket, data, size, 0));
+       err = __conn_send_command(tconn, sock, cmd, header_size, data, size);
+       mutex_unlock(&sock->mutex);
+       return err;
+ }
+ int drbd_send_command(struct drbd_conf *mdev, struct drbd_socket *sock,
+                     enum drbd_packet cmd, unsigned int header_size,
+                     void *data, unsigned int size)
+ {
+       int err;
+       err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, header_size,
+                            data, size);
+       mutex_unlock(&sock->mutex);
+       return err;
+ }
  
-       drbd_put_data_sock(mdev);
+ int drbd_send_ping(struct drbd_tconn *tconn)
+ {
+       struct drbd_socket *sock;
+       sock = &tconn->meta;
+       if (!conn_prepare_command(tconn, sock))
+               return -EIO;
+       return conn_send_command(tconn, sock, P_PING, 0, NULL, 0);
+ }
  
-       return ok;
+ int drbd_send_ping_ack(struct drbd_tconn *tconn)
+ {
+       struct drbd_socket *sock;
+       sock = &tconn->meta;
+       if (!conn_prepare_command(tconn, sock))
+               return -EIO;
+       return conn_send_command(tconn, sock, P_PING_ACK, 0, NULL, 0);
  }
  
- int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
+ int drbd_send_sync_param(struct drbd_conf *mdev)
  {
+       struct drbd_socket *sock;
        struct p_rs_param_95 *p;
-       struct socket *sock;
-       int size, rv;
-       const int apv = mdev->agreed_pro_version;
+       int size;
+       const int apv = mdev->tconn->agreed_pro_version;
+       enum drbd_packet cmd;
+       struct net_conf *nc;
+       struct disk_conf *dc;
+       sock = &mdev->tconn->data;
+       p = drbd_prepare_command(mdev, sock);
+       if (!p)
+               return -EIO;
+       rcu_read_lock();
+       nc = rcu_dereference(mdev->tconn->net_conf);
  
        size = apv <= 87 ? sizeof(struct p_rs_param)
                : apv == 88 ? sizeof(struct p_rs_param)
-                       + strlen(mdev->sync_conf.verify_alg) + 1
+                       + strlen(nc->verify_alg) + 1
                : apv <= 94 ? sizeof(struct p_rs_param_89)
                : /* apv >= 95 */ sizeof(struct p_rs_param_95);
  
-       /* used from admin command context and receiver/worker context.
-        * to avoid kmalloc, grab the socket right here,
-        * then use the pre-allocated sbuf there */
-       mutex_lock(&mdev->data.mutex);
-       sock = mdev->data.socket;
-       if (likely(sock != NULL)) {
-               enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
-               p = &mdev->data.sbuf.rs_param_95;
-               /* initialize verify_alg and csums_alg */
-               memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
+       cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
  
-               p->rate = cpu_to_be32(sc->rate);
-               p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
-               p->c_delay_target = cpu_to_be32(sc->c_delay_target);
-               p->c_fill_target = cpu_to_be32(sc->c_fill_target);
-               p->c_max_rate = cpu_to_be32(sc->c_max_rate);
+       /* initialize verify_alg and csums_alg */
+       memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
  
-               if (apv >= 88)
-                       strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
-               if (apv >= 89)
-                       strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
-               rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
-       } else
-               rv = 0; /* not ok */
+       if (get_ldev(mdev)) {
+               dc = rcu_dereference(mdev->ldev->disk_conf);
+               p->resync_rate = cpu_to_be32(dc->resync_rate);
+               p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead);
+               p->c_delay_target = cpu_to_be32(dc->c_delay_target);
+               p->c_fill_target = cpu_to_be32(dc->c_fill_target);
+               p->c_max_rate = cpu_to_be32(dc->c_max_rate);
+               put_ldev(mdev);
+       } else {
+               p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF);
+               p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF);
+               p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF);
+               p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF);
+               p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF);
+       }
  
-       mutex_unlock(&mdev->data.mutex);
+       if (apv >= 88)
+               strcpy(p->verify_alg, nc->verify_alg);
+       if (apv >= 89)
+               strcpy(p->csums_alg, nc->csums_alg);
+       rcu_read_unlock();
  
-       return rv;
+       return drbd_send_command(mdev, sock, cmd, size, NULL, 0);
  }
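
The on-wire size chosen above, per agreed protocol version (restating the ladder at the top of the function):

	apv <= 87    sizeof(struct p_rs_param)
	apv == 88    sizeof(struct p_rs_param) + strlen(verify_alg) + 1
	apv 89..94   sizeof(struct p_rs_param_89)   (adds csums_alg)
	apv >= 95    sizeof(struct p_rs_param_95)   (adds the c_* resync controller settings)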
  
- int drbd_send_protocol(struct drbd_conf *mdev)
+ int __drbd_send_protocol(struct drbd_tconn *tconn, enum drbd_packet cmd)
  {
+       struct drbd_socket *sock;
        struct p_protocol *p;
-       int size, cf, rv;
+       struct net_conf *nc;
+       int size, cf;
  
-       size = sizeof(struct p_protocol);
+       sock = &tconn->data;
+       p = __conn_prepare_command(tconn, sock);
+       if (!p)
+               return -EIO;
  
-       if (mdev->agreed_pro_version >= 87)
-               size += strlen(mdev->net_conf->integrity_alg) + 1;
+       rcu_read_lock();
+       nc = rcu_dereference(tconn->net_conf);
  
-       /* we must not recurse into our own queue,
-        * as that is blocked during handshake */
-       p = kmalloc(size, GFP_NOIO);
-       if (p == NULL)
-               return 0;
+       if (nc->tentative && tconn->agreed_pro_version < 92) {
+               rcu_read_unlock();
+               mutex_unlock(&sock->mutex);
+               conn_err(tconn, "--dry-run is not supported by peer");
+               return -EOPNOTSUPP;
+       }
  
-       p->protocol      = cpu_to_be32(mdev->net_conf->wire_protocol);
-       p->after_sb_0p   = cpu_to_be32(mdev->net_conf->after_sb_0p);
-       p->after_sb_1p   = cpu_to_be32(mdev->net_conf->after_sb_1p);
-       p->after_sb_2p   = cpu_to_be32(mdev->net_conf->after_sb_2p);
-       p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
+       size = sizeof(*p);
+       if (tconn->agreed_pro_version >= 87)
+               size += strlen(nc->integrity_alg) + 1;
  
+       p->protocol      = cpu_to_be32(nc->wire_protocol);
+       p->after_sb_0p   = cpu_to_be32(nc->after_sb_0p);
+       p->after_sb_1p   = cpu_to_be32(nc->after_sb_1p);
+       p->after_sb_2p   = cpu_to_be32(nc->after_sb_2p);
+       p->two_primaries = cpu_to_be32(nc->two_primaries);
        cf = 0;
-       if (mdev->net_conf->want_lose)
-               cf |= CF_WANT_LOSE;
-       if (mdev->net_conf->dry_run) {
-               if (mdev->agreed_pro_version >= 92)
-                       cf |= CF_DRY_RUN;
-               else {
-                       dev_err(DEV, "--dry-run is not supported by peer");
-                       kfree(p);
-                       return -1;
-               }
-       }
+       if (nc->discard_my_data)
+               cf |= CF_DISCARD_MY_DATA;
+       if (nc->tentative)
+               cf |= CF_DRY_RUN;
        p->conn_flags    = cpu_to_be32(cf);
  
-       if (mdev->agreed_pro_version >= 87)
-               strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
+       if (tconn->agreed_pro_version >= 87)
+               strcpy(p->integrity_alg, nc->integrity_alg);
+       rcu_read_unlock();
  
-       rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
-                          (struct p_header80 *)p, size);
-       kfree(p);
-       return rv;
+       return __conn_send_command(tconn, sock, cmd, size, NULL, 0);
+ }
+ int drbd_send_protocol(struct drbd_tconn *tconn)
+ {
+       int err;
+       mutex_lock(&tconn->data.mutex);
+       err = __drbd_send_protocol(tconn, P_PROTOCOL);
+       mutex_unlock(&tconn->data.mutex);
+       return err;
  }
  
  int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
  {
-       struct p_uuids p;
+       struct drbd_socket *sock;
+       struct p_uuids *p;
        int i;
  
        if (!get_ldev_if_state(mdev, D_NEGOTIATING))
-               return 1;
+               return 0;
  
+       sock = &mdev->tconn->data;
+       p = drbd_prepare_command(mdev, sock);
+       if (!p) {
+               put_ldev(mdev);
+               return -EIO;
+       }
        spin_lock_irq(&mdev->ldev->md.uuid_lock);
        for (i = UI_CURRENT; i < UI_SIZE; i++)
-               p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
+               p->uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
        spin_unlock_irq(&mdev->ldev->md.uuid_lock);
  
        mdev->comm_bm_set = drbd_bm_total_weight(mdev);
-       p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
-       uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
-       uuid_flags |= drbd_test_flag(mdev, CRASHED_PRIMARY) ? 2 : 0;
+       p->uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
+       rcu_read_lock();
+       uuid_flags |= rcu_dereference(mdev->tconn->net_conf)->discard_my_data ? 1 : 0;
+       rcu_read_unlock();
+       uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
        uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
-       p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
+       p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
  
        put_ldev(mdev);
-       return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
-                            (struct p_header80 *)&p, sizeof(p));
+       return drbd_send_command(mdev, sock, P_UUIDS, sizeof(*p), NULL, 0);
  }
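
The uuid_flags word sent in p->uuid[UI_FLAGS] packs three booleans, read off the assignments above; p->uuid[UI_SIZE] carries the number of set bits in the sync bitmap:

	0x1  local node has discard-my-data set (formerly want_lose)
	0x2  local node was a crashed primary   (CRASHED_PRIMARY flag)
	0x4  disk will come up D_INCONSISTENT   (new_state_tmp.disk)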
  
  int drbd_send_uuids(struct drbd_conf *mdev)
@@@ -2186,9 -884,10 +884,10 @@@ void drbd_print_uuids(struct drbd_conf 
        }
  }
  
- int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
+ void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
  {
-       struct p_rs_uuid p;
+       struct drbd_socket *sock;
+       struct p_rs_uuid *p;
        u64 uuid;
  
        D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
        drbd_uuid_set(mdev, UI_BITMAP, uuid);
        drbd_print_uuids(mdev, "updated sync UUID");
        drbd_md_sync(mdev);
-       p.uuid = cpu_to_be64(uuid);
  
-       return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
-                            (struct p_header80 *)&p, sizeof(p));
+       sock = &mdev->tconn->data;
+       p = drbd_prepare_command(mdev, sock);
+       if (p) {
+               p->uuid = cpu_to_be64(uuid);
+               drbd_send_command(mdev, sock, P_SYNC_UUID, sizeof(*p), NULL, 0);
+       }
  }
  
  int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
  {
-       struct p_sizes p;
+       struct drbd_socket *sock;
+       struct p_sizes *p;
        sector_t d_size, u_size;
 -      int q_order_type, max_bio_size;
 +      int q_order_type;
 +      unsigned int max_bio_size;
-       int ok;
  
        if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
                D_ASSERT(mdev->ldev->backing_bdev);
                d_size = drbd_get_max_capacity(mdev->ldev);
-               u_size = mdev->ldev->dc.disk_size;
+               rcu_read_lock();
+               u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
+               rcu_read_unlock();
                q_order_type = drbd_queue_order_type(mdev);
                max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
 -              max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
 +              max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
                put_ldev(mdev);
        } else {
                d_size = 0;
                max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
        }
  
-       /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
-       if (mdev->agreed_pro_version <= 94)
-               max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+       sock = &mdev->tconn->data;
+       p = drbd_prepare_command(mdev, sock);
+       if (!p)
+               return -EIO;
  
-       p.d_size = cpu_to_be64(d_size);
-       p.u_size = cpu_to_be64(u_size);
-       p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
-       p.max_bio_size = cpu_to_be32(max_bio_size);
-       p.queue_order_type = cpu_to_be16(q_order_type);
-       p.dds_flags = cpu_to_be16(flags);
+       if (mdev->tconn->agreed_pro_version <= 94)
 -              max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
++              max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+       else if (mdev->tconn->agreed_pro_version < 100)
 -              max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE_P95);
++              max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95);
  
-       ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
-                          (struct p_header80 *)&p, sizeof(p));
-       return ok;
+       p->d_size = cpu_to_be64(d_size);
+       p->u_size = cpu_to_be64(u_size);
+       p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
+       p->max_bio_size = cpu_to_be32(max_bio_size);
+       p->queue_order_type = cpu_to_be16(q_order_type);
+       p->dds_flags = cpu_to_be16(flags);
+       return drbd_send_command(mdev, sock, P_SIZES, sizeof(*p), NULL, 0);
  }
  
  /**
   */
  int drbd_send_current_state(struct drbd_conf *mdev)
  {
-       struct socket *sock;
-       struct p_state p;
-       int ok = 0;
-       /* Grab state lock so we wont send state if we're in the middle
-        * of a cluster wide state change on another thread */
-       drbd_state_lock(mdev);
-       mutex_lock(&mdev->data.mutex);
-       p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
-       sock = mdev->data.socket;
-       if (likely(sock != NULL)) {
-               ok = _drbd_send_cmd(mdev, sock, P_STATE,
-                                   (struct p_header80 *)&p, sizeof(p), 0);
-       }
-       mutex_unlock(&mdev->data.mutex);
+       struct drbd_socket *sock;
+       struct p_state *p;
  
-       drbd_state_unlock(mdev);
-       return ok;
+       sock = &mdev->tconn->data;
+       p = drbd_prepare_command(mdev, sock);
+       if (!p)
+               return -EIO;
+       p->state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
+       return drbd_send_command(mdev, sock, P_STATE, sizeof(*p), NULL, 0);
  }
  
  /**
   * drbd_send_state() - After a state change, sends the new state to the peer
-  * @mdev:     DRBD device.
-  * @state:    the state to send, not necessarily the current state.
+  * @mdev:      DRBD device.
+  * @state:     the state to send, not necessarily the current state.
   *
   * Each state change queues an "after_state_ch" work, which will eventually
   * send the resulting new state to the peer. If more state changes happen
   */
  int drbd_send_state(struct drbd_conf *mdev, union drbd_state state)
  {
-       struct socket *sock;
-       struct p_state p;
-       int ok = 0;
+       struct drbd_socket *sock;
+       struct p_state *p;
  
-       mutex_lock(&mdev->data.mutex);
+       sock = &mdev->tconn->data;
+       p = drbd_prepare_command(mdev, sock);
+       if (!p)
+               return -EIO;
+       p->state = cpu_to_be32(state.i); /* Within the send mutex */
+       return drbd_send_command(mdev, sock, P_STATE, sizeof(*p), NULL, 0);
+ }
  
-       p.state = cpu_to_be32(state.i);
-       sock = mdev->data.socket;
+ int drbd_send_state_req(struct drbd_conf *mdev, union drbd_state mask, union drbd_state val)
+ {
+       struct drbd_socket *sock;
+       struct p_req_state *p;
  
-       if (likely(sock != NULL)) {
-               ok = _drbd_send_cmd(mdev, sock, P_STATE,
-                                   (struct p_header80 *)&p, sizeof(p), 0);
-       }
+       sock = &mdev->tconn->data;
+       p = drbd_prepare_command(mdev, sock);
+       if (!p)
+               return -EIO;
+       p->mask = cpu_to_be32(mask.i);
+       p->val = cpu_to_be32(val.i);
+       return drbd_send_command(mdev, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0);
+ }
  
-       mutex_unlock(&mdev->data.mutex);
+ int conn_send_state_req(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val)
+ {
+       enum drbd_packet cmd;
+       struct drbd_socket *sock;
+       struct p_req_state *p;
  
-       return ok;
+       cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ;
+       sock = &tconn->data;
+       p = conn_prepare_command(tconn, sock);
+       if (!p)
+               return -EIO;
+       p->mask = cpu_to_be32(mask.i);
+       p->val = cpu_to_be32(val.i);
+       return conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0);
  }
  
- int drbd_send_state_req(struct drbd_conf *mdev,
-       union drbd_state mask, union drbd_state val)
+ void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
  {
-       struct p_req_state p;
+       struct drbd_socket *sock;
+       struct p_req_state_reply *p;
+       sock = &mdev->tconn->meta;
+       p = drbd_prepare_command(mdev, sock);
+       if (p) {
+               p->retcode = cpu_to_be32(retcode);
+               drbd_send_command(mdev, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0);
+       }
+ }
  
-       p.mask    = cpu_to_be32(mask.i);
-       p.val     = cpu_to_be32(val.i);
+ void conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode)
+ {
+       struct drbd_socket *sock;
+       struct p_req_state_reply *p;
+       enum drbd_packet cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY;
  
-       return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
-                            (struct p_header80 *)&p, sizeof(p));
+       sock = &tconn->meta;
+       p = conn_prepare_command(tconn, sock);
+       if (p) {
+               p->retcode = cpu_to_be32(retcode);
+               conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0);
+       }
  }
  
- int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
+ static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
  {
-       struct p_req_state_reply p;
+       BUG_ON(code & ~0xf);
+       p->encoding = (p->encoding & ~0xf) | code;
+ }
  
-       p.retcode    = cpu_to_be32(retcode);
+ static void dcbp_set_start(struct p_compressed_bm *p, int set)
+ {
+       p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
+ }
  
-       return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
-                            (struct p_header80 *)&p, sizeof(p));
+ static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n)
+ {
+       BUG_ON(n & ~0x7);
+       p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
  }
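
Taken together, the three dcbp_* helpers pack the whole compressed-bitmap description into the single p->encoding byte: bits 0..3 carry the drbd_bitmap_code, bits 4..6 the number of pad bits in the final byte of the bitstream, and bit 7 the value of the first run. A sketch of the layout, derived from the masks above:

/*
 *   p->encoding:   bit 7    bits 6..4    bits 3..0
 *                  start    pad_bits     code
 *
 * dcbp_set_start(p, 1)            sets 0x80;
 * dcbp_set_pad_bits(p, 3)         sets 0x30 but clears bits 0..6,
 *                                 since (~0x7 << 4) masks to bit 7 only;
 * dcbp_set_code(p, RLE_VLI_Bits)  must therefore run last, which is why
 * send_bitmap_rle_or_plain() applies it only after fill_bitmap_rle_bits()
 * has stored the start and pad bits.
 */
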
  
  int fill_bitmap_rle_bits(struct drbd_conf *mdev,
-       struct p_compressed_bm *p,
-       struct bm_xfer_ctx *c)
+                        struct p_compressed_bm *p,
+                        unsigned int size,
+                        struct bm_xfer_ctx *c)
  {
        struct bitstream bs;
        unsigned long plain_bits;
        unsigned long rl;
        unsigned len;
        unsigned toggle;
-       int bits;
+       int bits, use_rle;
  
        /* may we use this feature? */
-       if ((mdev->sync_conf.use_rle == 0) ||
-               (mdev->agreed_pro_version < 90))
-                       return 0;
+       rcu_read_lock();
+       use_rle = rcu_dereference(mdev->tconn->net_conf)->use_rle;
+       rcu_read_unlock();
+       if (!use_rle || mdev->tconn->agreed_pro_version < 90)
+               return 0;
  
        if (c->bit_offset >= c->bm_bits)
                return 0; /* nothing to do. */
  
        /* use at most thus many bytes */
-       bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
-       memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
+       bitstream_init(&bs, p->code, size, 0);
+       memset(p->code, 0, size);
        /* plain bits covered in this code string */
        plain_bits = 0;
  
                        if (rl == 0) {
                                /* the first checked bit was set,
                                 * store start value, */
-                               DCBP_set_start(p, 1);
+                               dcbp_set_start(p, 1);
                                /* but skip encoding of zero run length */
                                toggle = !toggle;
                                continue;
                        }
-                       DCBP_set_start(p, 0);
+                       dcbp_set_start(p, 0);
                }
  
                /* paranoia: catch zero runlength.
        bm_xfer_ctx_bit_to_word_offset(c);
  
        /* store pad_bits */
-       DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
+       dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
  
        return len;
  }
   * code upon failure.
   */
  static int
- send_bitmap_rle_or_plain(struct drbd_conf *mdev,
-                        struct p_header80 *h, struct bm_xfer_ctx *c)
+ send_bitmap_rle_or_plain(struct drbd_conf *mdev, struct bm_xfer_ctx *c)
  {
-       struct p_compressed_bm *p = (void*)h;
-       unsigned long num_words;
-       int len;
-       int ok;
-       len = fill_bitmap_rle_bits(mdev, p, c);
+       struct drbd_socket *sock = &mdev->tconn->data;
+       unsigned int header_size = drbd_header_size(mdev->tconn);
+       struct p_compressed_bm *p = sock->sbuf + header_size;
+       int len, err;
  
+       len = fill_bitmap_rle_bits(mdev, p,
+                       DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c);
        if (len < 0)
                return -EIO;
  
        if (len) {
-               DCBP_set_code(p, RLE_VLI_Bits);
-               ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
-                       sizeof(*p) + len, 0);
+               dcbp_set_code(p, RLE_VLI_Bits);
+               err = __send_command(mdev->tconn, mdev->vnr, sock,
+                                    P_COMPRESSED_BITMAP, sizeof(*p) + len,
+                                    NULL, 0);
                c->packets[0]++;
-               c->bytes[0] += sizeof(*p) + len;
+               c->bytes[0] += header_size + sizeof(*p) + len;
  
                if (c->bit_offset >= c->bm_bits)
                        len = 0; /* DONE */
        } else {
                /* was not compressible.
                 * send a buffer full of plain text bits instead. */
-               num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
-               len = num_words * sizeof(long);
+               unsigned int data_size;
+               unsigned long num_words;
+               unsigned long *p = sock->sbuf + header_size;
+               data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
+               num_words = min_t(size_t, data_size / sizeof(*p),
+                                 c->bm_words - c->word_offset);
+               len = num_words * sizeof(*p);
                if (len)
-                       drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
-               ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
-                                  h, sizeof(struct p_header80) + len, 0);
+                       drbd_bm_get_lel(mdev, c->word_offset, num_words, p);
+               err = __send_command(mdev->tconn, mdev->vnr, sock, P_BITMAP, len, NULL, 0);
                c->word_offset += num_words;
                c->bit_offset = c->word_offset * BITS_PER_LONG;
  
                c->packets[1]++;
-               c->bytes[1] += sizeof(struct p_header80) + len;
+               c->bytes[1] += header_size + len;
  
                if (c->bit_offset > c->bm_bits)
                        c->bit_offset = c->bm_bits;
        }
-       if (ok) {
+       if (!err) {
                if (len == 0) {
                        INFO_bm_xfer_stats(mdev, "send", c);
                        return 0;
  }
  
  /* See the comment at receive_bitmap() */
- int _drbd_send_bitmap(struct drbd_conf *mdev)
+ static int _drbd_send_bitmap(struct drbd_conf *mdev)
  {
        struct bm_xfer_ctx c;
-       struct p_header80 *p;
        int err;
  
-       ERR_IF(!mdev->bitmap) return false;
-       /* maybe we should use some per thread scratch page,
-        * and allocate that during initial device creation? */
-       p = (struct p_header80 *) __get_free_page(GFP_NOIO);
-       if (!p) {
-               dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
+       if (!expect(mdev->bitmap))
                return false;
-       }
  
        if (get_ldev(mdev)) {
                if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
        };
  
        do {
-               err = send_bitmap_rle_or_plain(mdev, p, &c);
+               err = send_bitmap_rle_or_plain(mdev, &c);
        } while (err > 0);
  
-       free_page((unsigned long) p);
        return err == 0;
  }
  
  int drbd_send_bitmap(struct drbd_conf *mdev)
  {
-       int err;
+       struct drbd_socket *sock = &mdev->tconn->data;
+       int err = -1;
  
-       if (!drbd_get_data_sock(mdev))
-               return -1;
-       err = !_drbd_send_bitmap(mdev);
-       drbd_put_data_sock(mdev);
+       mutex_lock(&sock->mutex);
+       if (sock->socket)
+               err = !_drbd_send_bitmap(mdev);
+       mutex_unlock(&sock->mutex);
        return err;
  }
  
- int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
+ void drbd_send_b_ack(struct drbd_tconn *tconn, u32 barrier_nr, u32 set_size)
  {
-       int ok;
-       struct p_barrier_ack p;
+       struct drbd_socket *sock;
+       struct p_barrier_ack *p;
  
-       p.barrier  = barrier_nr;
-       p.set_size = cpu_to_be32(set_size);
+       if (tconn->cstate < C_WF_REPORT_PARAMS)
+               return;
  
-       if (mdev->state.conn < C_CONNECTED)
-               return false;
-       ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
-                       (struct p_header80 *)&p, sizeof(p));
-       return ok;
+       sock = &tconn->meta;
+       p = conn_prepare_command(tconn, sock);
+       if (!p)
+               return;
+       p->barrier = barrier_nr;
+       p->set_size = cpu_to_be32(set_size);
+       conn_send_command(tconn, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0);
  }
  
  /**
   * @blksize:  size in bytes, needs to be in big endian byte order
   * @block_id: Id, big endian byte order
   */
- static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
-                         u64 sector,
-                         u32 blksize,
-                         u64 block_id)
+ static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
+                         u64 sector, u32 blksize, u64 block_id)
  {
-       int ok;
-       struct p_block_ack p;
+       struct drbd_socket *sock;
+       struct p_block_ack *p;
  
-       p.sector   = sector;
-       p.block_id = block_id;
-       p.blksize  = blksize;
-       p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
+       if (mdev->state.conn < C_CONNECTED)
+               return -EIO;
  
-       if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
-               return false;
-       ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
-                               (struct p_header80 *)&p, sizeof(p));
-       return ok;
+       sock = &mdev->tconn->meta;
+       p = drbd_prepare_command(mdev, sock);
+       if (!p)
+               return -EIO;
+       p->sector = sector;
+       p->block_id = block_id;
+       p->blksize = blksize;
+       p->seq_num = cpu_to_be32(atomic_inc_return(&mdev->packet_seq));
+       return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0);
  }
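
Note the byte-order contract: _drbd_send_ack() stores sector, blksize and block_id verbatim and only converts seq_num itself, so callers must pass values that are already big-endian. That is why drbd_send_ack_dp() and drbd_send_ack_rp() below can forward fields straight out of a received packet, while drbd_send_ack() and drbd_send_ack_ex() convert from host order first. A hypothetical call site with host-order values would convert explicitly:

/* sector and size are host-order locals here (illustrative fragment) */
_drbd_send_ack(mdev, P_RS_WRITE_ACK,
               cpu_to_be64(sector),
               cpu_to_be32(size),
               ID_SYNCER);      /* block_id travels as an opaque u64 */
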
  
  /* dp->sector and dp->block_id already/still in network byte order,
   * data_size is payload size according to dp->head,
   * and may need to be corrected for digest size. */
- int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
-                    struct p_data *dp, int data_size)
+ void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd,
+                     struct p_data *dp, int data_size)
  {
-       data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
-               crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
-       return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
-                             dp->block_id);
+       if (mdev->tconn->peer_integrity_tfm)
+               data_size -= crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm);
+       _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
+                      dp->block_id);
  }
  
- int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
-                    struct p_block_req *rp)
+ void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd,
+                     struct p_block_req *rp)
  {
-       return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
+       _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
  }
  
  /**
   * drbd_send_ack() - Sends an ack packet
-  * @mdev:     DRBD device.
-  * @cmd:      Packet command code.
-  * @e:                Epoch entry.
+  * @mdev:     DRBD device
+  * @cmd:      packet command code
+  * @peer_req: peer request
   */
- int drbd_send_ack(struct drbd_conf *mdev,
-       enum drbd_packets cmd, struct drbd_epoch_entry *e)
+ int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
+                 struct drbd_peer_request *peer_req)
  {
        return _drbd_send_ack(mdev, cmd,
-                             cpu_to_be64(e->sector),
-                             cpu_to_be32(e->size),
-                             e->block_id);
+                             cpu_to_be64(peer_req->i.sector),
+                             cpu_to_be32(peer_req->i.size),
+                             peer_req->block_id);
  }
  
  /* This function misuses the block_id field to signal if the blocks
   * are in sync or not. */
- int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
+ int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd,
                     sector_t sector, int blksize, u64 block_id)
  {
        return _drbd_send_ack(mdev, cmd,
  int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
                       sector_t sector, int size, u64 block_id)
  {
-       int ok;
-       struct p_block_req p;
-       p.sector   = cpu_to_be64(sector);
-       p.block_id = block_id;
-       p.blksize  = cpu_to_be32(size);
+       struct drbd_socket *sock;
+       struct p_block_req *p;
  
-       ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
-                               (struct p_header80 *)&p, sizeof(p));
-       return ok;
+       sock = &mdev->tconn->data;
+       p = drbd_prepare_command(mdev, sock);
+       if (!p)
+               return -EIO;
+       p->sector = cpu_to_be64(sector);
+       p->block_id = block_id;
+       p->blksize = cpu_to_be32(size);
+       return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0);
  }
  
- int drbd_send_drequest_csum(struct drbd_conf *mdev,
-                           sector_t sector, int size,
-                           void *digest, int digest_size,
-                           enum drbd_packets cmd)
+ int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, int size,
+                           void *digest, int digest_size, enum drbd_packet cmd)
  {
-       int ok;
-       struct p_block_req p;
-       p.sector   = cpu_to_be64(sector);
-       p.block_id = BE_DRBD_MAGIC + 0xbeef;
-       p.blksize  = cpu_to_be32(size);
+       struct drbd_socket *sock;
+       struct p_block_req *p;
  
-       p.head.magic   = BE_DRBD_MAGIC;
-       p.head.command = cpu_to_be16(cmd);
-       p.head.length  = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
+       /* FIXME: Put the digest into the preallocated socket buffer.  */
  
-       mutex_lock(&mdev->data.mutex);
-       ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
-       ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
-       mutex_unlock(&mdev->data.mutex);
-       return ok;
+       sock = &mdev->tconn->data;
+       p = drbd_prepare_command(mdev, sock);
+       if (!p)
+               return -EIO;
+       p->sector = cpu_to_be64(sector);
+       p->block_id = ID_SYNCER /* unused */;
+       p->blksize = cpu_to_be32(size);
+       return drbd_send_command(mdev, sock, cmd, sizeof(*p),
+                                digest, digest_size);
  }
  
  int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
  {
-       int ok;
-       struct p_block_req p;
+       struct drbd_socket *sock;
+       struct p_block_req *p;
  
-       p.sector   = cpu_to_be64(sector);
-       p.block_id = BE_DRBD_MAGIC + 0xbabe;
-       p.blksize  = cpu_to_be32(size);
-       ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
-                          (struct p_header80 *)&p, sizeof(p));
-       return ok;
+       sock = &mdev->tconn->data;
+       p = drbd_prepare_command(mdev, sock);
+       if (!p)
+               return -EIO;
+       p->sector = cpu_to_be64(sector);
+       p->block_id = ID_SYNCER /* unused */;
+       p->blksize = cpu_to_be32(size);
+       return drbd_send_command(mdev, sock, P_OV_REQUEST, sizeof(*p), NULL, 0);
  }
  
  /* called on sndtimeo
   * returns false if we should retry,
   * true if we think connection is dead
   */
- static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
+ static int we_should_drop_the_connection(struct drbd_tconn *tconn, struct socket *sock)
  {
        int drop_it;
        /* long elapsed = (long)(jiffies - mdev->last_received); */
  
-       drop_it =   mdev->meta.socket == sock
-               || !mdev->asender.task
-               || get_t_state(&mdev->asender) != Running
-               || mdev->state.conn < C_CONNECTED;
+       drop_it =   tconn->meta.socket == sock
+               || !tconn->asender.task
+               || get_t_state(&tconn->asender) != RUNNING
+               || tconn->cstate < C_WF_REPORT_PARAMS;
  
        if (drop_it)
                return true;
  
-       drop_it = !--mdev->ko_count;
+       drop_it = !--tconn->ko_count;
        if (!drop_it) {
-               dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
-                      current->comm, current->pid, mdev->ko_count);
-               request_ping(mdev);
+               conn_err(tconn, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
+                        current->comm, current->pid, tconn->ko_count);
+               request_ping(tconn);
        }
  
        return drop_it; /* && (mdev->state == R_PRIMARY) */;
  }
  
+ static void drbd_update_congested(struct drbd_tconn *tconn)
+ {
+       struct sock *sk = tconn->data.socket->sk;
+
+       if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
+               set_bit(NET_CONGESTED, &tconn->flags);
+ }
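
drbd_update_congested() flags the connection once more than four fifths of the socket's send-buffer budget is queued; drbd_send() and _drbd_send_page() clear NET_CONGESTED again when they finish, and drbd_congested() further down reports the bit to the BDI layer. A worked example of the threshold, with an illustrative buffer size:

/*
 * e.g. sk_sndbuf = 131072 (128 KiB):
 *      131072 * 4 / 5 = 104857
 * so once sk_wmem_queued exceeds 104857 bytes,
 * set_bit(NET_CONGESTED, &tconn->flags) is reached.
 */
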
  /* The idea of sendpage seems to be to put some kind of reference
   * to the page into the skb, and to hand it over to the NIC. In
   * this process get_page() gets called.
   * with page_count == 0 or PageSlab.
   */
  static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
-                  int offset, size_t size, unsigned msg_flags)
+                             int offset, size_t size, unsigned msg_flags)
  {
-       int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
+       struct socket *socket;
+       void *addr;
+       int err;
+       socket = mdev->tconn->data.socket;
+       addr = kmap(page) + offset;
+       err = drbd_send_all(mdev->tconn, socket, addr, size, msg_flags);
        kunmap(page);
-       if (sent == size)
-               mdev->send_cnt += size>>9;
-       return sent == size;
+       if (!err)
+               mdev->send_cnt += size >> 9;
+       return err;
  }
  
  static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
                    int offset, size_t size, unsigned msg_flags)
  {
+       struct socket *socket = mdev->tconn->data.socket;
        mm_segment_t oldfs = get_fs();
-       int sent, ok;
        int len = size;
+       int err = -EIO;
  
        /* e.g. XFS meta- & log-data is in slab pages, which have a
         * page_count of 0 and/or have PageSlab() set.
                return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
  
        msg_flags |= MSG_NOSIGNAL;
-       drbd_update_congested(mdev);
+       drbd_update_congested(mdev->tconn);
        set_fs(KERNEL_DS);
        do {
-               sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
-                                                       offset, len,
-                                                       msg_flags);
-               if (sent == -EAGAIN) {
-                       if (we_should_drop_the_connection(mdev,
-                                                         mdev->data.socket))
-                               break;
-                       else
-                               continue;
-               }
+               int sent;
+               sent = socket->ops->sendpage(socket, page, offset, len, msg_flags);
                if (sent <= 0) {
+                       if (sent == -EAGAIN) {
+                               if (we_should_drop_the_connection(mdev->tconn, socket))
+                                       break;
+                               continue;
+                       }
                        dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
                             __func__, (int)size, len, sent);
+                       if (sent < 0)
+                               err = sent;
                        break;
                }
                len    -= sent;
                offset += sent;
        } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
        set_fs(oldfs);
-       drbd_clear_flag(mdev, NET_CONGESTED);
+       clear_bit(NET_CONGESTED, &mdev->tconn->flags);
  
-       ok = (len == 0);
-       if (likely(ok))
-               mdev->send_cnt += size>>9;
-       return ok;
+       if (len == 0) {
+               err = 0;
+               mdev->send_cnt += size >> 9;
+       }
+       return err;
  }
  
  static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
        int i;
        /* hint all but last page with MSG_MORE */
        bio_for_each_segment(bvec, bio, i) {
-               if (!_drbd_no_send_page(mdev, bvec->bv_page,
-                                    bvec->bv_offset, bvec->bv_len,
-                                    i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
-                       return 0;
+               int err;
+               err = _drbd_no_send_page(mdev, bvec->bv_page,
+                                        bvec->bv_offset, bvec->bv_len,
+                                        i == bio->bi_vcnt - 1 ? 0 : MSG_MORE);
+               if (err)
+                       return err;
        }
-       return 1;
+       return 0;
  }
  
  static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
        int i;
        /* hint all but last page with MSG_MORE */
        bio_for_each_segment(bvec, bio, i) {
-               if (!_drbd_send_page(mdev, bvec->bv_page,
-                                    bvec->bv_offset, bvec->bv_len,
-                                    i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
-                       return 0;
+               int err;
+               err = _drbd_send_page(mdev, bvec->bv_page,
+                                     bvec->bv_offset, bvec->bv_len,
+                                     i == bio->bi_vcnt - 1 ? 0 : MSG_MORE);
+               if (err)
+                       return err;
        }
-       return 1;
+       return 0;
  }
  
- static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+ static int _drbd_send_zc_ee(struct drbd_conf *mdev,
+                           struct drbd_peer_request *peer_req)
  {
-       struct page *page = e->pages;
-       unsigned len = e->size;
+       struct page *page = peer_req->pages;
+       unsigned len = peer_req->i.size;
+       int err;
        /* hint all but last page with MSG_MORE */
        page_chain_for_each(page) {
                unsigned l = min_t(unsigned, len, PAGE_SIZE);
-               if (!_drbd_send_page(mdev, page, 0, l,
-                               page_chain_next(page) ? MSG_MORE : 0))
-                       return 0;
+               err = _drbd_send_page(mdev, page, 0, l,
+                                     page_chain_next(page) ? MSG_MORE : 0);
+               if (err)
+                       return err;
                len -= l;
        }
-       return 1;
+       return 0;
  }
  
  static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
  {
-       if (mdev->agreed_pro_version >= 95)
+       if (mdev->tconn->agreed_pro_version >= 95)
                return  (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
                        (bi_rw & REQ_FUA ? DP_FUA : 0) |
                        (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
   */
  int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
  {
-       int ok = 1;
-       struct p_data p;
+       struct drbd_socket *sock;
+       struct p_data *p;
        unsigned int dp_flags = 0;
-       void *dgb;
        int dgs;
+       int err;
  
-       if (!drbd_get_data_sock(mdev))
-               return 0;
-       dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
-               crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
-       if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
-               p.head.h80.magic   = BE_DRBD_MAGIC;
-               p.head.h80.command = cpu_to_be16(P_DATA);
-               p.head.h80.length  =
-                       cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
-       } else {
-               p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
-               p.head.h95.command = cpu_to_be16(P_DATA);
-               p.head.h95.length  =
-                       cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
-       }
-       p.sector   = cpu_to_be64(req->sector);
-       p.block_id = (unsigned long)req;
-       p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
+       sock = &mdev->tconn->data;
+       p = drbd_prepare_command(mdev, sock);
+       dgs = mdev->tconn->integrity_tfm ? crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0;
  
+       if (!p)
+               return -EIO;
+       p->sector = cpu_to_be64(req->i.sector);
+       p->block_id = (unsigned long)req;
+       p->seq_num = cpu_to_be32(atomic_inc_return(&mdev->packet_seq));
        dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
        if (mdev->state.conn >= C_SYNC_SOURCE &&
            mdev->state.conn <= C_PAUSED_SYNC_T)
                dp_flags |= DP_MAY_SET_IN_SYNC;
-       p.dp_flags = cpu_to_be32(dp_flags);
-       drbd_set_flag(mdev, UNPLUG_REMOTE);
-       ok = (sizeof(p) ==
-               drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
-       if (ok && dgs) {
-               dgb = mdev->int_dig_out;
-               drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
-               ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
-       }
-       if (ok) {
+       if (mdev->tconn->agreed_pro_version >= 100) {
+               if (req->rq_state & RQ_EXP_RECEIVE_ACK)
+                       dp_flags |= DP_SEND_RECEIVE_ACK;
+               if (req->rq_state & RQ_EXP_WRITE_ACK)
+                       dp_flags |= DP_SEND_WRITE_ACK;
+       }
+       p->dp_flags = cpu_to_be32(dp_flags);
+       if (dgs)
+               drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, p + 1);
+       err = __send_command(mdev->tconn, mdev->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size);
+       if (!err) {
                /* For protocol A, we have to memcpy the payload into
                 * socket buffers, as we may complete right away
                 * as soon as we handed it over to tcp, at which point the data
                 * out ok after sending on this side, but does not fit on the
                 * receiving side, we sure have detected corruption elsewhere.
                 */
-               if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
-                       ok = _drbd_send_bio(mdev, req->master_bio);
+               if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || dgs)
+                       err = _drbd_send_bio(mdev, req->master_bio);
                else
-                       ok = _drbd_send_zc_bio(mdev, req->master_bio);
+                       err = _drbd_send_zc_bio(mdev, req->master_bio);
  
                /* double check digest, sometimes buffers have been modified in flight. */
                if (dgs > 0 && dgs <= 64) {
                        /* 64 byte, 512 bit, is the largest digest size
                         * currently supported in kernel crypto. */
                        unsigned char digest[64];
-                       drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
-                       if (memcmp(mdev->int_dig_out, digest, dgs)) {
+                       drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, digest);
+                       if (memcmp(p + 1, digest, dgs)) {
                                dev_warn(DEV,
                                        "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
-                                       (unsigned long long)req->sector, req->size);
+                                       (unsigned long long)req->i.sector, req->i.size);
                        }
                } /* else if (dgs > 64) {
                     ... Be noisy about digest too large ...
                } */
        }
+       mutex_unlock(&sock->mutex);  /* locked by drbd_prepare_command() */
  
-       drbd_put_data_sock(mdev);
-       return ok;
+       return err;
  }
  
  /* answer packet, used to send data back for read requests:
   *  Peer       -> (diskless) R_PRIMARY   (P_DATA_REPLY)
   *  C_SYNC_SOURCE -> C_SYNC_TARGET         (P_RS_DATA_REPLY)
   */
- int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
-                   struct drbd_epoch_entry *e)
+ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packet cmd,
+                   struct drbd_peer_request *peer_req)
  {
-       int ok;
-       struct p_data p;
-       void *dgb;
+       struct drbd_socket *sock;
+       struct p_data *p;
+       int err;
        int dgs;
  
-       dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
-               crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
-       if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
-               p.head.h80.magic   = BE_DRBD_MAGIC;
-               p.head.h80.command = cpu_to_be16(cmd);
-               p.head.h80.length  =
-                       cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
-       } else {
-               p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
-               p.head.h95.command = cpu_to_be16(cmd);
-               p.head.h95.length  =
-                       cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
-       }
-       p.sector   = cpu_to_be64(e->sector);
-       p.block_id = e->block_id;
-       /* p.seq_num  = 0;    No sequence numbers here.. */
-       /* Only called by our kernel thread.
-        * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
-        * in response to admin command or module unload.
-        */
-       if (!drbd_get_data_sock(mdev))
-               return 0;
+       sock = &mdev->tconn->data;
+       p = drbd_prepare_command(mdev, sock);
  
-       ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
-       if (ok && dgs) {
-               dgb = mdev->int_dig_out;
-               drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
-               ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
-       }
-       if (ok)
-               ok = _drbd_send_zc_ee(mdev, e);
+       dgs = mdev->tconn->integrity_tfm ? crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0;
  
-       drbd_put_data_sock(mdev);
+       if (!p)
+               return -EIO;
+       p->sector = cpu_to_be64(peer_req->i.sector);
+       p->block_id = peer_req->block_id;
+       p->seq_num = 0;  /* unused */
+       p->dp_flags = 0;
+       if (dgs)
+               drbd_csum_ee(mdev, mdev->tconn->integrity_tfm, peer_req, p + 1);
+       err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, sizeof(*p) + dgs, NULL, peer_req->i.size);
+       if (!err)
+               err = _drbd_send_zc_ee(mdev, peer_req);
+       mutex_unlock(&sock->mutex);  /* locked by drbd_prepare_command() */
  
-       return ok;
+       return err;
  }
  
- int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
+ int drbd_send_out_of_sync(struct drbd_conf *mdev, struct drbd_request *req)
  {
-       struct p_block_desc p;
+       struct drbd_socket *sock;
+       struct p_block_desc *p;
  
-       p.sector  = cpu_to_be64(req->sector);
-       p.blksize = cpu_to_be32(req->size);
-       return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
+       sock = &mdev->tconn->data;
+       p = drbd_prepare_command(mdev, sock);
+       if (!p)
+               return -EIO;
+       p->sector = cpu_to_be64(req->i.sector);
+       p->blksize = cpu_to_be32(req->i.size);
+       return drbd_send_command(mdev, sock, P_OUT_OF_SYNC, sizeof(*p), NULL, 0);
  }
  
  /*
   * you must have down()ed the appropriate [m]sock_mutex elsewhere!
   */
- int drbd_send(struct drbd_conf *mdev, struct socket *sock,
+ int drbd_send(struct drbd_tconn *tconn, struct socket *sock,
              void *buf, size_t size, unsigned msg_flags)
  {
        struct kvec iov;
        int rv, sent = 0;
  
        if (!sock)
-               return -1000;
+               return -EBADR;
  
        /* THINK  if (signal_pending) return ... ? */
  
        msg.msg_controllen = 0;
        msg.msg_flags      = msg_flags | MSG_NOSIGNAL;
  
-       if (sock == mdev->data.socket) {
-               mdev->ko_count = mdev->net_conf->ko_count;
-               drbd_update_congested(mdev);
+       if (sock == tconn->data.socket) {
+               rcu_read_lock();
+               tconn->ko_count = rcu_dereference(tconn->net_conf)->ko_count;
+               rcu_read_unlock();
+               drbd_update_congested(tconn);
        }
        do {
                /* STRANGE
   */
                rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
                if (rv == -EAGAIN) {
-                       if (we_should_drop_the_connection(mdev, sock))
+                       if (we_should_drop_the_connection(tconn, sock))
                                break;
                        else
                                continue;
                }
-               D_ASSERT(rv != 0);
                if (rv == -EINTR) {
                        flush_signals(current);
                        rv = 0;
                iov.iov_len  -= rv;
        } while (sent < size);
  
-       if (sock == mdev->data.socket)
-               drbd_clear_flag(mdev, NET_CONGESTED);
+       if (sock == tconn->data.socket)
+               clear_bit(NET_CONGESTED, &tconn->flags);
  
        if (rv <= 0) {
                if (rv != -EAGAIN) {
-                       dev_err(DEV, "%s_sendmsg returned %d\n",
-                           sock == mdev->meta.socket ? "msock" : "sock",
-                           rv);
-                       drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
+                       conn_err(tconn, "%s_sendmsg returned %d\n",
+                                sock == tconn->meta.socket ? "msock" : "sock",
+                                rv);
+                       conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD);
                } else
-                       drbd_force_state(mdev, NS(conn, C_TIMEOUT));
+                       conn_request_state(tconn, NS(conn, C_TIMEOUT), CS_HARD);
        }
  
        return sent;
  }
  
+ /**
+  * drbd_send_all  -  Send an entire buffer
+  *
+  * Returns 0 upon success and a negative error value otherwise.
+  */
+ int drbd_send_all(struct drbd_tconn *tconn, struct socket *sock, void *buffer,
+                 size_t size, unsigned msg_flags)
+ {
+       int err;
+
+       err = drbd_send(tconn, sock, buffer, size, msg_flags);
+       if (err < 0)
+               return err;
+       if (err != size)
+               return -EIO;
+       return 0;
+ }
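
Since drbd_send() returns the number of bytes it managed to queue (or a negative error), drbd_send_all() is the wrapper to use wherever a short send must already count as a failure, e.g. the kmap()ed page path in _drbd_no_send_page() above. A usage sketch with a purely illustrative payload:

char payload[] = "example";     /* illustrative buffer only */
int err;

err = drbd_send_all(tconn, tconn->data.socket,
                    payload, sizeof(payload), 0);
if (err)        /* 0 only if every byte went out, negative errno otherwise */
        conn_err(tconn, "send failed: %d\n", err);
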
  static int drbd_open(struct block_device *bdev, fmode_t mode)
  {
        struct drbd_conf *mdev = bdev->bd_disk->private_data;
        int rv = 0;
  
        mutex_lock(&drbd_main_mutex);
-       spin_lock_irqsave(&mdev->req_lock, flags);
+       spin_lock_irqsave(&mdev->tconn->req_lock, flags);
        /* to have a stable mdev->state.role
         * and no race with updating open_cnt */
  
  
        if (!rv)
                mdev->open_cnt++;
-       spin_unlock_irqrestore(&mdev->req_lock, flags);
+       spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
        mutex_unlock(&drbd_main_mutex);
  
        return rv;
@@@ -3111,35 -1859,14 +1860,14 @@@ static int drbd_release(struct gendisk 
  
  static void drbd_set_defaults(struct drbd_conf *mdev)
  {
-       /* This way we get a compile error when sync_conf grows,
-          and we forgot to initialize it here */
-       mdev->sync_conf = (struct syncer_conf) {
-               /* .rate = */           DRBD_RATE_DEF,
-               /* .after = */          DRBD_AFTER_DEF,
-               /* .al_extents = */     DRBD_AL_EXTENTS_DEF,
-               /* .verify_alg = */     {}, 0,
-               /* .cpu_mask = */       {}, 0,
-               /* .csums_alg = */      {}, 0,
-               /* .use_rle = */        0,
-               /* .on_no_data = */     DRBD_ON_NO_DATA_DEF,
-               /* .c_plan_ahead = */   DRBD_C_PLAN_AHEAD_DEF,
-               /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
-               /* .c_fill_target = */  DRBD_C_FILL_TARGET_DEF,
-               /* .c_max_rate = */     DRBD_C_MAX_RATE_DEF,
-               /* .c_min_rate = */     DRBD_C_MIN_RATE_DEF
-       };
-       /* Have to use that way, because the layout differs between
-          big endian and little endian */
-       mdev->state = (union drbd_state) {
+       /* Beware! The actual layout differs
+        * between big endian and little endian */
+       mdev->state = (union drbd_dev_state) {
                { .role = R_SECONDARY,
                  .peer = R_UNKNOWN,
                  .conn = C_STANDALONE,
                  .disk = D_DISKLESS,
                  .pdsk = D_UNKNOWN,
-                 .susp = 0,
-                 .susp_nod = 0,
-                 .susp_fen = 0
                } };
  }
  
@@@ -3155,28 -1882,17 +1883,17 @@@ void drbd_init_set_defaults(struct drbd
        atomic_set(&mdev->rs_pending_cnt, 0);
        atomic_set(&mdev->unacked_cnt, 0);
        atomic_set(&mdev->local_cnt, 0);
-       atomic_set(&mdev->net_cnt, 0);
-       atomic_set(&mdev->packet_seq, 0);
-       atomic_set(&mdev->pp_in_use, 0);
        atomic_set(&mdev->pp_in_use_by_net, 0);
        atomic_set(&mdev->rs_sect_in, 0);
        atomic_set(&mdev->rs_sect_ev, 0);
        atomic_set(&mdev->ap_in_flight, 0);
        atomic_set(&mdev->md_io_in_use, 0);
  
-       mutex_init(&mdev->data.mutex);
-       mutex_init(&mdev->meta.mutex);
-       sema_init(&mdev->data.work.s, 0);
-       sema_init(&mdev->meta.work.s, 0);
-       mutex_init(&mdev->state_mutex);
-       spin_lock_init(&mdev->data.work.q_lock);
-       spin_lock_init(&mdev->meta.work.q_lock);
+       mutex_init(&mdev->own_state_mutex);
+       mdev->state_mutex = &mdev->own_state_mutex;
  
        spin_lock_init(&mdev->al_lock);
-       spin_lock_init(&mdev->req_lock);
        spin_lock_init(&mdev->peer_seq_lock);
-       spin_lock_init(&mdev->epoch_lock);
  
        INIT_LIST_HEAD(&mdev->active_ee);
        INIT_LIST_HEAD(&mdev->sync_ee);
        INIT_LIST_HEAD(&mdev->read_ee);
        INIT_LIST_HEAD(&mdev->net_ee);
        INIT_LIST_HEAD(&mdev->resync_reads);
-       INIT_LIST_HEAD(&mdev->data.work.q);
-       INIT_LIST_HEAD(&mdev->meta.work.q);
        INIT_LIST_HEAD(&mdev->resync_work.list);
        INIT_LIST_HEAD(&mdev->unplug_work.list);
        INIT_LIST_HEAD(&mdev->go_diskless.list);
        mdev->md_sync_work.cb = w_md_sync;
        mdev->bm_io_work.w.cb = w_bitmap_io;
        mdev->start_resync_work.cb = w_start_resync;
+       mdev->resync_work.mdev  = mdev;
+       mdev->unplug_work.mdev  = mdev;
+       mdev->go_diskless.mdev  = mdev;
+       mdev->md_sync_work.mdev = mdev;
+       mdev->bm_io_work.w.mdev = mdev;
+       mdev->start_resync_work.mdev = mdev;
        init_timer(&mdev->resync_timer);
        init_timer(&mdev->md_sync_timer);
        init_timer(&mdev->start_resync_timer);
  
        init_waitqueue_head(&mdev->misc_wait);
        init_waitqueue_head(&mdev->state_wait);
-       init_waitqueue_head(&mdev->net_cnt_wait);
        init_waitqueue_head(&mdev->ee_wait);
        init_waitqueue_head(&mdev->al_wait);
        init_waitqueue_head(&mdev->seq_wait);
  
-       drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
-       drbd_thread_init(mdev, &mdev->worker, drbd_worker);
-       drbd_thread_init(mdev, &mdev->asender, drbd_asender);
-       mdev->agreed_pro_version = PRO_VERSION_MAX;
-       mdev->write_ordering = WO_bdev_flush;
        mdev->resync_wenr = LC_FREE;
        mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
        mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
  void drbd_mdev_cleanup(struct drbd_conf *mdev)
  {
        int i;
-       if (mdev->receiver.t_state != None)
+       if (mdev->tconn->receiver.t_state != NONE)
                dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
-                               mdev->receiver.t_state);
+                               mdev->tconn->receiver.t_state);
  
-       /* no need to lock it, I'm the only thread alive */
-       if (atomic_read(&mdev->current_epoch->epoch_size) !=  0)
-               dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
        mdev->al_writ_cnt  =
        mdev->bm_writ_cnt  =
        mdev->read_cnt     =
                mdev->rs_mark_left[i] = 0;
                mdev->rs_mark_time[i] = 0;
        }
-       D_ASSERT(mdev->net_conf == NULL);
+       D_ASSERT(mdev->tconn->net_conf == NULL);
  
        drbd_set_my_capacity(mdev, 0);
        if (mdev->bitmap) {
                drbd_bm_cleanup(mdev);
        }
  
-       drbd_free_resources(mdev);
-       drbd_clear_flag(mdev, AL_SUSPENDED);
+       drbd_free_bc(mdev->ldev);
+       mdev->ldev = NULL;
+       clear_bit(AL_SUSPENDED, &mdev->flags);
  
-       /*
-        * currently we drbd_init_ee only on module load, so
-        * we may do drbd_release_ee only on module unload!
-        */
        D_ASSERT(list_empty(&mdev->active_ee));
        D_ASSERT(list_empty(&mdev->sync_ee));
        D_ASSERT(list_empty(&mdev->done_ee));
        D_ASSERT(list_empty(&mdev->read_ee));
        D_ASSERT(list_empty(&mdev->net_ee));
        D_ASSERT(list_empty(&mdev->resync_reads));
-       D_ASSERT(list_empty(&mdev->data.work.q));
-       D_ASSERT(list_empty(&mdev->meta.work.q));
+       D_ASSERT(list_empty(&mdev->tconn->sender_work.q));
        D_ASSERT(list_empty(&mdev->resync_work.list));
        D_ASSERT(list_empty(&mdev->unplug_work.list));
        D_ASSERT(list_empty(&mdev->go_diskless.list));
@@@ -3353,7 -2062,7 +2063,7 @@@ static int drbd_create_mempools(void
                goto Enomem;
  
        drbd_ee_cache = kmem_cache_create(
-               "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
+               "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL);
        if (drbd_ee_cache == NULL)
                goto Enomem;
  
                goto Enomem;
  
        /* mempools */
- #ifdef COMPAT_HAVE_BIOSET_CREATE
        drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
        if (drbd_md_io_bio_set == NULL)
                goto Enomem;
- #endif
  
        drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
        if (drbd_md_io_page_pool == NULL)
@@@ -3421,73 -2128,53 +2129,53 @@@ static struct notifier_block drbd_notif
        .notifier_call = drbd_notify_sys,
  };
  
- static void drbd_release_ee_lists(struct drbd_conf *mdev)
+ static void drbd_release_all_peer_reqs(struct drbd_conf *mdev)
  {
        int rr;
  
-       rr = drbd_release_ee(mdev, &mdev->active_ee);
+       rr = drbd_free_peer_reqs(mdev, &mdev->active_ee);
        if (rr)
                dev_err(DEV, "%d EEs in active list found!\n", rr);
  
-       rr = drbd_release_ee(mdev, &mdev->sync_ee);
+       rr = drbd_free_peer_reqs(mdev, &mdev->sync_ee);
        if (rr)
                dev_err(DEV, "%d EEs in sync list found!\n", rr);
  
-       rr = drbd_release_ee(mdev, &mdev->read_ee);
+       rr = drbd_free_peer_reqs(mdev, &mdev->read_ee);
        if (rr)
                dev_err(DEV, "%d EEs in read list found!\n", rr);
  
-       rr = drbd_release_ee(mdev, &mdev->done_ee);
+       rr = drbd_free_peer_reqs(mdev, &mdev->done_ee);
        if (rr)
                dev_err(DEV, "%d EEs in done list found!\n", rr);
  
-       rr = drbd_release_ee(mdev, &mdev->net_ee);
+       rr = drbd_free_peer_reqs(mdev, &mdev->net_ee);
        if (rr)
                dev_err(DEV, "%d EEs in net list found!\n", rr);
  }
  
- /* caution. no locking.
-  * currently only used from module cleanup code. */
- static void drbd_delete_device(unsigned int minor)
+ /* caution. no locking. */
+ void drbd_minor_destroy(struct kref *kref)
  {
-       struct drbd_conf *mdev = minor_to_mdev(minor);
-       if (!mdev)
-               return;
+       struct drbd_conf *mdev = container_of(kref, struct drbd_conf, kref);
+       struct drbd_tconn *tconn = mdev->tconn;
  
        del_timer_sync(&mdev->request_timer);
  
        /* paranoia asserts */
-       if (mdev->open_cnt != 0)
-               dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
-                               __FILE__ , __LINE__);
-       ERR_IF (!list_empty(&mdev->data.work.q)) {
-               struct list_head *lp;
-               list_for_each(lp, &mdev->data.work.q) {
-                       dev_err(DEV, "lp = %p\n", lp);
-               }
-       };
+       D_ASSERT(mdev->open_cnt == 0);
        /* end paranoia asserts */
  
-       del_gendisk(mdev->vdisk);
        /* cleanup stuff that may have been allocated during
         * device (re-)configuration or state changes */
  
        if (mdev->this_bdev)
                bdput(mdev->this_bdev);
  
-       drbd_free_resources(mdev);
-       drbd_release_ee_lists(mdev);
+       drbd_free_bc(mdev->ldev);
+       mdev->ldev = NULL;
  
-       /* should be freed on disconnect? */
-       kfree(mdev->ee_hash);
-       /*
-       mdev->ee_hash_s = 0;
-       mdev->ee_hash = NULL;
-       */
+       drbd_release_all_peer_reqs(mdev);
  
        lc_destroy(mdev->act_log);
        lc_destroy(mdev->resync);
        kfree(mdev->p_uuid);
        /* mdev->p_uuid = NULL; */
  
-       kfree(mdev->int_dig_out);
-       kfree(mdev->int_dig_in);
-       kfree(mdev->int_dig_vv);
+       if (mdev->bitmap) /* should no longer be there. */
+               drbd_bm_cleanup(mdev);
+       __free_page(mdev->md_io_page);
+       put_disk(mdev->vdisk);
+       blk_cleanup_queue(mdev->rq_queue);
+       kfree(mdev->rs_plan_s);
+       kfree(mdev);
+       kref_put(&tconn->kref, &conn_destroy);
+ }
+
+ /* One global retry thread, if we need to push back some bio and have it
+  * reinserted through our make request function.
+  */
+ static struct retry_worker {
+       struct workqueue_struct *wq;
+       struct work_struct worker;
+       spinlock_t lock;
+       struct list_head writes;
+ } retry;
+
+ static void do_retry(struct work_struct *ws)
+ {
+       struct retry_worker *retry = container_of(ws, struct retry_worker, worker);
+       LIST_HEAD(writes);
+       struct drbd_request *req, *tmp;
+
+       spin_lock_irq(&retry->lock);
+       list_splice_init(&retry->writes, &writes);
+       spin_unlock_irq(&retry->lock);
+
+       list_for_each_entry_safe(req, tmp, &writes, tl_requests) {
+               struct drbd_conf *mdev = req->w.mdev;
+               struct bio *bio = req->master_bio;
+               unsigned long start_time = req->start_time;
+               bool expected;
+
+               expected =
+                       expect(atomic_read(&req->completion_ref) == 0) &&
+                       expect(req->rq_state & RQ_POSTPONED) &&
+                       expect((req->rq_state & RQ_LOCAL_PENDING) == 0 ||
+                               (req->rq_state & RQ_LOCAL_ABORTED) != 0);
+               if (!expected)
+                       dev_err(DEV, "req=%p completion_ref=%d rq_state=%x\n",
+                               req, atomic_read(&req->completion_ref),
+                               req->rq_state);
+               /* We still need to put one kref associated with the
+                * "completion_ref" going zero in the code path that queued it
+                * here.  The request object may still be referenced by a
+                * frozen local req->private_bio, in case we force-detached.
+                */
+               kref_put(&req->kref, drbd_req_destroy);
+               /* A single suspended or otherwise blocking device may stall
+                * all others as well.  Fortunately, this code path is to
+                * recover from a situation that "should not happen":
+                * concurrent writes in multi-primary setup.
+                * In a "normal" lifecycle, this workqueue is supposed to be
+                * destroyed without ever doing anything.
+                * If it turns out to be an issue anyways, we can do per
+                * resource (replication group) or per device (minor) retry
+                * workqueues instead.
+                */
+               /* We are not just doing generic_make_request(),
+                * as we want to keep the start_time information. */
+               inc_ap_bio(mdev);
+               __drbd_make_request(mdev, bio, start_time);
+       }
+ }
+
+ void drbd_restart_request(struct drbd_request *req)
+ {
+       unsigned long flags;
+
+       spin_lock_irqsave(&retry.lock, flags);
+       list_move_tail(&req->tl_requests, &retry.writes);
+       spin_unlock_irqrestore(&retry.lock, flags);
+
+       /* Drop the extra reference that would otherwise
+        * have been dropped by complete_master_bio.
+        * do_retry() needs to grab a new one. */
+       dec_ap_bio(req->w.mdev);
  
-       /* cleanup the rest that has been
-        * allocated from drbd_new_device
-        * and actually free the mdev itself */
-       drbd_free_mdev(mdev);
+       queue_work(retry.wq, &retry.worker);
  }
  
  static void drbd_cleanup(void)
  {
        unsigned int i;
+       struct drbd_conf *mdev;
+       struct drbd_tconn *tconn, *tmp;
  
        unregister_reboot_notifier(&drbd_notifier);
  
        if (drbd_proc)
                remove_proc_entry("drbd", NULL);
  
-       drbd_nl_cleanup();
+       if (retry.wq)
+               destroy_workqueue(retry.wq);
  
-       if (minor_table) {
-               i = minor_count;
-               while (i--)
-                       drbd_delete_device(i);
-               drbd_destroy_mempools();
+       drbd_genl_unregister();
+       idr_for_each_entry(&minors, mdev, i) {
+               idr_remove(&minors, mdev_to_minor(mdev));
+               idr_remove(&mdev->tconn->volumes, mdev->vnr);
+               del_gendisk(mdev->vdisk);
+               /* synchronize_rcu(); No other threads running at this point */
+               kref_put(&mdev->kref, &drbd_minor_destroy);
        }
  
-       kfree(minor_table);
+       /* not _rcu, since there is no other updater anymore; genl is already unregistered */
+       list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) {
+               list_del(&tconn->all_tconn); /* not _rcu: no /proc, no other threads */
+               /* synchronize_rcu(); */
+               kref_put(&tconn->kref, &conn_destroy);
+       }
  
+       drbd_destroy_mempools();
        unregister_blkdev(DRBD_MAJOR, "drbd");
  
+       idr_destroy(&minors);
        printk(KERN_INFO "drbd: module cleanup done.\n");
  }
  
  /**
 - * drbd_congested() - Callback for pdflush
 + * drbd_congested() - Callback for the flusher thread
   * @congested_data:   User data
 - * @bdi_bits:         Bits pdflush is currently interested in
 + * @bdi_bits:         Bits the BDI flusher thread is currently interested in
   *
   * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
   */
@@@ -3559,7 -2340,7 +2341,7 @@@ static int drbd_congested(void *congest
                goto out;
        }
  
-       if (drbd_test_flag(mdev, CALLBACK_PENDING)) {
+       if (test_bit(CALLBACK_PENDING, &mdev->tconn->flags)) {
                r |= (1 << BDI_async_congested);
                /* Without good local data, we would need to read from remote,
                 * and that would need the worker thread as well, which is
                        reason = 'b';
        }
  
-       if (bdi_bits & (1 << BDI_async_congested) && drbd_test_flag(mdev, NET_CONGESTED)) {
+       if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->tconn->flags)) {
                r |= (1 << BDI_async_congested);
                reason = reason == 'b' ? 'a' : 'n';
        }
        return r;
  }
  
- struct drbd_conf *drbd_new_device(unsigned int minor)
+ static void drbd_init_workqueue(struct drbd_work_queue* wq)
+ {
+       spin_lock_init(&wq->q_lock);
+       INIT_LIST_HEAD(&wq->q);
+       init_waitqueue_head(&wq->q_wait);
+ }
+ struct drbd_tconn *conn_get_by_name(const char *name)
+ {
+       struct drbd_tconn *tconn;
+       if (!name || !name[0])
+               return NULL;
+       rcu_read_lock();
+       list_for_each_entry_rcu(tconn, &drbd_tconns, all_tconn) {
+               if (!strcmp(tconn->name, name)) {
+                       kref_get(&tconn->kref);
+                       goto found;
+               }
+       }
+       tconn = NULL;
+ found:
+       rcu_read_unlock();
+       return tconn;
+ }
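
/* Hedged usage sketch: conn_get_by_name() returns with the tconn's kref
 * held, so each successful lookup must be balanced by a kref_put() once
 * the connection is no longer needed ("r0" is an example resource name): */
struct drbd_tconn *tconn = conn_get_by_name("r0");
if (tconn) {
	/* ... use tconn ... */
	kref_put(&tconn->kref, &conn_destroy);
}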
+ struct drbd_tconn *conn_get_by_addrs(void *my_addr, int my_addr_len,
+                                    void *peer_addr, int peer_addr_len)
+ {
+       struct drbd_tconn *tconn;
+       rcu_read_lock();
+       list_for_each_entry_rcu(tconn, &drbd_tconns, all_tconn) {
+               if (tconn->my_addr_len == my_addr_len &&
+                   tconn->peer_addr_len == peer_addr_len &&
+                   !memcmp(&tconn->my_addr, my_addr, my_addr_len) &&
+                   !memcmp(&tconn->peer_addr, peer_addr, peer_addr_len)) {
+                       kref_get(&tconn->kref);
+                       goto found;
+               }
+       }
+       tconn = NULL;
+ found:
+       rcu_read_unlock();
+       return tconn;
+ }
+ static int drbd_alloc_socket(struct drbd_socket *socket)
+ {
+       socket->rbuf = (void *) __get_free_page(GFP_KERNEL);
+       if (!socket->rbuf)
+               return -ENOMEM;
+       socket->sbuf = (void *) __get_free_page(GFP_KERNEL);
+       if (!socket->sbuf)
+               return -ENOMEM;
+       return 0;
+ }
+ static void drbd_free_socket(struct drbd_socket *socket)
+ {
+       free_page((unsigned long) socket->sbuf);
+       free_page((unsigned long) socket->rbuf);
+ }
+ void conn_free_crypto(struct drbd_tconn *tconn)
+ {
+       drbd_free_sock(tconn);
+       crypto_free_hash(tconn->csums_tfm);
+       crypto_free_hash(tconn->verify_tfm);
+       crypto_free_hash(tconn->cram_hmac_tfm);
+       crypto_free_hash(tconn->integrity_tfm);
+       crypto_free_hash(tconn->peer_integrity_tfm);
+       kfree(tconn->int_dig_in);
+       kfree(tconn->int_dig_vv);
+       tconn->csums_tfm = NULL;
+       tconn->verify_tfm = NULL;
+       tconn->cram_hmac_tfm = NULL;
+       tconn->integrity_tfm = NULL;
+       tconn->peer_integrity_tfm = NULL;
+       tconn->int_dig_in = NULL;
+       tconn->int_dig_vv = NULL;
+ }
+ int set_resource_options(struct drbd_tconn *tconn, struct res_opts *res_opts)
+ {
+       cpumask_var_t new_cpu_mask;
+       int err;
+       if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL))
+               return -ENOMEM;
+               /*
+               retcode = ERR_NOMEM;
+               drbd_msg_put_info("unable to allocate cpumask");
+               */
+       /* silently ignore cpu mask on UP kernel */
+       if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) {
+               /* FIXME: Get rid of constant 32 here */
+               err = bitmap_parse(res_opts->cpu_mask, 32,
+                                  cpumask_bits(new_cpu_mask), nr_cpu_ids);
+               if (err) {
+                       conn_warn(tconn, "bitmap_parse() failed with %d\n", err);
+                       /* retcode = ERR_CPU_MASK_PARSE; */
+                       goto fail;
+               }
+       }
+       tconn->res_opts = *res_opts;
+       if (!cpumask_equal(tconn->cpu_mask, new_cpu_mask)) {
+               cpumask_copy(tconn->cpu_mask, new_cpu_mask);
+               drbd_calc_cpu_mask(tconn);
+               tconn->receiver.reset_cpu_mask = 1;
+               tconn->asender.reset_cpu_mask = 1;
+               tconn->worker.reset_cpu_mask = 1;
+       }
+       err = 0;
+ fail:
+       free_cpumask_var(new_cpu_mask);
+       return err;
+ }
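
/* Isolated sketch of the cpu_mask parsing above, using the usual
 * <linux/cpumask.h>/<linux/bitmap.h> interfaces: the mask string is hex,
 * LSB = CPU 0, so "f" selects CPUs 0-3 (clamped to nr_cpu_ids). */
cpumask_var_t mask;
if (zalloc_cpumask_var(&mask, GFP_KERNEL)) {
	if (bitmap_parse("f", 1, cpumask_bits(mask), nr_cpu_ids) == 0)
		; /* mask now holds CPUs 0-3, where present */
	free_cpumask_var(mask);
}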
+ /* caller must be under genl_lock() */
+ struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts)
+ {
+       struct drbd_tconn *tconn;
+       tconn = kzalloc(sizeof(struct drbd_tconn), GFP_KERNEL);
+       if (!tconn)
+               return NULL;
+       tconn->name = kstrdup(name, GFP_KERNEL);
+       if (!tconn->name)
+               goto fail;
+       if (drbd_alloc_socket(&tconn->data))
+               goto fail;
+       if (drbd_alloc_socket(&tconn->meta))
+               goto fail;
+       if (!zalloc_cpumask_var(&tconn->cpu_mask, GFP_KERNEL))
+               goto fail;
+       if (set_resource_options(tconn, res_opts))
+               goto fail;
+       tconn->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
+       if (!tconn->current_epoch)
+               goto fail;
+       INIT_LIST_HEAD(&tconn->transfer_log);
+       INIT_LIST_HEAD(&tconn->current_epoch->list);
+       tconn->epochs = 1;
+       spin_lock_init(&tconn->epoch_lock);
+       tconn->write_ordering = WO_bdev_flush;
+       tconn->send.seen_any_write_yet = false;
+       tconn->send.current_epoch_nr = 0;
+       tconn->send.current_epoch_writes = 0;
+       tconn->cstate = C_STANDALONE;
+       mutex_init(&tconn->cstate_mutex);
+       spin_lock_init(&tconn->req_lock);
+       mutex_init(&tconn->conf_update);
+       init_waitqueue_head(&tconn->ping_wait);
+       idr_init(&tconn->volumes);
+       drbd_init_workqueue(&tconn->sender_work);
+       mutex_init(&tconn->data.mutex);
+       mutex_init(&tconn->meta.mutex);
+       drbd_thread_init(tconn, &tconn->receiver, drbdd_init, "receiver");
+       drbd_thread_init(tconn, &tconn->worker, drbd_worker, "worker");
+       drbd_thread_init(tconn, &tconn->asender, drbd_asender, "asender");
+       kref_init(&tconn->kref);
+       list_add_tail_rcu(&tconn->all_tconn, &drbd_tconns);
+       return tconn;
+ fail:
+       kfree(tconn->current_epoch);
+       free_cpumask_var(tconn->cpu_mask);
+       drbd_free_socket(&tconn->meta);
+       drbd_free_socket(&tconn->data);
+       kfree(tconn->name);
+       kfree(tconn);
+       return NULL;
+ }
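
/* Hedged caller sketch: conn_create() returns with one reference held
 * (kref_init) and the tconn already on drbd_tconns; the caller must hold
 * genl_lock() and eventually drop its reference.  "r0" and res_opts are
 * placeholders assumed to be in scope. */
struct drbd_tconn *tconn = conn_create("r0", &res_opts);
if (tconn) {
	/* ... configure / use the new connection ... */
	kref_put(&tconn->kref, &conn_destroy);	/* when done with it */
}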
+ void conn_destroy(struct kref *kref)
+ {
+       struct drbd_tconn *tconn = container_of(kref, struct drbd_tconn, kref);
+       if (atomic_read(&tconn->current_epoch->epoch_size) != 0)
+               conn_err(tconn, "epoch_size:%d\n", atomic_read(&tconn->current_epoch->epoch_size));
+       kfree(tconn->current_epoch);
+       idr_destroy(&tconn->volumes);
+       free_cpumask_var(tconn->cpu_mask);
+       drbd_free_socket(&tconn->meta);
+       drbd_free_socket(&tconn->data);
+       kfree(tconn->name);
+       kfree(tconn->int_dig_in);
+       kfree(tconn->int_dig_vv);
+       kfree(tconn);
+ }
+ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr)
  {
        struct drbd_conf *mdev;
        struct gendisk *disk;
        struct request_queue *q;
+       int vnr_got = vnr;
+       int minor_got = minor;
+       enum drbd_ret_code err = ERR_NOMEM;
+       mdev = minor_to_mdev(minor);
+       if (mdev)
+               return ERR_MINOR_EXISTS;
  
        /* GFP_KERNEL, we are outside of all write-out paths */
        mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
        if (!mdev)
-               return NULL;
-       if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
-               goto out_no_cpumask;
+               return ERR_NOMEM;
+       kref_get(&tconn->kref);
+       mdev->tconn = tconn;
  
        mdev->minor = minor;
+       mdev->vnr = vnr;
  
        drbd_init_set_defaults(mdev);
  
        blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
        blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
        blk_queue_merge_bvec(q, drbd_merge_bvec);
-       q->queue_lock = &mdev->req_lock;
+       q->queue_lock = &mdev->tconn->req_lock; /* needed since we use */
  
        mdev->md_io_page = alloc_page(GFP_KERNEL);
        if (!mdev->md_io_page)
  
        if (drbd_bm_init(mdev))
                goto out_no_bitmap;
-       /* no need to lock access, we are still initializing this minor device. */
-       if (!tl_init(mdev))
-               goto out_no_tl;
-       mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
-       if (!mdev->app_reads_hash)
-               goto out_no_app_reads;
-       mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
-       if (!mdev->current_epoch)
-               goto out_no_epoch;
-       INIT_LIST_HEAD(&mdev->current_epoch->list);
-       mdev->epochs = 1;
-       return mdev;
- /* out_whatever_else:
-       kfree(mdev->current_epoch); */
- out_no_epoch:
-       kfree(mdev->app_reads_hash);
- out_no_app_reads:
-       tl_cleanup(mdev);
- out_no_tl:
+       mdev->read_requests = RB_ROOT;
+       mdev->write_requests = RB_ROOT;
+       if (!idr_pre_get(&minors, GFP_KERNEL))
+               goto out_no_minor_idr;
+       if (idr_get_new_above(&minors, mdev, minor, &minor_got))
+               goto out_no_minor_idr;
+       if (minor_got != minor) {
+               err = ERR_MINOR_EXISTS;
+               drbd_msg_put_info("requested minor exists already");
+               goto out_idr_remove_minor;
+       }
+       if (!idr_pre_get(&tconn->volumes, GFP_KERNEL))
+               goto out_idr_remove_minor;
+       if (idr_get_new_above(&tconn->volumes, mdev, vnr, &vnr_got))
+               goto out_idr_remove_minor;
+       if (vnr_got != vnr) {
+               err = ERR_INVALID_REQUEST;
+               drbd_msg_put_info("requested volume exists already");
+               goto out_idr_remove_vol;
+       }
+       add_disk(disk);
+       kref_init(&mdev->kref); /* one ref for both idrs and the add_disk */
+       /* inherit the connection state */
+       mdev->state.conn = tconn->cstate;
+       if (mdev->state.conn == C_WF_REPORT_PARAMS)
+               drbd_connected(mdev);
+       return NO_ERROR;
+ out_idr_remove_vol:
+       idr_remove(&tconn->volumes, vnr_got);
+ out_idr_remove_minor:
+       idr_remove(&minors, minor_got);
+       synchronize_rcu();
+ out_no_minor_idr:
        drbd_bm_cleanup(mdev);
  out_no_bitmap:
        __free_page(mdev->md_io_page);
@@@ -3684,55 -2702,25 +2703,25 @@@ out_no_io_page
  out_no_disk:
        blk_cleanup_queue(q);
  out_no_q:
-       free_cpumask_var(mdev->cpu_mask);
- out_no_cpumask:
-       kfree(mdev);
-       return NULL;
- }
- /* counterpart of drbd_new_device.
-  * last part of drbd_delete_device. */
- void drbd_free_mdev(struct drbd_conf *mdev)
- {
-       kfree(mdev->current_epoch);
-       kfree(mdev->app_reads_hash);
-       tl_cleanup(mdev);
-       if (mdev->bitmap) /* should no longer be there. */
-               drbd_bm_cleanup(mdev);
-       __free_page(mdev->md_io_page);
-       put_disk(mdev->vdisk);
-       blk_cleanup_queue(mdev->rq_queue);
-       free_cpumask_var(mdev->cpu_mask);
-       drbd_free_tl_hash(mdev);
        kfree(mdev);
+       kref_put(&tconn->kref, &conn_destroy);
+       return err;
  }
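
/* The pre-3.9 IDR API used above preallocates with idr_pre_get() and
 * inserts with idr_get_new_above().  Its canonical form retries on
 * -EAGAIN; conn_new_minor() takes the simpler route of treating any
 * failure as ENOMEM.  Sketch only, reusing the names from above: */
int id, err;
do {
	if (!idr_pre_get(&minors, GFP_KERNEL))
		return -ENOMEM;
	err = idr_get_new_above(&minors, mdev, minor, &id);
} while (err == -EAGAIN);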
  
  int __init drbd_init(void)
  {
        int err;
  
-       if (sizeof(struct p_handshake) != 80) {
-               printk(KERN_ERR
-                      "drbd: never change the size or layout "
-                      "of the HandShake packet.\n");
-               return -EINVAL;
-       }
        if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
                printk(KERN_ERR
-                       "drbd: invalid minor_count (%d)\n", minor_count);
+                      "drbd: invalid minor_count (%d)\n", minor_count);
  #ifdef MODULE
                return -EINVAL;
  #else
-               minor_count = 8;
+               minor_count = DRBD_MINOR_COUNT_DEF;
  #endif
        }
  
-       err = drbd_nl_init();
-       if (err)
-               return err;
        err = register_blkdev(DRBD_MAJOR, "drbd");
        if (err) {
                printk(KERN_ERR
                return err;
        }
  
+       err = drbd_genl_register();
+       if (err) {
+               printk(KERN_ERR "drbd: unable to register generic netlink family\n");
+               goto fail;
+       }
        register_reboot_notifier(&drbd_notifier);
  
        /*
        init_waitqueue_head(&drbd_pp_wait);
  
        drbd_proc = NULL; /* play safe for drbd_cleanup */
-       minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
-                               GFP_KERNEL);
-       if (!minor_table)
-               goto Enomem;
+       idr_init(&minors);
  
        err = drbd_create_mempools();
        if (err)
-               goto Enomem;
+               goto fail;
  
        drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
        if (!drbd_proc) {
                printk(KERN_ERR "drbd: unable to register proc file\n");
-               goto Enomem;
+               goto fail;
        }
  
        rwlock_init(&global_state_lock);
+       INIT_LIST_HEAD(&drbd_tconns);
+       retry.wq = create_singlethread_workqueue("drbd-reissue");
+       if (!retry.wq) {
+               printk(KERN_ERR "drbd: unable to create retry workqueue\n");
+               goto fail;
+       }
+       INIT_WORK(&retry.worker, do_retry);
+       spin_lock_init(&retry.lock);
+       INIT_LIST_HEAD(&retry.writes);
  
        printk(KERN_INFO "drbd: initialized. "
               "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
        printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
        printk(KERN_INFO "drbd: registered as block device major %d\n",
                DRBD_MAJOR);
-       printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
  
        return 0; /* Success! */
  
-Enomem:
+fail:
        drbd_cleanup();
        if (err == -ENOMEM)
                /* currently always the case */
@@@ -3799,47 -2800,42 +2801,42 @@@ void drbd_free_bc(struct drbd_backing_d
        kfree(ldev);
  }
  
- void drbd_free_sock(struct drbd_conf *mdev)
+ void drbd_free_sock(struct drbd_tconn *tconn)
  {
-       if (mdev->data.socket) {
-               mutex_lock(&mdev->data.mutex);
-               kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
-               sock_release(mdev->data.socket);
-               mdev->data.socket = NULL;
-               mutex_unlock(&mdev->data.mutex);
+       if (tconn->data.socket) {
+               mutex_lock(&tconn->data.mutex);
+               kernel_sock_shutdown(tconn->data.socket, SHUT_RDWR);
+               sock_release(tconn->data.socket);
+               tconn->data.socket = NULL;
+               mutex_unlock(&tconn->data.mutex);
        }
-       if (mdev->meta.socket) {
-               mutex_lock(&mdev->meta.mutex);
-               kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
-               sock_release(mdev->meta.socket);
-               mdev->meta.socket = NULL;
-               mutex_unlock(&mdev->meta.mutex);
+       if (tconn->meta.socket) {
+               mutex_lock(&tconn->meta.mutex);
+               kernel_sock_shutdown(tconn->meta.socket, SHUT_RDWR);
+               sock_release(tconn->meta.socket);
+               tconn->meta.socket = NULL;
+               mutex_unlock(&tconn->meta.mutex);
        }
  }
  
+ /* meta data management */
  
- void drbd_free_resources(struct drbd_conf *mdev)
+ void conn_md_sync(struct drbd_tconn *tconn)
  {
-       crypto_free_hash(mdev->csums_tfm);
-       mdev->csums_tfm = NULL;
-       crypto_free_hash(mdev->verify_tfm);
-       mdev->verify_tfm = NULL;
-       crypto_free_hash(mdev->cram_hmac_tfm);
-       mdev->cram_hmac_tfm = NULL;
-       crypto_free_hash(mdev->integrity_w_tfm);
-       mdev->integrity_w_tfm = NULL;
-       crypto_free_hash(mdev->integrity_r_tfm);
-       mdev->integrity_r_tfm = NULL;
-       drbd_free_sock(mdev);
+       struct drbd_conf *mdev;
+       int vnr;
  
-       __no_warn(local,
-                 drbd_free_bc(mdev->ldev);
-                 mdev->ldev = NULL;);
+       rcu_read_lock();
+       idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+               kref_get(&mdev->kref);
+               rcu_read_unlock();
+               drbd_md_sync(mdev);
+               kref_put(&mdev->kref, &drbd_minor_destroy);
+               rcu_read_lock();
+       }
+       rcu_read_unlock();
  }
  
- /* meta data management */
  struct meta_data_on_disk {
        u64 la_size;           /* last agreed size. */
        u64 uuid[UI_SIZE];   /* UUIDs. */
        u32 md_size_sect;
        u32 al_offset;         /* offset to this block */
        u32 al_nr_extents;     /* important for restoring the AL */
-             /* `-- act_log->nr_elements <-- sync_conf.al_extents */
+             /* `-- act_log->nr_elements <-- ldev->dc.al_extents */
        u32 bm_offset;         /* offset to the bitmap, from here */
        u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
        u32 la_peer_max_bio_size;   /* last peer max_bio_size */
@@@ -3870,7 -2866,7 +2867,7 @@@ void drbd_md_sync(struct drbd_conf *mde
  
        del_timer(&mdev->md_sync_timer);
        /* timer may be rearmed by drbd_md_mark_dirty() now. */
-       if (!drbd_test_and_clear_flag(mdev, MD_DIRTY))
+       if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
                return;
  
        /* We use here D_FAILED and not D_ATTACHING because we try to write
        for (i = UI_CURRENT; i < UI_SIZE; i++)
                buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
        buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
-       buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
+       buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN);
  
        buffer->md_size_sect  = cpu_to_be32(mdev->ldev->md.md_size_sect);
        buffer->al_offset     = cpu_to_be32(mdev->ldev->md.al_offset);
        D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
        sector = mdev->ldev->md.md_offset;
  
-       if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
+       if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
                /* this was a try anyway ... */
                dev_err(DEV, "meta data update failed!\n");
                drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
   * @bdev:     Device from which the meta data should be read in.
   *
   * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
-  * something goes wrong.  Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
+  * something goes wrong.
   */
  int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
  {
        struct meta_data_on_disk *buffer;
+       u32 magic, flags;
        int i, rv = NO_ERROR;
  
        if (!get_ldev_if_state(mdev, D_ATTACHING))
        if (!buffer)
                goto out;
  
-       if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
+       if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
                /* NOTE: can't do normal error processing here as this is
                   called BEFORE disk is attached */
                dev_err(DEV, "Error while reading metadata.\n");
                goto err;
        }
  
-       if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
-               dev_err(DEV, "Error while reading metadata, magic not found.\n");
+       magic = be32_to_cpu(buffer->magic);
+       flags = be32_to_cpu(buffer->flags);
+       if (magic == DRBD_MD_MAGIC_84_UNCLEAN ||
+           (magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) {
+               /* btw: that's Activity Log clean, not "all" clean. */
+               dev_err(DEV, "Found unclean meta data. Did you \"drbdadm apply-al\"?\n");
+               rv = ERR_MD_UNCLEAN;
+               goto err;
+       }
+       if (magic != DRBD_MD_MAGIC_08) {
+               if (magic == DRBD_MD_MAGIC_07)
+                       dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n");
+               else
+                       dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n");
                rv = ERR_MD_INVALID;
                goto err;
        }
        for (i = UI_CURRENT; i < UI_SIZE; i++)
                bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
        bdev->md.flags = be32_to_cpu(buffer->flags);
-       mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
        bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
  
-       spin_lock_irq(&mdev->req_lock);
+       spin_lock_irq(&mdev->tconn->req_lock);
        if (mdev->state.conn < C_CONNECTED) {
 -              int peer;
 +              unsigned int peer;
                peer = be32_to_cpu(buffer->la_peer_max_bio_size);
 -              peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
 +              peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
                mdev->peer_max_bio_size = peer;
        }
-       spin_unlock_irq(&mdev->req_lock);
-       if (mdev->sync_conf.al_extents < 7)
-               mdev->sync_conf.al_extents = 127;
+       spin_unlock_irq(&mdev->tconn->req_lock);
  
   err:
        drbd_md_put_buffer(mdev);
  #ifdef DEBUG
  void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
  {
-       if (!drbd_test_and_set_flag(mdev, MD_DIRTY)) {
+       if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
                mod_timer(&mdev->md_sync_timer, jiffies + HZ);
                mdev->last_md_mark_dirty.line = line;
                mdev->last_md_mark_dirty.func = func;
  #else
  void drbd_md_mark_dirty(struct drbd_conf *mdev)
  {
-       if (!drbd_test_and_set_flag(mdev, MD_DIRTY))
+       if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
                mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
  }
  #endif
@@@ -4171,9 -3176,10 +3177,10 @@@ int drbd_bmio_clear_n_write(struct drbd
        return rv;
  }
  
- static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+ static int w_bitmap_io(struct drbd_work *w, int unused)
  {
        struct bm_io_work *work = container_of(w, struct bm_io_work, w);
+       struct drbd_conf *mdev = w->mdev;
        int rv = -EIO;
  
        D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
                put_ldev(mdev);
        }
  
-       drbd_clear_flag(mdev, BITMAP_IO);
-       smp_mb__after_clear_bit();
+       clear_bit_unlock(BITMAP_IO, &mdev->flags);
        wake_up(&mdev->misc_wait);
  
        if (work->done)
                work->done(mdev, rv);
  
-       drbd_clear_flag(mdev, BITMAP_IO_QUEUED);
+       clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
        work->why = NULL;
        work->flags = 0;
  
-       return 1;
+       return 0;
  }
  
  void drbd_ldev_destroy(struct drbd_conf *mdev)
                drbd_free_bc(mdev->ldev);
                mdev->ldev = NULL;);
  
-       if (mdev->md_io_tmpp) {
-               __free_page(mdev->md_io_tmpp);
-               mdev->md_io_tmpp = NULL;
-       }
-       drbd_clear_flag(mdev, GO_DISKLESS);
+       clear_bit(GO_DISKLESS, &mdev->flags);
  }
  
- static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+ static int w_go_diskless(struct drbd_work *w, int unused)
  {
+       struct drbd_conf *mdev = w->mdev;
        D_ASSERT(mdev->state.disk == D_FAILED);
        /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
         * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
         * (Do we want a specific meta data flag for this?)
         *
         * If that does not make it to stable storage either,
-        * we cannot do anything about that anymore.  */
-       if (mdev->bitmap) {
+        * we cannot do anything about that anymore.
+        *
+        * We still need to check that both bitmap and ldev are present, as we
+        * may end up here after a failed attach, before ldev was even assigned.
+        */
+       if (mdev->bitmap && mdev->ldev) {
                if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write,
                                        "detach", BM_LOCKED_MASK)) {
-                       if (drbd_test_flag(mdev, WAS_READ_ERROR)) {
+                       if (test_bit(WAS_READ_ERROR, &mdev->flags)) {
                                drbd_md_set_flag(mdev, MDF_FULL_SYNC);
                                drbd_md_sync(mdev);
                        }
        }
  
        drbd_force_state(mdev, NS(disk, D_DISKLESS));
-       return 1;
+       return 0;
  }
  
  void drbd_go_diskless(struct drbd_conf *mdev)
  {
        D_ASSERT(mdev->state.disk == D_FAILED);
-       if (!drbd_test_and_set_flag(mdev, GO_DISKLESS))
-               drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
+       if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
+               drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless);
  }
  
  /**
@@@ -4271,10 -3278,10 +3279,10 @@@ void drbd_queue_bitmap_io(struct drbd_c
                          void (*done)(struct drbd_conf *, int),
                          char *why, enum bm_flag flags)
  {
-       D_ASSERT(current == mdev->worker.task);
+       D_ASSERT(current == mdev->tconn->worker.task);
  
-       D_ASSERT(!drbd_test_flag(mdev, BITMAP_IO_QUEUED));
-       D_ASSERT(!drbd_test_flag(mdev, BITMAP_IO));
+       D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
+       D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
        D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
        if (mdev->bm_io_work.why)
                dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
        mdev->bm_io_work.why = why;
        mdev->bm_io_work.flags = flags;
  
-       spin_lock_irq(&mdev->req_lock);
-       drbd_set_flag(mdev, BITMAP_IO);
+       spin_lock_irq(&mdev->tconn->req_lock);
+       set_bit(BITMAP_IO, &mdev->flags);
        if (atomic_read(&mdev->ap_bio_cnt) == 0) {
-               if (!drbd_test_and_set_flag(mdev, BITMAP_IO_QUEUED))
-                       drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
+               if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
+                       drbd_queue_work(&mdev->tconn->sender_work, &mdev->bm_io_work.w);
        }
-       spin_unlock_irq(&mdev->req_lock);
+       spin_unlock_irq(&mdev->tconn->req_lock);
  }
  
  /**
@@@ -4308,7 -3315,7 +3316,7 @@@ int drbd_bitmap_io(struct drbd_conf *md
  {
        int rv;
  
-       D_ASSERT(current != mdev->worker.task);
+       D_ASSERT(current != mdev->tconn->worker.task);
  
        if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
                drbd_suspend_io(mdev);
@@@ -4347,18 -3354,127 +3355,127 @@@ static void md_sync_timer_fn(unsigned l
  {
        struct drbd_conf *mdev = (struct drbd_conf *) data;
  
-       drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
+       /* must not double-queue! */
+       if (list_empty(&mdev->md_sync_work.list))
+               drbd_queue_work_front(&mdev->tconn->sender_work, &mdev->md_sync_work);
  }
  
- static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+ static int w_md_sync(struct drbd_work *w, int unused)
  {
+       struct drbd_conf *mdev = w->mdev;
        dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
  #ifdef DEBUG
        dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
                mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
  #endif
        drbd_md_sync(mdev);
-       return 1;
+       return 0;
+ }
+ const char *cmdname(enum drbd_packet cmd)
+ {
+       /* THINK may need to become several global tables
+        * when we want to support more than
+        * one PRO_VERSION */
+       static const char *cmdnames[] = {
+               [P_DATA]                = "Data",
+               [P_DATA_REPLY]          = "DataReply",
+               [P_RS_DATA_REPLY]       = "RSDataReply",
+               [P_BARRIER]             = "Barrier",
+               [P_BITMAP]              = "ReportBitMap",
+               [P_BECOME_SYNC_TARGET]  = "BecomeSyncTarget",
+               [P_BECOME_SYNC_SOURCE]  = "BecomeSyncSource",
+               [P_UNPLUG_REMOTE]       = "UnplugRemote",
+               [P_DATA_REQUEST]        = "DataRequest",
+               [P_RS_DATA_REQUEST]     = "RSDataRequest",
+               [P_SYNC_PARAM]          = "SyncParam",
+               [P_SYNC_PARAM89]        = "SyncParam89",
+               [P_PROTOCOL]            = "ReportProtocol",
+               [P_UUIDS]               = "ReportUUIDs",
+               [P_SIZES]               = "ReportSizes",
+               [P_STATE]               = "ReportState",
+               [P_SYNC_UUID]           = "ReportSyncUUID",
+               [P_AUTH_CHALLENGE]      = "AuthChallenge",
+               [P_AUTH_RESPONSE]       = "AuthResponse",
+               [P_PING]                = "Ping",
+               [P_PING_ACK]            = "PingAck",
+               [P_RECV_ACK]            = "RecvAck",
+               [P_WRITE_ACK]           = "WriteAck",
+               [P_RS_WRITE_ACK]        = "RSWriteAck",
+               [P_SUPERSEDED]          = "Superseded",
+               [P_NEG_ACK]             = "NegAck",
+               [P_NEG_DREPLY]          = "NegDReply",
+               [P_NEG_RS_DREPLY]       = "NegRSDReply",
+               [P_BARRIER_ACK]         = "BarrierAck",
+               [P_STATE_CHG_REQ]       = "StateChgRequest",
+               [P_STATE_CHG_REPLY]     = "StateChgReply",
+               [P_OV_REQUEST]          = "OVRequest",
+               [P_OV_REPLY]            = "OVReply",
+               [P_OV_RESULT]           = "OVResult",
+               [P_CSUM_RS_REQUEST]     = "CsumRSRequest",
+               [P_RS_IS_IN_SYNC]       = "CsumRSIsInSync",
+               [P_COMPRESSED_BITMAP]   = "CBitmap",
+               [P_DELAY_PROBE]         = "DelayProbe",
+               [P_OUT_OF_SYNC]         = "OutOfSync",
+               [P_RETRY_WRITE]         = "RetryWrite",
+               [P_RS_CANCEL]           = "RSCancel",
+               [P_CONN_ST_CHG_REQ]     = "conn_st_chg_req",
+               [P_CONN_ST_CHG_REPLY]   = "conn_st_chg_reply",
+               [P_PROTOCOL_UPDATE]     = "protocol_update",
+               /* enum drbd_packet, but not commands - obsoleted flags:
+                *      P_MAY_IGNORE
+                *      P_MAX_OPT_CMD
+                */
+       };
+       /* too big for the array: 0xfffX */
+       if (cmd == P_INITIAL_META)
+               return "InitialMeta";
+       if (cmd == P_INITIAL_DATA)
+               return "InitialData";
+       if (cmd == P_CONNECTION_FEATURES)
+               return "ConnectionFeatures";
+       if (cmd >= ARRAY_SIZE(cmdnames))
+               return "Unknown";
+       return cmdnames[cmd];
+ }
+ /**
+  * drbd_wait_misc  -  wait for a request to make progress
+  * @mdev:     device associated with the request
+  * @i:                the struct drbd_interval embedded in struct drbd_request or
+  *            struct drbd_peer_request
+  */
+ int drbd_wait_misc(struct drbd_conf *mdev, struct drbd_interval *i)
+ {
+       struct net_conf *nc;
+       DEFINE_WAIT(wait);
+       long timeout;
+       rcu_read_lock();
+       nc = rcu_dereference(mdev->tconn->net_conf);
+       if (!nc) {
+               rcu_read_unlock();
+               return -ETIMEDOUT;
+       }
+       timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT;
+       rcu_read_unlock();
+       /* Indicate to wake up mdev->misc_wait on progress.  */
+       i->waiting = true;
+       prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE);
+       spin_unlock_irq(&mdev->tconn->req_lock);
+       timeout = schedule_timeout(timeout);
+       finish_wait(&mdev->misc_wait, &wait);
+       spin_lock_irq(&mdev->tconn->req_lock);
+       if (!timeout || mdev->state.conn < C_CONNECTED)
+               return -ETIMEDOUT;
+       if (signal_pending(current))
+               return -ERESTARTSYS;
+       return 0;
  }
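
/* Hedged caller sketch: drbd_wait_misc() expects tconn->req_lock to be
 * held and drops/re-acquires it around the sleep, so callers typically
 * loop until their interval no longer conflicts.  "conflicts()" is a
 * made-up predicate standing in for the caller's actual condition. */
int err = 0;
spin_lock_irq(&mdev->tconn->req_lock);
while (conflicts(mdev, i)) {
	err = drbd_wait_misc(mdev, i);
	if (err)
		break;
}
spin_unlock_irq(&mdev->tconn->req_lock);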
  
  #ifdef CONFIG_DRBD_FAULT_INJECTION
@@@ -4439,11 -3555,12 +3556,11 @@@ const char *drbd_buildtag(void
        static char buildtag[38] = "\0uilt-in";
  
        if (buildtag[0] == 0) {
 -#ifdef CONFIG_MODULES
 -              if (THIS_MODULE != NULL)
 -                      sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
 -              else
 +#ifdef MODULE
 +              sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
 +#else
 +              buildtag[0] = 'b';
  #endif
 -                      buildtag[0] = 'b';
        }
  
        return buildtag;
index c8dda4e8dfce0303d787bccad29fbe193bf8e43f,d339a2754a8581c811271d95ea14b196d436d907..76bb3a684b86e5c38bec24db5679b5b2760ae83b
  #include <linux/fs.h>
  #include <linux/file.h>
  #include <linux/slab.h>
- #include <linux/connector.h>
  #include <linux/blkpg.h>
  #include <linux/cpumask.h>
  #include "drbd_int.h"
  #include "drbd_req.h"
  #include "drbd_wrappers.h"
  #include <asm/unaligned.h>
- #include <linux/drbd_tag_magic.h>
  #include <linux/drbd_limits.h>
- #include <linux/compiler.h>
  #include <linux/kthread.h>
  
- static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
- static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
- static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *);
- /* see get_sb_bdev and bd_claim */
+ #include <net/genetlink.h>
+ /* .doit */
+ // int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info);
+ // int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
+ /* .dumpit */
+ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
+ #include <linux/drbd_genl_api.h>
+ #include "drbd_nla.h"
+ #include <linux/genl_magic_func.h>
+ /* used blkdev_get_by_path, to claim our meta data device(s) */
  static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
  
- /* Generate the tag_list to struct functions */
- #define NL_PACKET(name, number, fields) \
- static int name ## _from_tags(struct drbd_conf *mdev, \
-       unsigned short *tags, struct name *arg) __attribute__ ((unused)); \
- static int name ## _from_tags(struct drbd_conf *mdev, \
-       unsigned short *tags, struct name *arg) \
- { \
-       int tag; \
-       int dlen; \
-       \
-       while ((tag = get_unaligned(tags++)) != TT_END) {       \
-               dlen = get_unaligned(tags++);                   \
-               switch (tag_number(tag)) { \
-               fields \
-               default: \
-                       if (tag & T_MANDATORY) { \
-                               dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \
-                               return 0; \
-                       } \
-               } \
-               tags = (unsigned short *)((char *)tags + dlen); \
-       } \
-       return 1; \
- }
- #define NL_INTEGER(pn, pr, member) \
-       case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \
-               arg->member = get_unaligned((int *)(tags));     \
-               break;
- #define NL_INT64(pn, pr, member) \
-       case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \
-               arg->member = get_unaligned((u64 *)(tags));     \
+ /* Configuration is strictly serialized, because generic netlink message
+  * processing is strictly serialized by the genl_lock().
+  * Which means we can use one static global drbd_config_context struct.
+  */
+ static struct drbd_config_context {
+       /* assigned from drbd_genlmsghdr */
+       unsigned int minor;
+       /* assigned from request attributes, if present */
+       unsigned int volume;
+ #define VOLUME_UNSPECIFIED            (-1U)
+       /* pointer into the request skb,
+        * limited lifetime! */
+       char *resource_name;
+       struct nlattr *my_addr;
+       struct nlattr *peer_addr;
+       /* reply buffer */
+       struct sk_buff *reply_skb;
+       /* pointer into reply buffer */
+       struct drbd_genlmsghdr *reply_dh;
+       /* resolved from attributes, if possible */
+       struct drbd_conf *mdev;
+       struct drbd_tconn *tconn;
+ } adm_ctx;
+ static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
+ {
+       genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
+       if (genlmsg_reply(skb, info))
+               printk(KERN_ERR "drbd: error sending genl reply\n");
+ }
+ /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: the only
+  * way it could fail is lack of space in the skb, and 4k are available. */
+ int drbd_msg_put_info(const char *info)
+ {
+       struct sk_buff *skb = adm_ctx.reply_skb;
+       struct nlattr *nla;
+       int err = -EMSGSIZE;
+       if (!info || !info[0])
+               return 0;
+       nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY);
+       if (!nla)
+               return err;
+       err = nla_put_string(skb, T_info_text, info);
+       if (err) {
+               nla_nest_cancel(skb, nla);
+               return err;
+       } else
+               nla_nest_end(skb, nla);
+       return 0;
+ }
+ /* This would be a good candidate for a "pre_doit" hook,
+  * and per-family private info->pointers.
+  * But we need to stay compatible with older kernels.
+  * If it returns successfully, adm_ctx members are valid.
+  */
+ #define DRBD_ADM_NEED_MINOR   1
+ #define DRBD_ADM_NEED_RESOURCE        2
+ #define DRBD_ADM_NEED_CONNECTION 4
+ static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info,
+               unsigned flags)
+ {
+       struct drbd_genlmsghdr *d_in = info->userhdr;
+       const u8 cmd = info->genlhdr->cmd;
+       int err;
+       memset(&adm_ctx, 0, sizeof(adm_ctx));
+       /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */
 -      if (cmd != DRBD_ADM_GET_STATUS
 -      && security_netlink_recv(skb, CAP_SYS_ADMIN))
++      if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN))
+              return -EPERM;
+       adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+       if (!adm_ctx.reply_skb) {
+               err = -ENOMEM;
+               goto fail;
+       }
+       adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb,
+                                       info, &drbd_genl_family, 0, cmd);
+       /* a put of a few bytes into a fresh skb of >= 4k will always succeed,
+        * but check anyway */
+       if (!adm_ctx.reply_dh) {
+               err = -ENOMEM;
+               goto fail;
+       }
+       adm_ctx.reply_dh->minor = d_in->minor;
+       adm_ctx.reply_dh->ret_code = NO_ERROR;
+       adm_ctx.volume = VOLUME_UNSPECIFIED;
+       if (info->attrs[DRBD_NLA_CFG_CONTEXT]) {
+               struct nlattr *nla;
+               /* parse and validate only */
+               err = drbd_cfg_context_from_attrs(NULL, info);
+               if (err)
+                       goto fail;
+               /* It was present and valid,
+                * so copy it over to the reply skb. */
+               err = nla_put_nohdr(adm_ctx.reply_skb,
+                               info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len,
+                               info->attrs[DRBD_NLA_CFG_CONTEXT]);
+               if (err)
+                       goto fail;
+               /* and assign stuff to the global adm_ctx */
+               nla = nested_attr_tb[__nla_type(T_ctx_volume)];
+               if (nla)
+                       adm_ctx.volume = nla_get_u32(nla);
+               nla = nested_attr_tb[__nla_type(T_ctx_resource_name)];
+               if (nla)
+                       adm_ctx.resource_name = nla_data(nla);
+               adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)];
+               adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)];
+               if ((adm_ctx.my_addr &&
+                    nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.tconn->my_addr)) ||
+                   (adm_ctx.peer_addr &&
+                    nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.tconn->peer_addr))) {
+                       err = -EINVAL;
+                       goto fail;
+               }
+       }
+       adm_ctx.minor = d_in->minor;
+       adm_ctx.mdev = minor_to_mdev(d_in->minor);
+       adm_ctx.tconn = conn_get_by_name(adm_ctx.resource_name);
+       if (!adm_ctx.mdev && (flags & DRBD_ADM_NEED_MINOR)) {
+               drbd_msg_put_info("unknown minor");
+               return ERR_MINOR_INVALID;
+       }
+       if (!adm_ctx.tconn && (flags & DRBD_ADM_NEED_RESOURCE)) {
+               drbd_msg_put_info("unknown resource");
+               return ERR_INVALID_REQUEST;
+       }
+       if (flags & DRBD_ADM_NEED_CONNECTION) {
+               if (adm_ctx.tconn && !(flags & DRBD_ADM_NEED_RESOURCE)) {
+                       drbd_msg_put_info("no resource name expected");
+                       return ERR_INVALID_REQUEST;
+               }
+               if (adm_ctx.mdev) {
+                       drbd_msg_put_info("no minor number expected");
+                       return ERR_INVALID_REQUEST;
+               }
+               if (adm_ctx.my_addr && adm_ctx.peer_addr)
+                       adm_ctx.tconn = conn_get_by_addrs(nla_data(adm_ctx.my_addr),
+                                                         nla_len(adm_ctx.my_addr),
+                                                         nla_data(adm_ctx.peer_addr),
+                                                         nla_len(adm_ctx.peer_addr));
+               if (!adm_ctx.tconn) {
+                       drbd_msg_put_info("unknown connection");
+                       return ERR_INVALID_REQUEST;
+               }
+       }
+       /* some more paranoia, if the request was over-determined */
+       if (adm_ctx.mdev && adm_ctx.tconn &&
+           adm_ctx.mdev->tconn != adm_ctx.tconn) {
+               pr_warning("request: minor=%u, resource=%s; but that minor belongs to connection %s\n",
+                               adm_ctx.minor, adm_ctx.resource_name,
+                               adm_ctx.mdev->tconn->name);
+               drbd_msg_put_info("minor exists in different resource");
+               return ERR_INVALID_REQUEST;
+       }
+       if (adm_ctx.mdev &&
+           adm_ctx.volume != VOLUME_UNSPECIFIED &&
+           adm_ctx.volume != adm_ctx.mdev->vnr) {
+               pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n",
+                               adm_ctx.minor, adm_ctx.volume,
+                               adm_ctx.mdev->vnr, adm_ctx.mdev->tconn->name);
+               drbd_msg_put_info("minor exists as different volume");
+               return ERR_INVALID_REQUEST;
+       }
+       return NO_ERROR;
+ fail:
+       nlmsg_free(adm_ctx.reply_skb);
+       adm_ctx.reply_skb = NULL;
+       return err;
+ }
+ static int drbd_adm_finish(struct genl_info *info, int retcode)
+ {
+       if (adm_ctx.tconn) {
+               kref_put(&adm_ctx.tconn->kref, &conn_destroy);
+               adm_ctx.tconn = NULL;
+       }
+       if (!adm_ctx.reply_skb)
+               return -ENOMEM;
+       adm_ctx.reply_dh->ret_code = retcode;
+       drbd_adm_send_reply(adm_ctx.reply_skb, info);
+       return 0;
+ }
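
/* Hedged sketch of how a .doit handler strings prepare/finish together;
 * "drbd_adm_example" is hypothetical, the control flow mirrors the
 * handlers declared above: */
int drbd_adm_example(struct sk_buff *skb, struct genl_info *info)
{
	enum drbd_ret_code retcode;

	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
	if (!adm_ctx.reply_skb)
		return retcode;
	if (retcode != NO_ERROR)
		goto out;
	/* ... operate on adm_ctx.mdev here ... */
out:
	drbd_adm_finish(info, retcode);
	return 0;
}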
+ static void setup_khelper_env(struct drbd_tconn *tconn, char **envp)
+ {
+       char *afs;
+       /* FIXME: A future version will not allow this case. */
+       if (tconn->my_addr_len == 0 || tconn->peer_addr_len == 0)
+               return;
+       switch (((struct sockaddr *)&tconn->peer_addr)->sa_family) {
+       case AF_INET6:
+               afs = "ipv6";
+               snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6",
+                        &((struct sockaddr_in6 *)&tconn->peer_addr)->sin6_addr);
                break;
- #define NL_BIT(pn, pr, member) \
-       case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \
-               arg->member = *(char *)(tags) ? 1 : 0; \
+       case AF_INET:
+               afs = "ipv4";
+               snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
+                        &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr);
                break;
- #define NL_STRING(pn, pr, member, len) \
-       case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \
-               if (dlen > len) { \
-                       dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \
-                               #member, dlen, (unsigned int)len); \
-                       return 0; \
-               } \
-                arg->member ## _len = dlen; \
-                memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
-                break;
- #include <linux/drbd_nl.h>
- /* Generate the struct to tag_list functions */
- #define NL_PACKET(name, number, fields) \
- static unsigned short* \
- name ## _to_tags(struct drbd_conf *mdev, \
-       struct name *arg, unsigned short *tags) __attribute__ ((unused)); \
- static unsigned short* \
- name ## _to_tags(struct drbd_conf *mdev, \
-       struct name *arg, unsigned short *tags) \
- { \
-       fields \
-       return tags; \
- }
- #define NL_INTEGER(pn, pr, member) \
-       put_unaligned(pn | pr | TT_INTEGER, tags++);    \
-       put_unaligned(sizeof(int), tags++);             \
-       put_unaligned(arg->member, (int *)tags);        \
-       tags = (unsigned short *)((char *)tags+sizeof(int));
- #define NL_INT64(pn, pr, member) \
-       put_unaligned(pn | pr | TT_INT64, tags++);      \
-       put_unaligned(sizeof(u64), tags++);             \
-       put_unaligned(arg->member, (u64 *)tags);        \
-       tags = (unsigned short *)((char *)tags+sizeof(u64));
- #define NL_BIT(pn, pr, member) \
-       put_unaligned(pn | pr | TT_BIT, tags++);        \
-       put_unaligned(sizeof(char), tags++);            \
-       *(char *)tags = arg->member; \
-       tags = (unsigned short *)((char *)tags+sizeof(char));
- #define NL_STRING(pn, pr, member, len) \
-       put_unaligned(pn | pr | TT_STRING, tags++);     \
-       put_unaligned(arg->member ## _len, tags++);     \
-       memcpy(tags, arg->member, arg->member ## _len); \
-       tags = (unsigned short *)((char *)tags + arg->member ## _len);
- #include <linux/drbd_nl.h>
- void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name);
- void drbd_nl_send_reply(struct cn_msg *, int);
+       default:
+               afs = "ssocks";
+               snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
+                        &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr);
+       }
+       snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs);
+ }
  
  int drbd_khelper(struct drbd_conf *mdev, char *cmd)
  {
        char *envp[] = { "HOME=/",
                        "TERM=linux",
                        "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
-                       NULL, /* Will be set to address family */
-                       NULL, /* Will be set to address */
+                        (char[20]) { }, /* address family */
+                        (char[60]) { }, /* address */
                        NULL };
-       char mb[12], af[20], ad[60], *afs;
+       char mb[12];
        char *argv[] = {usermode_helper, cmd, mb, NULL };
+       struct drbd_tconn *tconn = mdev->tconn;
+       struct sib_info sib;
        int ret;
  
-       if (current == mdev->worker.task)
-               drbd_set_flag(mdev, CALLBACK_PENDING);
+       if (current == tconn->worker.task)
+               set_bit(CALLBACK_PENDING, &tconn->flags);
  
        snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev));
-       if (get_net_conf(mdev)) {
-               switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) {
-               case AF_INET6:
-                       afs = "ipv6";
-                       snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6",
-                                &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr);
-                       break;
-               case AF_INET:
-                       afs = "ipv4";
-                       snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
-                                &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
-                       break;
-               default:
-                       afs = "ssocks";
-                       snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
-                                &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
-               }
-               snprintf(af, 20, "DRBD_PEER_AF=%s", afs);
-               envp[3]=af;
-               envp[4]=ad;
-               put_net_conf(mdev);
-       }
+       setup_khelper_env(tconn, envp);
  
        /* The helper may take some time.
         * write out any unsynced meta data changes now */
        drbd_md_sync(mdev);
  
        dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
-       drbd_bcast_ev_helper(mdev, cmd);
+       sib.sib_reason = SIB_HELPER_PRE;
+       sib.helper_name = cmd;
+       drbd_bcast_event(mdev, &sib);
 -      ret = call_usermodehelper(usermode_helper, argv, envp, 1);
 +      ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
        if (ret)
                dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
                                usermode_helper, cmd, mb,
                dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
                                usermode_helper, cmd, mb,
                                (ret >> 8) & 0xff, ret);
+       sib.sib_reason = SIB_HELPER_POST;
+       sib.helper_exit_code = ret;
+       drbd_bcast_event(mdev, &sib);
+       if (current == tconn->worker.task)
+               clear_bit(CALLBACK_PENDING, &tconn->flags);
  
-       if (current == mdev->worker.task)
-               drbd_clear_flag(mdev, CALLBACK_PENDING);
+       if (ret < 0) /* Ignore any ERRNOs we got. */
+               ret = 0;
+       return ret;
+ }
+ int conn_khelper(struct drbd_tconn *tconn, char *cmd)
+ {
+       char *envp[] = { "HOME=/",
+                       "TERM=linux",
+                       "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+                        (char[20]) { }, /* address family */
+                        (char[60]) { }, /* address */
+                       NULL };
+       char *argv[] = {usermode_helper, cmd, tconn->name, NULL };
+       int ret;
+       setup_khelper_env(tconn, envp);
+       conn_md_sync(tconn);
+       conn_info(tconn, "helper command: %s %s %s\n", usermode_helper, cmd, tconn->name);
+       /* TODO: conn_bcast_event() ?? */
 -      ret = call_usermodehelper(usermode_helper, argv, envp, 1);
++      ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
+       if (ret)
+               conn_warn(tconn, "helper command: %s %s %s exit code %u (0x%x)\n",
+                         usermode_helper, cmd, tconn->name,
+                         (ret >> 8) & 0xff, ret);
+       else
+               conn_info(tconn, "helper command: %s %s %s exit code %u (0x%x)\n",
+                         usermode_helper, cmd, tconn->name,
+                         (ret >> 8) & 0xff, ret);
+       /* TODO: conn_bcast_event() ?? */
  
        if (ret < 0) /* Ignore any ERRNOs we got. */
                ret = 0;
        return ret;
  }
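
/* Sketch: call_usermodehelper() with UMH_WAIT_PROC returns a wait
 * status, so the helper's exit code sits in bits 8..15 -- which is what
 * the "(ret >> 8) & 0xff" above and the fence-peer switch below decode.
 * usermode_helper/argv/envp are assumed in scope as above. */
int status = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
int exit_code = (status >> 8) & 0xff;	/* cf. WEXITSTATUS() */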
  
- enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
+ static enum drbd_fencing_p highest_fencing_policy(struct drbd_tconn *tconn)
+ {
+       enum drbd_fencing_p fp = FP_NOT_AVAIL;
+       struct drbd_conf *mdev;
+       int vnr;
+       rcu_read_lock();
+       idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+               if (get_ldev_if_state(mdev, D_CONSISTENT)) {
+                       fp = max_t(enum drbd_fencing_p, fp,
+                                  rcu_dereference(mdev->ldev->disk_conf)->fencing);
+                       put_ldev(mdev);
+               }
+       }
+       rcu_read_unlock();
+       return fp;
+ }
+ bool conn_try_outdate_peer(struct drbd_tconn *tconn)
  {
+       union drbd_state mask = { };
+       union drbd_state val = { };
+       enum drbd_fencing_p fp;
        char *ex_to_string;
        int r;
-       enum drbd_disk_state nps;
-       enum drbd_fencing_p fp;
  
-       D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
+       if (tconn->cstate >= C_WF_REPORT_PARAMS) {
+               conn_err(tconn, "Expected cstate < C_WF_REPORT_PARAMS\n");
+               return false;
+       }
  
-       if (get_ldev_if_state(mdev, D_CONSISTENT)) {
-               fp = mdev->ldev->dc.fencing;
-               put_ldev(mdev);
-       } else {
-               dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
-               nps = mdev->state.pdsk;
+       fp = highest_fencing_policy(tconn);
+       switch (fp) {
+       case FP_NOT_AVAIL:
+               conn_warn(tconn, "Not fencing peer, I'm not even Consistent myself.\n");
                goto out;
+       case FP_DONT_CARE:
+               return true;
+       default: ;
        }
  
-       r = drbd_khelper(mdev, "fence-peer");
+       r = conn_khelper(tconn, "fence-peer");
  
        switch ((r>>8) & 0xff) {
        case 3: /* peer is inconsistent */
                ex_to_string = "peer is inconsistent or worse";
-               nps = D_INCONSISTENT;
+               mask.pdsk = D_MASK;
+               val.pdsk = D_INCONSISTENT;
                break;
        case 4: /* peer got outdated, or was already outdated */
                ex_to_string = "peer was fenced";
-               nps = D_OUTDATED;
+               mask.pdsk = D_MASK;
+               val.pdsk = D_OUTDATED;
                break;
        case 5: /* peer was down */
-               if (mdev->state.disk == D_UP_TO_DATE) {
+               if (conn_highest_disk(tconn) == D_UP_TO_DATE) {
                        /* we will(have) create(d) a new UUID anyways... */
                        ex_to_string = "peer is unreachable, assumed to be dead";
-                       nps = D_OUTDATED;
+                       mask.pdsk = D_MASK;
+                       val.pdsk = D_OUTDATED;
                } else {
                        ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
-                       nps = mdev->state.pdsk;
                }
                break;
        case 6: /* Peer is primary, voluntarily outdate myself.
                 * This is useful when an unconnected R_SECONDARY is asked to
                 * become R_PRIMARY, but finds the other peer being active. */
                ex_to_string = "peer is active";
-               dev_warn(DEV, "Peer is primary, outdating myself.\n");
-               nps = D_UNKNOWN;
-               _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE);
+               conn_warn(tconn, "Peer is primary, outdating myself.\n");
+               mask.disk = D_MASK;
+               val.disk = D_OUTDATED;
                break;
        case 7:
                if (fp != FP_STONITH)
-                       dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n");
+                       conn_err(tconn, "fence-peer() = 7 && fencing != Stonith !!!\n");
                ex_to_string = "peer was stonithed";
-               nps = D_OUTDATED;
+               mask.pdsk = D_MASK;
+               val.pdsk = D_OUTDATED;
                break;
        default:
                /* The script is broken ... */
-               nps = D_UNKNOWN;
-               dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
-               return nps;
+               conn_err(tconn, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
+               return false; /* Eventually leave IO frozen */
        }
  
-       dev_info(DEV, "fence-peer helper returned %d (%s)\n",
-                       (r>>8) & 0xff, ex_to_string);
+       conn_info(tconn, "fence-peer helper returned %d (%s)\n",
+                 (r>>8) & 0xff, ex_to_string);
  
- out:
-       if (mdev->state.susp_fen && nps >= D_UNKNOWN) {
-               /* The handler was not successful... unfreeze here, the
-                  state engine can not unfreeze... */
-               _drbd_request_state(mdev, NS(susp_fen, 0), CS_VERBOSE);
-       }
+  out:
  
-       return nps;
+       /* Not using
+          conn_request_state(tconn, mask, val, CS_VERBOSE);
+          here, because we might have been able to re-establish the
+          connection in the meantime. */
+       spin_lock_irq(&tconn->req_lock);
+       if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags))
+               _conn_request_state(tconn, mask, val, CS_VERBOSE);
+       spin_unlock_irq(&tconn->req_lock);
+       return conn_highest_pdsk(tconn) <= D_OUTDATED;
  }
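
The (mask, val) pair used throughout conn_try_outdate_peer() encodes a compound state-change request: only fields whose bits are set in mask are modified, everything else keeps its current value. Distilled to the common case above (the function itself uses the locked _conn_request_state() variant so the connection state can be re-checked under req_lock first):

	union drbd_state mask = { }, val = { };

	mask.pdsk = D_MASK;	/* change the peer-disk field ... */
	val.pdsk  = D_OUTDATED;	/* ... to Outdated, touch nothing else */
	conn_request_state(tconn, mask, val, CS_VERBOSE);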
  
  static int _try_outdate_peer_async(void *data)
  {
-       struct drbd_conf *mdev = (struct drbd_conf *)data;
-       enum drbd_disk_state nps;
-       union drbd_state ns;
+       struct drbd_tconn *tconn = (struct drbd_tconn *)data;
  
-       nps = drbd_try_outdate_peer(mdev);
-       /* Not using
-          drbd_request_state(mdev, NS(pdsk, nps));
-          here, because we might were able to re-establish the connection
-          in the meantime. This can only partially be solved in the state's
-          engine is_valid_state() and is_valid_state_transition()
-          functions.
-          nps can be D_INCONSISTENT, D_OUTDATED or D_UNKNOWN.
-          pdsk == D_INCONSISTENT while conn >= C_CONNECTED is valid,
-          therefore we have to have the pre state change check here.
-       */
-       spin_lock_irq(&mdev->req_lock);
-       ns = mdev->state;
-       if (ns.conn < C_WF_REPORT_PARAMS && !drbd_test_flag(mdev, STATE_SENT)) {
-               ns.pdsk = nps;
-               _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
-       }
-       spin_unlock_irq(&mdev->req_lock);
+       conn_try_outdate_peer(tconn);
  
+       kref_put(&tconn->kref, &conn_destroy);
        return 0;
  }
  
- void drbd_try_outdate_peer_async(struct drbd_conf *mdev)
+ void conn_try_outdate_peer_async(struct drbd_tconn *tconn)
  {
        struct task_struct *opa;
  
-       opa = kthread_run(_try_outdate_peer_async, mdev, "drbd%d_a_helper", mdev_to_minor(mdev));
-       if (IS_ERR(opa))
-               dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n");
+       kref_get(&tconn->kref);
+       opa = kthread_run(_try_outdate_peer_async, tconn, "drbd_async_h");
+       if (IS_ERR(opa)) {
+               conn_err(tconn, "out of mem, failed to invoke fence-peer helper\n");
+               kref_put(&tconn->kref, &conn_destroy);
+       }
  }
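
The kref handling above follows the usual one-put-per-get rule: the reference taken before kthread_run() is owned by the new thread and dropped at the end of _try_outdate_peer_async(); if spawning fails, no thread exists to drop it, so the error path puts it immediately. Condensed:

	kref_get(&tconn->kref);			/* reference for the thread */
	opa = kthread_run(_try_outdate_peer_async, tconn, "drbd_async_h");
	if (IS_ERR(opa))
		kref_put(&tconn->kref, &conn_destroy);	/* thread never started */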
  
  enum drbd_state_rv
@@@ -318,15 -527,15 +526,15 @@@ drbd_set_role(struct drbd_conf *mdev, e
  {
        const int max_tries = 4;
        enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
+       struct net_conf *nc;
        int try = 0;
        int forced = 0;
        union drbd_state mask, val;
-       enum drbd_disk_state nps;
  
        if (new_role == R_PRIMARY)
-               request_ping(mdev); /* Detect a dead peer ASAP */
+               request_ping(mdev->tconn); /* Detect a dead peer ASAP */
  
-       mutex_lock(&mdev->state_mutex);
+       mutex_lock(mdev->state_mutex);
  
        mask.i = 0; mask.role = R_MASK;
        val.i  = 0; val.role  = new_role;
                if (rv == SS_NO_UP_TO_DATE_DISK &&
                    mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) {
                        D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
-                       nps = drbd_try_outdate_peer(mdev);
  
-                       if (nps == D_OUTDATED || nps == D_INCONSISTENT) {
+                       if (conn_try_outdate_peer(mdev->tconn)) {
                                val.disk = D_UP_TO_DATE;
                                mask.disk = D_MASK;
                        }
-                       val.pdsk = nps;
-                       mask.pdsk = D_MASK;
                        continue;
                }
  
                if (rv == SS_NOTHING_TO_DO)
-                       goto fail;
+                       goto out;
                if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
-                       nps = drbd_try_outdate_peer(mdev);
-                       if (force && nps > D_OUTDATED) {
+                       if (!conn_try_outdate_peer(mdev->tconn) && force) {
                                dev_warn(DEV, "Forced into split brain situation!\n");
-                               nps = D_OUTDATED;
-                       }
-                       mask.pdsk = D_MASK;
-                       val.pdsk  = nps;
+                               mask.pdsk = D_MASK;
+                               val.pdsk  = D_OUTDATED;
  
+                       }
                        continue;
                }
                if (rv == SS_TWO_PRIMARIES) {
                        /* Maybe the peer is detected as dead very soon...
                           retry at most once more in this case. */
-                       schedule_timeout_interruptible((mdev->net_conf->ping_timeo+1)*HZ/10);
+                       int timeo;
+                       rcu_read_lock();
+                       nc = rcu_dereference(mdev->tconn->net_conf);
+                       timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1;
+                       rcu_read_unlock();
+                       schedule_timeout_interruptible(timeo);
                        if (try < max_tries)
                                try = max_tries - 1;
                        continue;
                        rv = _drbd_request_state(mdev, mask, val,
                                                CS_VERBOSE + CS_WAIT_COMPLETE);
                        if (rv < SS_SUCCESS)
-                               goto fail;
+                               goto out;
                }
                break;
        }
  
        if (rv < SS_SUCCESS)
-               goto fail;
+               goto out;
  
        if (forced)
                dev_warn(DEV, "Forced to consider local data as UpToDate!\n");
        /* Wait until nothing is on the fly :) */
        wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0);
  
+       /* FIXME also wait for all pending P_BARRIER_ACK? */
        if (new_role == R_SECONDARY) {
                set_disk_ro(mdev->vdisk, true);
                if (get_ldev(mdev)) {
                        put_ldev(mdev);
                }
        } else {
-               if (get_net_conf(mdev)) {
-                       mdev->net_conf->want_lose = 0;
-                       put_net_conf(mdev);
-               }
+               mutex_lock(&mdev->tconn->conf_update);
+               nc = mdev->tconn->net_conf;
+               if (nc)
+                       nc->discard_my_data = 0; /* without copy; single bit op is atomic */
+               mutex_unlock(&mdev->tconn->conf_update);
                set_disk_ro(mdev->vdisk, false);
                if (get_ldev(mdev)) {
                        if (((mdev->state.conn < C_CONNECTED ||
        drbd_md_sync(mdev);
  
        kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
 fail:
-       mutex_unlock(&mdev->state_mutex);
out:
+       mutex_unlock(mdev->state_mutex);
        return rv;
  }
  
- static struct drbd_conf *ensure_mdev(int minor, int create)
+ static const char *from_attrs_err_to_txt(int err)
  {
-       struct drbd_conf *mdev;
-       if (minor >= minor_count)
-               return NULL;
-       mdev = minor_to_mdev(minor);
-       if (!mdev && create) {
-               struct gendisk *disk = NULL;
-               mdev = drbd_new_device(minor);
-               spin_lock_irq(&drbd_pp_lock);
-               if (minor_table[minor] == NULL) {
-                       minor_table[minor] = mdev;
-                       disk = mdev->vdisk;
-                       mdev = NULL;
-               } /* else: we lost the race */
-               spin_unlock_irq(&drbd_pp_lock);
-               if (disk) /* we won the race above */
-                       /* in case we ever add a drbd_delete_device(),
-                        * don't forget the del_gendisk! */
-                       add_disk(disk);
-               else /* we lost the race above */
-                       drbd_free_mdev(mdev);
-               mdev = minor_to_mdev(minor);
-       }
-       return mdev;
+       return  err == -ENOMSG ? "required attribute missing" :
+               err == -EOPNOTSUPP ? "unknown mandatory attribute" :
+               err == -EEXIST ? "can not change invariant setting" :
+               "invalid attribute value";
  }
  
- static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                          struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info)
  {
-       struct primary primary_args;
-       memset(&primary_args, 0, sizeof(struct primary));
-       if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) {
-               reply->ret_code = ERR_MANDATORY_TAG;
-               return 0;
-       }
-       reply->ret_code =
-               drbd_set_role(mdev, R_PRIMARY, primary_args.primary_force);
+       struct set_role_parms parms;
+       int err;
+       enum drbd_ret_code retcode;
  
-       return 0;
- }
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
  
- static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                            struct drbd_nl_cfg_reply *reply)
- {
-       reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0);
+       memset(&parms, 0, sizeof(parms));
+       if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) {
+               err = set_role_parms_from_attrs(&parms, info);
+               if (err) {
+                       retcode = ERR_MANDATORY_TAG;
+                       drbd_msg_put_info(from_attrs_err_to_txt(err));
+                       goto out;
+               }
+       }
  
+       if (info->genlhdr->cmd == DRBD_ADM_PRIMARY)
+               retcode = drbd_set_role(adm_ctx.mdev, R_PRIMARY, parms.assume_uptodate);
+       else
+               retcode = drbd_set_role(adm_ctx.mdev, R_SECONDARY, 0);
+ out:
+       drbd_adm_finish(info, retcode);
        return 0;
  }
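
All drbd_adm_*() handlers introduced by this rework share the same skeleton around drbd_adm_prepare()/drbd_adm_finish(); only the middle varies per command. A sketch, with a made-up handler name:

	int drbd_adm_example(struct sk_buff *skb, struct genl_info *info)
	{
		enum drbd_ret_code retcode;

		retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
		if (!adm_ctx.reply_skb)
			return retcode;
		if (retcode != NO_ERROR)
			goto out;

		/* parse netlink attributes, operate on adm_ctx.mdev */
	out:
		drbd_adm_finish(info, retcode);
		return 0;
	}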
  
@@@ -514,7 -703,12 +702,12 @@@ static void drbd_md_set_sector_offsets(
                                       struct drbd_backing_dev *bdev)
  {
        sector_t md_size_sect = 0;
-       switch (bdev->dc.meta_dev_idx) {
+       int meta_dev_idx;
+
+       rcu_read_lock();
+       meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
+
+       switch (meta_dev_idx) {
        default:
                /* v07 style fixed size indexed meta data */
                bdev->md.md_size_sect = MD_RESERVED_SECT;
        case DRBD_MD_INDEX_FLEX_INT:
                bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
                /* al size is still fixed */
-               bdev->md.al_offset = -MD_AL_MAX_SIZE;
+               bdev->md.al_offset = -MD_AL_SECTORS;
                /* we need (slightly less than) ~ this much bitmap sectors: */
                md_size_sect = drbd_get_capacity(bdev->backing_bdev);
                md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
                bdev->md.bm_offset   = -md_size_sect + MD_AL_OFFSET;
                break;
        }
+       rcu_read_unlock();
  }
  
  /* input size is expected to be in KB */
@@@ -581,17 -776,23 +775,23 @@@ char *ppsize(char *buf, unsigned long l
   *  R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
   *  peer may not initiate a resize.
   */
+ /* Note these are not to be confused with
+  * drbd_adm_suspend_io/drbd_adm_resume_io,
+  * which are (sub) state changes triggered by admin (drbdsetup),
+  * and can be long-lived.
+  * This changes an mdev->flag, is triggered by drbd internals,
+  * and should be short-lived. */
  void drbd_suspend_io(struct drbd_conf *mdev)
  {
-       drbd_set_flag(mdev, SUSPEND_IO);
-       if (is_susp(mdev->state))
+       set_bit(SUSPEND_IO, &mdev->flags);
+       if (drbd_suspended(mdev))
                return;
        wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
  }
  
  void drbd_resume_io(struct drbd_conf *mdev)
  {
-       drbd_clear_flag(mdev, SUSPEND_IO);
+       clear_bit(SUSPEND_IO, &mdev->flags);
        wake_up(&mdev->misc_wait);
  }
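
Callers bracket short reconfiguration steps with this pair; drbd_adm_disk_opts() below is the canonical user. The shape of such a bracket, as a sketch:

	drbd_suspend_io(mdev);
	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
	/* ... resize or shrink the activity log ... */
	lc_unlock(mdev->act_log);
	wake_up(&mdev->al_wait);
	drbd_resume_io(mdev);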
  
  enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
  {
        sector_t prev_first_sect, prev_size; /* previous meta location */
-       sector_t la_size;
+       sector_t la_size, u_size;
        sector_t size;
        char ppb[10];
  
        /* TODO: should only be some assert here, not (re)init... */
        drbd_md_set_sector_offsets(mdev, mdev->ldev);
  
-       size = drbd_new_dev_size(mdev, mdev->ldev, flags & DDSF_FORCED);
+       rcu_read_lock();
+       u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
+       rcu_read_unlock();
+       size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED);
  
        if (drbd_get_capacity(mdev->this_bdev) != size ||
            drbd_bm_capacity(mdev) != size) {
@@@ -696,12 -900,12 +899,12 @@@ out
  }
  
  sector_t
- drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int assume_peer_has_space)
+ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+                 sector_t u_size, int assume_peer_has_space)
  {
        sector_t p_size = mdev->p_size;   /* partner's disk size. */
        sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */
        sector_t m_size; /* my size */
-       sector_t u_size = bdev->dc.disk_size; /* size requested by user. */
        sector_t size = 0;
  
        m_size = drbd_get_max_capacity(bdev);
   * failed, and 0 on success. You should call drbd_md_sync() after you called
   * this function.
   */
- static int drbd_check_al_size(struct drbd_conf *mdev)
+ static int drbd_check_al_size(struct drbd_conf *mdev, struct disk_conf *dc)
  {
        struct lru_cache *n, *t;
        struct lc_element *e;
        unsigned int in_use;
        int i;
  
-       ERR_IF(mdev->sync_conf.al_extents < 7)
-               mdev->sync_conf.al_extents = 127;
        if (mdev->act_log &&
-           mdev->act_log->nr_elements == mdev->sync_conf.al_extents)
+           mdev->act_log->nr_elements == dc->al_extents)
                return 0;
  
        in_use = 0;
        t = mdev->act_log;
-       n = lc_create("act_log", drbd_al_ext_cache,
-               mdev->sync_conf.al_extents, sizeof(struct lc_element), 0);
+       n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION,
+               dc->al_extents, sizeof(struct lc_element), 0);
  
        if (n == NULL) {
                dev_err(DEV, "Cannot allocate act_log lru!\n");
  static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size)
  {
        struct request_queue * const q = mdev->rq_queue;
 -      int max_hw_sectors = max_bio_size >> 9;
 -      int max_segments = 0;
 +      unsigned int max_hw_sectors = max_bio_size >> 9;
 +      unsigned int max_segments = 0;
  
        if (get_ldev_if_state(mdev, D_ATTACHING)) {
                struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
  
                max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
-               max_segments = mdev->ldev->dc.max_bio_bvecs;
+               rcu_read_lock();
+               max_segments = rcu_dereference(mdev->ldev->disk_conf)->max_bio_bvecs;
+               rcu_read_unlock();
                put_ldev(mdev);
        }
  
  
  void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
  {
 -      int now, new, local, peer;
 +      unsigned int now, new, local, peer;
  
        now = queue_max_hw_sectors(mdev->rq_queue) << 9;
        local = mdev->local_max_bio_size; /* Eventually last known value, from volatile memory */
                mdev->local_max_bio_size = local;
                put_ldev(mdev);
        }
 +      local = min(local, DRBD_MAX_BIO_SIZE);
  
        /* We may ignore peer limits if the peer is modern enough.
           From 8.3.8 onwards the peer can use multiple
           BIOs for a single peer_request. */
        if (mdev->state.conn >= C_CONNECTED) {
-               if (mdev->agreed_pro_version < 94) {
-                       peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+               if (mdev->tconn->agreed_pro_version < 94)
 -                      peer = min_t(int, mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
++                      peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
                        /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
-               } else if (mdev->agreed_pro_version == 94)
+               else if (mdev->tconn->agreed_pro_version == 94)
                        peer = DRBD_MAX_SIZE_H80_PACKET;
-               else /* drbd 8.3.8 onwards */
+               else if (mdev->tconn->agreed_pro_version < 100)
+                       peer = DRBD_MAX_BIO_SIZE_P95;  /* drbd 8.3.8 onwards, before 8.4.0 */
+               else
                        peer = DRBD_MAX_BIO_SIZE;
        }
  
 -      new = min_t(int, local, peer);
 +      new = min(local, peer);
  
        if (mdev->state.role == R_PRIMARY && new < now)
 -              dev_err(DEV, "ASSERT FAILED new < now; (%d < %d)\n", new, now);
 +              dev_err(DEV, "ASSERT FAILED new < now; (%u < %u)\n", new, now);
  
        if (new != now)
                dev_info(DEV, "max BIO size = %u\n", new);
        drbd_setup_queue_param(mdev, new);
  }
  
- /* serialize deconfig (worker exiting, doing cleanup)
-  * and reconfig (drbdsetup disk, drbdsetup net)
-  *
-  * Wait for a potentially exiting worker, then restart it,
-  * or start a new one.  Flush any pending work, there may still be an
-  * after_state_change queued.
-  */
- static void drbd_reconfig_start(struct drbd_conf *mdev)
+ /* Starts the worker thread */
+ static void conn_reconfig_start(struct drbd_tconn *tconn)
  {
-       wait_event(mdev->state_wait, !drbd_test_and_set_flag(mdev, CONFIG_PENDING));
-       wait_event(mdev->state_wait, !drbd_test_flag(mdev, DEVICE_DYING));
-       drbd_thread_start(&mdev->worker);
-       drbd_flush_workqueue(mdev);
+       drbd_thread_start(&tconn->worker);
+       conn_flush_workqueue(tconn);
  }
  
- /* if still unconfigured, stops worker again.
-  * if configured now, clears CONFIG_PENDING.
-  * wakes potential waiters */
- static void drbd_reconfig_done(struct drbd_conf *mdev)
+ /* if still unconfigured, stops worker again. */
+ static void conn_reconfig_done(struct drbd_tconn *tconn)
  {
-       spin_lock_irq(&mdev->req_lock);
-       if (mdev->state.disk == D_DISKLESS &&
-           mdev->state.conn == C_STANDALONE &&
-           mdev->state.role == R_SECONDARY) {
-               drbd_set_flag(mdev, DEVICE_DYING);
-               drbd_thread_stop_nowait(&mdev->worker);
-       } else
-               drbd_clear_flag(mdev, CONFIG_PENDING);
-       spin_unlock_irq(&mdev->req_lock);
-       wake_up(&mdev->state_wait);
+       bool stop_threads;
+       spin_lock_irq(&tconn->req_lock);
+       stop_threads = conn_all_vols_unconf(tconn) &&
+               tconn->cstate == C_STANDALONE;
+       spin_unlock_irq(&tconn->req_lock);
+       if (stop_threads) {
+               /* asender is implicitly stopped by receiver
+                * in conn_disconnect() */
+               drbd_thread_stop(&tconn->receiver);
+               drbd_thread_stop(&tconn->worker);
+       }
  }
  
  /* Make sure IO is suspended before calling this function(). */
@@@ -909,42 -1104,182 +1104,182 @@@ static void drbd_suspend_al(struct drbd
  {
        int s = 0;
  
-       if (lc_try_lock(mdev->act_log)) {
-               drbd_al_shrink(mdev);
-               lc_unlock(mdev->act_log);
-       } else {
+       if (!lc_try_lock(mdev->act_log)) {
                dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n");
                return;
        }
  
-       spin_lock_irq(&mdev->req_lock);
+       drbd_al_shrink(mdev);
+       spin_lock_irq(&mdev->tconn->req_lock);
        if (mdev->state.conn < C_CONNECTED)
-               s = !drbd_test_and_set_flag(mdev, AL_SUSPENDED);
-       spin_unlock_irq(&mdev->req_lock);
+               s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags);
+       spin_unlock_irq(&mdev->tconn->req_lock);
+       lc_unlock(mdev->act_log);
  
        if (s)
                dev_info(DEV, "Suspended AL updates\n");
  }
  
- /* does always return 0;
-  * interesting return code is in reply->ret_code */
- static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                            struct drbd_nl_cfg_reply *reply)
+ static bool should_set_defaults(struct genl_info *info)
+ {
+       unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags;
+       return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS);
+ }
+
+ static void enforce_disk_conf_limits(struct disk_conf *dc)
+ {
+       if (dc->al_extents < DRBD_AL_EXTENTS_MIN)
+               dc->al_extents = DRBD_AL_EXTENTS_MIN;
+       if (dc->al_extents > DRBD_AL_EXTENTS_MAX)
+               dc->al_extents = DRBD_AL_EXTENTS_MAX;
+       if (dc->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+               dc->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
+ }
+
+ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
+ {
+       enum drbd_ret_code retcode;
+       struct drbd_conf *mdev;
+       struct disk_conf *new_disk_conf, *old_disk_conf;
+       struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
+       int err, fifo_size;
+
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
+
+       mdev = adm_ctx.mdev;
+
+       /* we also need a disk
+        * to change the options on */
+       if (!get_ldev(mdev)) {
+               retcode = ERR_NO_DISK;
+               goto out;
+       }
+
+       new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
+       if (!new_disk_conf) {
+               retcode = ERR_NOMEM;
+               goto fail;
+       }
+
+       mutex_lock(&mdev->tconn->conf_update);
+       old_disk_conf = mdev->ldev->disk_conf;
+       *new_disk_conf = *old_disk_conf;
+       if (should_set_defaults(info))
+               set_disk_conf_defaults(new_disk_conf);
+
+       err = disk_conf_from_attrs_for_change(new_disk_conf, info);
+       if (err && err != -ENOMSG) {
+               retcode = ERR_MANDATORY_TAG;
+               drbd_msg_put_info(from_attrs_err_to_txt(err));
+       }
+
+       if (!expect(new_disk_conf->resync_rate >= 1))
+               new_disk_conf->resync_rate = 1;
+
+       enforce_disk_conf_limits(new_disk_conf);
+
+       fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+       if (fifo_size != mdev->rs_plan_s->size) {
+               new_plan = fifo_alloc(fifo_size);
+               if (!new_plan) {
+                       dev_err(DEV, "kmalloc of fifo_buffer failed\n");
+                       retcode = ERR_NOMEM;
+                       goto fail_unlock;
+               }
+       }
+
+       drbd_suspend_io(mdev);
+       wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+       drbd_al_shrink(mdev);
+       err = drbd_check_al_size(mdev, new_disk_conf);
+       lc_unlock(mdev->act_log);
+       wake_up(&mdev->al_wait);
+       drbd_resume_io(mdev);
+
+       if (err) {
+               retcode = ERR_NOMEM;
+               goto fail_unlock;
+       }
+
+       write_lock_irq(&global_state_lock);
+       retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after);
+       if (retcode == NO_ERROR) {
+               rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
+               drbd_resync_after_changed(mdev);
+       }
+       write_unlock_irq(&global_state_lock);
+
+       if (retcode != NO_ERROR)
+               goto fail_unlock;
+
+       if (new_plan) {
+               old_plan = mdev->rs_plan_s;
+               rcu_assign_pointer(mdev->rs_plan_s, new_plan);
+       }
+
+       mutex_unlock(&mdev->tconn->conf_update);
+
+       if (new_disk_conf->al_updates)
+               mdev->ldev->md.flags &= ~MDF_AL_DISABLED;
+       else
+               mdev->ldev->md.flags |= MDF_AL_DISABLED;
+
+       drbd_bump_write_ordering(mdev->tconn, WO_bdev_flush);
+
+       drbd_md_sync(mdev);
+
+       if (mdev->state.conn >= C_CONNECTED)
+               drbd_send_sync_param(mdev);
+
+       synchronize_rcu();
+       kfree(old_disk_conf);
+       kfree(old_plan);
+       mod_timer(&mdev->request_timer, jiffies + HZ);
+       goto success;
+
+ fail_unlock:
+       mutex_unlock(&mdev->tconn->conf_update);
+  fail:
+       kfree(new_disk_conf);
+       kfree(new_plan);
+ success:
+       put_ldev(mdev);
+  out:
+       drbd_adm_finish(info, retcode);
+       return 0;
+ }
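
drbd_adm_disk_opts() is a textbook read-copy-update of the now RCU-protected disk_conf: writers serialize on conf_update, publish the modified copy with rcu_assign_pointer(), and free the old copy only after synchronize_rcu() guarantees that no reader still holds it. Distilled (the resync-after validation under global_state_lock is omitted here):

	mutex_lock(&mdev->tconn->conf_update);	/* one writer at a time */
	old_disk_conf = mdev->ldev->disk_conf;
	*new_disk_conf = *old_disk_conf;	/* copy, then modify */
	rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
	mutex_unlock(&mdev->tconn->conf_update);

	synchronize_rcu();			/* wait out readers of the old copy */
	kfree(old_disk_conf);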
+ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
  {
+       struct drbd_conf *mdev;
+       int err;
        enum drbd_ret_code retcode;
        enum determine_dev_size dd;
        sector_t max_possible_sectors;
        sector_t min_md_device_sectors;
        struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
+       struct disk_conf *new_disk_conf = NULL;
        struct block_device *bdev;
        struct lru_cache *resync_lru = NULL;
+       struct fifo_buffer *new_plan = NULL;
        union drbd_state ns, os;
        enum drbd_state_rv rv;
-       int cp_discovered = 0;
-       int logical_block_size;
+       struct net_conf *nc;
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto finish;
  
-       drbd_reconfig_start(mdev);
+       mdev = adm_ctx.mdev;
+       conn_reconfig_start(mdev->tconn);
  
        /* if you want to reconfigure, please tear down first */
        if (mdev->state.disk > D_DISKLESS) {
        wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
  
        /* make sure there is no leftover from previous force-detach attempts */
-       drbd_clear_flag(mdev, FORCE_DETACH);
-       drbd_clear_flag(mdev, WAS_IO_ERROR);
-       drbd_clear_flag(mdev, WAS_READ_ERROR);
+       clear_bit(FORCE_DETACH, &mdev->flags);
+       clear_bit(WAS_IO_ERROR, &mdev->flags);
+       clear_bit(WAS_READ_ERROR, &mdev->flags);
  
        /* and no leftover from previously aborted resync or verify, either */
        mdev->rs_total = 0;
        mdev->rs_failed = 0;
        atomic_set(&mdev->rs_pending_cnt, 0);
  
-       /* allocation not in the IO path, cqueue thread context */
+       /* allocation not in the IO path, drbdsetup context */
        nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
        if (!nbc) {
                retcode = ERR_NOMEM;
                goto fail;
        }
-       nbc->dc.disk_size     = DRBD_DISK_SIZE_SECT_DEF;
-       nbc->dc.on_io_error   = DRBD_ON_IO_ERROR_DEF;
-       nbc->dc.fencing       = DRBD_FENCING_DEF;
-       nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF;
        spin_lock_init(&nbc->md.uuid_lock);
  
-       if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) {
-               retcode = ERR_MANDATORY_TAG;
+       new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+       if (!new_disk_conf) {
+               retcode = ERR_NOMEM;
                goto fail;
        }
+       nbc->disk_conf = new_disk_conf;
  
-       if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
-               retcode = ERR_MD_IDX_INVALID;
+       set_disk_conf_defaults(new_disk_conf);
+       err = disk_conf_from_attrs(new_disk_conf, info);
+       if (err) {
+               retcode = ERR_MANDATORY_TAG;
+               drbd_msg_put_info(from_attrs_err_to_txt(err));
                goto fail;
        }
  
-       if (get_net_conf(mdev)) {
-               int prot = mdev->net_conf->wire_protocol;
-               put_net_conf(mdev);
-               if (nbc->dc.fencing == FP_STONITH && prot == DRBD_PROT_A) {
-                       retcode = ERR_STONITH_AND_PROT_A;
+       enforce_disk_conf_limits(new_disk_conf);
+       new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
+       if (!new_plan) {
+               retcode = ERR_NOMEM;
+               goto fail;
+       }
+       if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
+               retcode = ERR_MD_IDX_INVALID;
+               goto fail;
+       }
+       rcu_read_lock();
+       nc = rcu_dereference(mdev->tconn->net_conf);
+       if (nc) {
+               if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
+                       rcu_read_unlock();
+                       retcode = ERR_STONITH_AND_PROT_A;
                        goto fail;
                }
        }
+       rcu_read_unlock();
  
-       bdev = blkdev_get_by_path(nbc->dc.backing_dev,
+       bdev = blkdev_get_by_path(new_disk_conf->backing_dev,
                                  FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev);
        if (IS_ERR(bdev)) {
-               dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
+               dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
                        PTR_ERR(bdev));
                retcode = ERR_OPEN_DISK;
                goto fail;
         * should check it for you already; but if you don't, or
         * someone fooled it, we need to double check here)
         */
-       bdev = blkdev_get_by_path(nbc->dc.meta_dev,
+       bdev = blkdev_get_by_path(new_disk_conf->meta_dev,
                                  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
-                                 (nbc->dc.meta_dev_idx < 0) ?
+                                 (new_disk_conf->meta_dev_idx < 0) ?
                                  (void *)mdev : (void *)drbd_m_holder);
        if (IS_ERR(bdev)) {
-               dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
+               dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
                        PTR_ERR(bdev));
                retcode = ERR_OPEN_MD_DISK;
                goto fail;
        nbc->md_bdev = bdev;
  
        if ((nbc->backing_bdev == nbc->md_bdev) !=
-           (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
-            nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
+           (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+            new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
                retcode = ERR_MD_IDX_INVALID;
                goto fail;
        }
  
        resync_lru = lc_create("resync", drbd_bm_ext_cache,
-                       61, sizeof(struct bm_extent),
+                       1, 61, sizeof(struct bm_extent),
                        offsetof(struct bm_extent, lce));
        if (!resync_lru) {
                retcode = ERR_NOMEM;
        /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
        drbd_md_set_sector_offsets(mdev, nbc);
  
-       if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) {
+       if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
                dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
                        (unsigned long long) drbd_get_max_capacity(nbc),
-                       (unsigned long long) nbc->dc.disk_size);
+                       (unsigned long long) new_disk_conf->disk_size);
                retcode = ERR_DISK_TOO_SMALL;
                goto fail;
        }
  
-       if (nbc->dc.meta_dev_idx < 0) {
+       if (new_disk_conf->meta_dev_idx < 0) {
                max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
                /* at least one MB, otherwise it does not make sense */
                min_md_device_sectors = (2<<10);
        } else {
                max_possible_sectors = DRBD_MAX_SECTORS;
-               min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1);
+               min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1);
        }
  
        if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
                dev_warn(DEV, "==> truncating very big lower level device "
                        "to currently maximum possible %llu sectors <==\n",
                        (unsigned long long) max_possible_sectors);
-               if (nbc->dc.meta_dev_idx >= 0)
+               if (new_disk_conf->meta_dev_idx >= 0)
                        dev_warn(DEV, "==>> using internal or flexible "
                                      "meta data may help <<==\n");
        }
  
        drbd_suspend_io(mdev);
        /* also wait for the last barrier ack. */
-       wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || is_susp(mdev->state));
+       /* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171
+        * We need a way to either ignore barrier acks for barriers sent before a device
+        * was attached, or a way to wait for all pending barrier acks to come in.
+        * As barriers are counted per resource,
+        * we'd need to suspend io on all devices of a resource.
+        */
+       wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || drbd_suspended(mdev));
        /* and for any other previously queued work */
        drbd_flush_workqueue(mdev);
  
  
        drbd_md_set_sector_offsets(mdev, nbc);
  
-       /* allocate a second IO page if logical_block_size != 512 */
-       logical_block_size = bdev_logical_block_size(nbc->md_bdev);
-       if (logical_block_size == 0)
-               logical_block_size = MD_SECTOR_SIZE;
-       if (logical_block_size != MD_SECTOR_SIZE) {
-               if (!mdev->md_io_tmpp) {
-                       struct page *page = alloc_page(GFP_NOIO);
-                       if (!page)
-                               goto force_diskless_dec;
-                       dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n",
-                            logical_block_size, MD_SECTOR_SIZE);
-                       dev_warn(DEV, "Workaround engaged (has performance impact).\n");
-                       mdev->md_io_tmpp = page;
-               }
-       }
        if (!mdev->bitmap) {
                if (drbd_bm_init(mdev)) {
                        retcode = ERR_NOMEM;
        }
  
        /* Since we are diskless, fix the activity log first... */
-       if (drbd_check_al_size(mdev)) {
+       if (drbd_check_al_size(mdev, new_disk_conf)) {
                retcode = ERR_NOMEM;
                goto force_diskless_dec;
        }
  
        /* Prevent shrinking of consistent devices ! */
        if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
-           drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) {
+           drbd_new_dev_size(mdev, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) {
                dev_warn(DEV, "refusing to truncate a consistent device\n");
                retcode = ERR_DISK_TOO_SMALL;
                goto force_diskless_dec;
        }
  
-       if (!drbd_al_read_log(mdev, nbc)) {
-               retcode = ERR_IO_MD_DISK;
-               goto force_diskless_dec;
-       }
        /* Reset the "barriers don't work" bits here, then force meta data to
         * be written, to ensure we determine if barriers are supported. */
-       if (nbc->dc.no_md_flush)
-               drbd_set_flag(mdev, MD_NO_FUA);
+       if (new_disk_conf->md_flushes)
+               clear_bit(MD_NO_FUA, &mdev->flags);
        else
-               drbd_clear_flag(mdev, MD_NO_FUA);
+               set_bit(MD_NO_FUA, &mdev->flags);
  
        /* Point of no return reached.
         * Devices and memory are no longer released by error cleanup below.
        D_ASSERT(mdev->ldev == NULL);
        mdev->ldev = nbc;
        mdev->resync = resync_lru;
+       mdev->rs_plan_s = new_plan;
        nbc = NULL;
        resync_lru = NULL;
+       new_disk_conf = NULL;
+       new_plan = NULL;
  
-       mdev->write_ordering = WO_bdev_flush;
-       drbd_bump_write_ordering(mdev, WO_bdev_flush);
+       drbd_bump_write_ordering(mdev->tconn, WO_bdev_flush);
  
        if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
-               drbd_set_flag(mdev, CRASHED_PRIMARY);
+               set_bit(CRASHED_PRIMARY, &mdev->flags);
        else
-               drbd_clear_flag(mdev, CRASHED_PRIMARY);
+               clear_bit(CRASHED_PRIMARY, &mdev->flags);
  
        if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
-           !(mdev->state.role == R_PRIMARY && mdev->state.susp_nod)) {
-               drbd_set_flag(mdev, CRASHED_PRIMARY);
-               cp_discovered = 1;
-       }
+           !(mdev->state.role == R_PRIMARY && mdev->tconn->susp_nod))
+               set_bit(CRASHED_PRIMARY, &mdev->flags);
  
        mdev->send_cnt = 0;
        mdev->recv_cnt = 0;
         * so we can automatically recover from a crash of a
         * degraded but active "cluster" after a certain timeout.
         */
-       drbd_clear_flag(mdev, USE_DEGR_WFC_T);
+       clear_bit(USE_DEGR_WFC_T, &mdev->flags);
        if (mdev->state.role != R_PRIMARY &&
             drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
            !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
-               drbd_set_flag(mdev, USE_DEGR_WFC_T);
+               set_bit(USE_DEGR_WFC_T, &mdev->flags);
  
        dd = drbd_determine_dev_size(mdev, 0);
        if (dd == dev_size_error) {
                retcode = ERR_NOMEM_BITMAP;
                goto force_diskless_dec;
        } else if (dd == grew)
-               drbd_set_flag(mdev, RESYNC_AFTER_NEG);
+               set_bit(RESYNC_AFTER_NEG, &mdev->flags);
  
-       if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
+       if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC) ||
+           (test_bit(CRASHED_PRIMARY, &mdev->flags) &&
+            drbd_md_test_flag(mdev->ldev, MDF_AL_DISABLED))) {
                dev_info(DEV, "Assuming that all blocks are out of sync "
                     "(aka FullSync)\n");
                if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
                }
        } else {
                if (drbd_bitmap_io(mdev, &drbd_bm_read,
-                       "read from attaching", BM_LOCKED_MASK) < 0) {
-                       retcode = ERR_IO_MD_DISK;
-                       goto force_diskless_dec;
-               }
-       }
-       if (cp_discovered) {
-               drbd_al_apply_to_bm(mdev);
-               if (drbd_bitmap_io(mdev, &drbd_bm_write,
-                       "crashed primary apply AL", BM_LOCKED_MASK)) {
+                       "read from attaching", BM_LOCKED_MASK)) {
                        retcode = ERR_IO_MD_DISK;
                        goto force_diskless_dec;
                }
        if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev))
                drbd_suspend_al(mdev); /* IO is still suspended here... */
  
-       spin_lock_irq(&mdev->req_lock);
-       os = mdev->state;
-       ns.i = os.i;
+       spin_lock_irq(&mdev->tconn->req_lock);
+       os = drbd_read_state(mdev);
+       ns = os;
        /* If MDF_CONSISTENT is not set go into inconsistent state,
           otherwise investigate MDF_WasUpToDate...
           If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
        if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED))
                ns.pdsk = D_OUTDATED;
  
-       if ( ns.disk == D_CONSISTENT &&
-           (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE))
+       rcu_read_lock();
+       if (ns.disk == D_CONSISTENT &&
+           (ns.pdsk == D_OUTDATED || rcu_dereference(mdev->ldev->disk_conf)->fencing == FP_DONT_CARE))
                ns.disk = D_UP_TO_DATE;
  
        /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
           this point, because drbd_request_state() modifies these
           flags. */
  
+       if (rcu_dereference(mdev->ldev->disk_conf)->al_updates)
+               mdev->ldev->md.flags &= ~MDF_AL_DISABLED;
+       else
+               mdev->ldev->md.flags |= MDF_AL_DISABLED;
+       rcu_read_unlock();
        /* In case we are C_CONNECTED postpone any decision on the new disk
           state after the negotiation phase. */
        if (mdev->state.conn == C_CONNECTED) {
        }
  
        rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
-       ns = mdev->state;
-       spin_unlock_irq(&mdev->req_lock);
+       spin_unlock_irq(&mdev->tconn->req_lock);
  
        if (rv < SS_SUCCESS)
                goto force_diskless_dec;
  
+       mod_timer(&mdev->request_timer, jiffies + HZ);
        if (mdev->state.role == R_PRIMARY)
                mdev->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
        else
  
        kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
        put_ldev(mdev);
-       reply->ret_code = retcode;
-       drbd_reconfig_done(mdev);
+       conn_reconfig_done(mdev->tconn);
+       drbd_adm_finish(info, retcode);
        return 0;
  
   force_diskless_dec:
        put_ldev(mdev);
   force_diskless:
-       drbd_force_state(mdev, NS(disk, D_FAILED));
+       drbd_force_state(mdev, NS(disk, D_DISKLESS));
        drbd_md_sync(mdev);
   fail:
+       conn_reconfig_done(mdev->tconn);
        if (nbc) {
                if (nbc->backing_bdev)
                        blkdev_put(nbc->backing_bdev,
                                   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
                kfree(nbc);
        }
+       kfree(new_disk_conf);
        lc_destroy(resync_lru);
+       kfree(new_plan);
  
-       reply->ret_code = retcode;
-       drbd_reconfig_done(mdev);
+  finish:
+       drbd_adm_finish(info, retcode);
        return 0;
  }
  
- /* Detaching the disk is a process in multiple stages.  First we need to lock
-  * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
-  * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
-  * internal references as well.
-  * Only then we have finally detached. */
- static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                         struct drbd_nl_cfg_reply *reply)
+ static int adm_detach(struct drbd_conf *mdev, int force)
  {
-       enum drbd_ret_code retcode;
+       enum drbd_state_rv retcode;
        int ret;
-       struct detach dt = {};
  
-       if (!detach_from_tags(mdev, nlp->tag_list, &dt)) {
-               reply->ret_code = ERR_MANDATORY_TAG;
-               goto out;
-       }
-       if (dt.detach_force) {
-               drbd_set_flag(mdev, FORCE_DETACH);
+       if (force) {
+               set_bit(FORCE_DETACH, &mdev->flags);
                drbd_force_state(mdev, NS(disk, D_FAILED));
-               reply->ret_code = SS_SUCCESS;
+               retcode = SS_SUCCESS;
                goto out;
        }
  
        ret = wait_event_interruptible(mdev->misc_wait,
                        mdev->state.disk != D_FAILED);
        drbd_resume_io(mdev);
        if ((int)retcode == (int)SS_IS_DISKLESS)
                retcode = SS_NOTHING_TO_DO;
        if (ret)
                retcode = ERR_INTR;
-       reply->ret_code = retcode;
  out:
-       return 0;
+       return retcode;
  }
  
- static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                           struct drbd_nl_cfg_reply *reply)
+ /* Detaching the disk is a process in multiple stages.  First we need to lock
+  * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
+  * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
+  * internal references as well.
+  * Only then we have finally detached. */
+ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
  {
-       int i, ns;
        enum drbd_ret_code retcode;
-       struct net_conf *new_conf = NULL;
-       struct crypto_hash *tfm = NULL;
-       struct crypto_hash *integrity_w_tfm = NULL;
-       struct crypto_hash *integrity_r_tfm = NULL;
-       struct hlist_head *new_tl_hash = NULL;
-       struct hlist_head *new_ee_hash = NULL;
-       struct drbd_conf *odev;
-       char hmac_name[CRYPTO_MAX_ALG_NAME];
-       void *int_dig_out = NULL;
-       void *int_dig_in = NULL;
-       void *int_dig_vv = NULL;
-       struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr;
+       struct detach_parms parms = { };
+       int err;
  
-       drbd_reconfig_start(mdev);
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
  
-       if (mdev->state.conn > C_STANDALONE) {
-               retcode = ERR_NET_CONFIGURED;
-               goto fail;
+       if (info->attrs[DRBD_NLA_DETACH_PARMS]) {
+               err = detach_parms_from_attrs(&parms, info);
+               if (err) {
+                       retcode = ERR_MANDATORY_TAG;
+                       drbd_msg_put_info(from_attrs_err_to_txt(err));
+                       goto out;
+               }
        }
  
-       /* allocation not in the IO path, cqueue thread context */
-       new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
-       if (!new_conf) {
-               retcode = ERR_NOMEM;
-               goto fail;
-       }
+       retcode = adm_detach(adm_ctx.mdev, parms.force_detach);
+ out:
+       drbd_adm_finish(info, retcode);
+       return 0;
+ }
  
-       new_conf->timeout          = DRBD_TIMEOUT_DEF;
-       new_conf->try_connect_int  = DRBD_CONNECT_INT_DEF;
-       new_conf->ping_int         = DRBD_PING_INT_DEF;
-       new_conf->max_epoch_size   = DRBD_MAX_EPOCH_SIZE_DEF;
-       new_conf->max_buffers      = DRBD_MAX_BUFFERS_DEF;
-       new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF;
-       new_conf->sndbuf_size      = DRBD_SNDBUF_SIZE_DEF;
-       new_conf->rcvbuf_size      = DRBD_RCVBUF_SIZE_DEF;
-       new_conf->ko_count         = DRBD_KO_COUNT_DEF;
-       new_conf->after_sb_0p      = DRBD_AFTER_SB_0P_DEF;
-       new_conf->after_sb_1p      = DRBD_AFTER_SB_1P_DEF;
-       new_conf->after_sb_2p      = DRBD_AFTER_SB_2P_DEF;
-       new_conf->want_lose        = 0;
-       new_conf->two_primaries    = 0;
-       new_conf->wire_protocol    = DRBD_PROT_C;
-       new_conf->ping_timeo       = DRBD_PING_TIMEO_DEF;
-       new_conf->rr_conflict      = DRBD_RR_CONFLICT_DEF;
-       new_conf->on_congestion    = DRBD_ON_CONGESTION_DEF;
-       new_conf->cong_extents     = DRBD_CONG_EXTENTS_DEF;
-       if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) {
-               retcode = ERR_MANDATORY_TAG;
-               goto fail;
+ static bool conn_resync_running(struct drbd_tconn *tconn)
+ {
+       struct drbd_conf *mdev;
+       bool rv = false;
+       int vnr;
+
+       rcu_read_lock();
+       idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+               if (mdev->state.conn == C_SYNC_SOURCE ||
+                   mdev->state.conn == C_SYNC_TARGET ||
+                   mdev->state.conn == C_PAUSED_SYNC_S ||
+                   mdev->state.conn == C_PAUSED_SYNC_T) {
+                       rv = true;
+                       break;
+               }
        }
+       rcu_read_unlock();
  
-       if (new_conf->two_primaries
-           && (new_conf->wire_protocol != DRBD_PROT_C)) {
-               retcode = ERR_NOT_PROTO_C;
-               goto fail;
-       }
+       return rv;
+ }
  
-       if (get_ldev(mdev)) {
-               enum drbd_fencing_p fp = mdev->ldev->dc.fencing;
-               put_ldev(mdev);
-               if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) {
-                       retcode = ERR_STONITH_AND_PROT_A;
-                       goto fail;
+ static bool conn_ov_running(struct drbd_tconn *tconn)
+ {
+       struct drbd_conf *mdev;
+       bool rv = false;
+       int vnr;
+
+       rcu_read_lock();
+       idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+               if (mdev->state.conn == C_VERIFY_S ||
+                   mdev->state.conn == C_VERIFY_T) {
+                       rv = true;
+                       break;
                }
        }
+       rcu_read_unlock();
  
-       if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) {
-               retcode = ERR_CONG_NOT_PROTO_A;
-               goto fail;
-       }
+       return rv;
+ }
  
-       if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
-               retcode = ERR_DISCARD;
-               goto fail;
-       }
+ static enum drbd_ret_code
+ _check_net_options(struct drbd_tconn *tconn, struct net_conf *old_conf, struct net_conf *new_conf)
+ {
+       struct drbd_conf *mdev;
+       int i;
  
-       retcode = NO_ERROR;
+       if (old_conf && tconn->cstate == C_WF_REPORT_PARAMS && tconn->agreed_pro_version < 100) {
+               if (new_conf->wire_protocol != old_conf->wire_protocol)
+                       return ERR_NEED_APV_100;
  
-       new_my_addr = (struct sockaddr *)&new_conf->my_addr;
-       new_peer_addr = (struct sockaddr *)&new_conf->peer_addr;
-       for (i = 0; i < minor_count; i++) {
-               odev = minor_to_mdev(i);
-               if (!odev || odev == mdev)
-                       continue;
-               if (get_net_conf(odev)) {
-                       taken_addr = (struct sockaddr *)&odev->net_conf->my_addr;
-                       if (new_conf->my_addr_len == odev->net_conf->my_addr_len &&
-                           !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len))
-                               retcode = ERR_LOCAL_ADDR;
-                       taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr;
-                       if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len &&
-                           !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len))
-                               retcode = ERR_PEER_ADDR;
-                       put_net_conf(odev);
-                       if (retcode != NO_ERROR)
-                               goto fail;
-               }
+               if (new_conf->two_primaries != old_conf->two_primaries)
+                       return ERR_NEED_APV_100;
 -              if (!new_conf->integrity_alg != !old_conf->integrity_alg)
 -                      return ERR_NEED_APV_100;
 -
+               if (strcmp(new_conf->integrity_alg, old_conf->integrity_alg))
+                       return ERR_NEED_APV_100;
        }
  
-       if (new_conf->cram_hmac_alg[0] != 0) {
-               snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
-                       new_conf->cram_hmac_alg);
-               tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC);
-               if (IS_ERR(tfm)) {
-                       tfm = NULL;
-                       retcode = ERR_AUTH_ALG;
-                       goto fail;
-               }
+       if (!new_conf->two_primaries &&
+           conn_highest_role(tconn) == R_PRIMARY &&
+           conn_highest_peer(tconn) == R_PRIMARY)
+               return ERR_NEED_ALLOW_TWO_PRI;
  
-               if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
-                       retcode = ERR_AUTH_ALG_ND;
-                       goto fail;
+       if (new_conf->two_primaries &&
+           (new_conf->wire_protocol != DRBD_PROT_C))
+               return ERR_NOT_PROTO_C;
+
+       idr_for_each_entry(&tconn->volumes, mdev, i) {
+               if (get_ldev(mdev)) {
+                       enum drbd_fencing_p fp = rcu_dereference(mdev->ldev->disk_conf)->fencing;
+                       put_ldev(mdev);
+                       if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH)
+                               return ERR_STONITH_AND_PROT_A;
                }
+               if (mdev->state.role == R_PRIMARY && new_conf->discard_my_data)
+                       return ERR_DISCARD_IMPOSSIBLE;
        }
  
-       if (new_conf->integrity_alg[0]) {
-               integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
-               if (IS_ERR(integrity_w_tfm)) {
-                       integrity_w_tfm = NULL;
-                       retcode=ERR_INTEGRITY_ALG;
-                       goto fail;
-               }
+       if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A)
+               return ERR_CONG_NOT_PROTO_A;
  
-               if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) {
-                       retcode=ERR_INTEGRITY_ALG_ND;
-                       goto fail;
-               }
+       return NO_ERROR;
+ }
  
-               integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
-               if (IS_ERR(integrity_r_tfm)) {
-                       integrity_r_tfm = NULL;
-                       retcode=ERR_INTEGRITY_ALG;
-                       goto fail;
-               }
-       }
+ static enum drbd_ret_code
+ check_net_options(struct drbd_tconn *tconn, struct net_conf *new_conf)
+ {
+       static enum drbd_ret_code rv;
+       struct drbd_conf *mdev;
+       int i;
  
-       ns = new_conf->max_epoch_size/8;
-       if (mdev->tl_hash_s != ns) {
-               new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
-               if (!new_tl_hash) {
-                       retcode = ERR_NOMEM;
-                       goto fail;
-               }
-       }
+       rcu_read_lock();
+       rv = _check_net_options(tconn, rcu_dereference(tconn->net_conf), new_conf);
+       rcu_read_unlock();
  
-       ns = new_conf->max_buffers/8;
-       if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) {
-               new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
-               if (!new_ee_hash) {
-                       retcode = ERR_NOMEM;
-                       goto fail;
+       /* tconn->volumes protected by genl_lock() here */
+       idr_for_each_entry(&tconn->volumes, mdev, i) {
+               if (!mdev->bitmap) {
+                       if (drbd_bm_init(mdev))
+                               return ERR_NOMEM;
                }
        }
  
-       ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
+       return rv;
+ }
  
-       if (integrity_w_tfm) {
-               i = crypto_hash_digestsize(integrity_w_tfm);
-               int_dig_out = kmalloc(i, GFP_KERNEL);
-               if (!int_dig_out) {
-                       retcode = ERR_NOMEM;
-                       goto fail;
-               }
-               int_dig_in = kmalloc(i, GFP_KERNEL);
-               if (!int_dig_in) {
-                       retcode = ERR_NOMEM;
-                       goto fail;
-               }
-               int_dig_vv = kmalloc(i, GFP_KERNEL);
-               if (!int_dig_vv) {
-                       retcode = ERR_NOMEM;
-                       goto fail;
-               }
-       }
+ struct crypto {
+       struct crypto_hash *verify_tfm;
+       struct crypto_hash *csums_tfm;
+       struct crypto_hash *cram_hmac_tfm;
+       struct crypto_hash *integrity_tfm;
+ };
  
-       if (!mdev->bitmap) {
-               if(drbd_bm_init(mdev)) {
-                       retcode = ERR_NOMEM;
-                       goto fail;
-               }
-       }
+ static int
+ alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg)
+ {
+       if (!tfm_name[0])
+               return NO_ERROR;
  
-       drbd_flush_workqueue(mdev);
-       spin_lock_irq(&mdev->req_lock);
-       if (mdev->net_conf != NULL) {
-               retcode = ERR_NET_CONFIGURED;
-               spin_unlock_irq(&mdev->req_lock);
-               goto fail;
+       *tfm = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC);
+       if (IS_ERR(*tfm)) {
+               *tfm = NULL;
+               return err_alg;
        }
-       mdev->net_conf = new_conf;
  
-       mdev->send_cnt = 0;
-       mdev->recv_cnt = 0;
+       return NO_ERROR;
+ }
  
-       if (new_tl_hash) {
-               kfree(mdev->tl_hash);
-               mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8;
-               mdev->tl_hash = new_tl_hash;
-       }
+ static enum drbd_ret_code
+ alloc_crypto(struct crypto *crypto, struct net_conf *new_conf)
+ {
+       char hmac_name[CRYPTO_MAX_ALG_NAME];
+       enum drbd_ret_code rv;
+       rv = alloc_hash(&crypto->csums_tfm, new_conf->csums_alg,
+                      ERR_CSUMS_ALG);
+       if (rv != NO_ERROR)
+               return rv;
+       rv = alloc_hash(&crypto->verify_tfm, new_conf->verify_alg,
+                      ERR_VERIFY_ALG);
+       if (rv != NO_ERROR)
+               return rv;
+       rv = alloc_hash(&crypto->integrity_tfm, new_conf->integrity_alg,
+                      ERR_INTEGRITY_ALG);
+       if (rv != NO_ERROR)
+               return rv;
+       if (new_conf->cram_hmac_alg[0] != 0) {
+               snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
+                        new_conf->cram_hmac_alg);
  
-       if (new_ee_hash) {
-               kfree(mdev->ee_hash);
-               mdev->ee_hash_s = mdev->net_conf->max_buffers/8;
-               mdev->ee_hash = new_ee_hash;
+               rv = alloc_hash(&crypto->cram_hmac_tfm, hmac_name,
+                              ERR_AUTH_ALG);
        }
  
-       crypto_free_hash(mdev->cram_hmac_tfm);
-       mdev->cram_hmac_tfm = tfm;
+       return rv;
+ }
  
-       crypto_free_hash(mdev->integrity_w_tfm);
-       mdev->integrity_w_tfm = integrity_w_tfm;
+ static void free_crypto(struct crypto *crypto)
+ {
+       crypto_free_hash(crypto->cram_hmac_tfm);
+       crypto_free_hash(crypto->integrity_tfm);
+       crypto_free_hash(crypto->csums_tfm);
+       crypto_free_hash(crypto->verify_tfm);
+ }
  
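alloc_hash()/alloc_crypto()/free_crypto() follow the classic allocate-or-bail pattern: each helper fills one slot or returns an error code, and the single cleanup routine is safe on a partially filled struct (crypto_free_hash(NULL) is a no-op, which is why the fail paths below can call free_crypto() on a zero-initialized struct). A hedged plain-C sketch of the same trio, with strdup()/free() standing in for crypto_alloc_hash()/crypto_free_hash():

    #include <stdlib.h>
    #include <string.h>

    struct crypto { char *verify; char *csums; char *hmac; }; /* stand-ins */

    /* Fill one slot unless the name is empty; report err_code on failure. */
    static int alloc_one(char **slot, const char *name, int err_code)
    {
            if (!name[0])
                    return 0;          /* nothing requested: success */
            *slot = strdup(name);      /* stands in for crypto_alloc_hash() */
            return *slot ? 0 : err_code;
    }

    static int alloc_all(struct crypto *c)
    {
            int rv;

            if ((rv = alloc_one(&c->csums, "crc32c", 1)))
                    return rv;
            if ((rv = alloc_one(&c->verify, "sha1", 2)))
                    return rv;
            return alloc_one(&c->hmac, "hmac(sha1)", 3);
    }

    /* Safe on a partially filled struct, because free(NULL) is a no-op. */
    static void free_all(struct crypto *c)
    {
            free(c->hmac);
            free(c->verify);
            free(c->csums);
    }

    int main(void)
    {
            struct crypto c = { 0 };
            int rv = alloc_all(&c);

            free_all(&c); /* runs on success and on partial failure alike */
            return rv;
    }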
-       crypto_free_hash(mdev->integrity_r_tfm);
-       mdev->integrity_r_tfm = integrity_r_tfm;
+ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
+ {
+       enum drbd_ret_code retcode;
+       struct drbd_tconn *tconn;
+       struct net_conf *old_conf, *new_conf = NULL;
+       int err;
+       int ovr; /* online verify running */
+       int rsr; /* re-sync running */
+       struct crypto crypto = { };
  
-       kfree(mdev->int_dig_out);
-       kfree(mdev->int_dig_in);
-       kfree(mdev->int_dig_vv);
-       mdev->int_dig_out=int_dig_out;
-       mdev->int_dig_in=int_dig_in;
-       mdev->int_dig_vv=int_dig_vv;
-       retcode = _drbd_set_state(_NS(mdev, conn, C_UNCONNECTED), CS_VERBOSE, NULL);
-       spin_unlock_irq(&mdev->req_lock);
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
  
-       kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
-       reply->ret_code = retcode;
-       drbd_reconfig_done(mdev);
-       return 0;
+       tconn = adm_ctx.tconn;
  
- fail:
-       kfree(int_dig_out);
-       kfree(int_dig_in);
-       kfree(int_dig_vv);
-       crypto_free_hash(tfm);
-       crypto_free_hash(integrity_w_tfm);
-       crypto_free_hash(integrity_r_tfm);
-       kfree(new_tl_hash);
-       kfree(new_ee_hash);
-       kfree(new_conf);
+       new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
+       if (!new_conf) {
+               retcode = ERR_NOMEM;
+               goto out;
+       }
  
-       reply->ret_code = retcode;
-       drbd_reconfig_done(mdev);
-       return 0;
- }
+       conn_reconfig_start(tconn);
  
- static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                             struct drbd_nl_cfg_reply *reply)
- {
-       int retcode;
-       struct disconnect dc;
+       mutex_lock(&tconn->data.mutex);
+       mutex_lock(&tconn->conf_update);
+       old_conf = tconn->net_conf;
  
-       memset(&dc, 0, sizeof(struct disconnect));
-       if (!disconnect_from_tags(mdev, nlp->tag_list, &dc)) {
-               retcode = ERR_MANDATORY_TAG;
+       if (!old_conf) {
+               drbd_msg_put_info("net conf missing, try connect");
+               retcode = ERR_INVALID_REQUEST;
                goto fail;
        }
  
-       if (dc.force) {
-               spin_lock_irq(&mdev->req_lock);
-               if (mdev->state.conn >= C_WF_CONNECTION)
-                       _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), CS_HARD, NULL);
-               spin_unlock_irq(&mdev->req_lock);
-               goto done;
+       *new_conf = *old_conf;
+       if (should_set_defaults(info))
+               set_net_conf_defaults(new_conf);
+       err = net_conf_from_attrs_for_change(new_conf, info);
+       if (err && err != -ENOMSG) {
+               retcode = ERR_MANDATORY_TAG;
+               drbd_msg_put_info(from_attrs_err_to_txt(err));
+               goto fail;
        }
  
-       retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED);
+       retcode = check_net_options(tconn, new_conf);
+       if (retcode != NO_ERROR)
+               goto fail;
  
-       if (retcode == SS_NOTHING_TO_DO)
-               goto done;
-       else if (retcode == SS_ALREADY_STANDALONE)
-               goto done;
-       else if (retcode == SS_PRIMARY_NOP) {
-               /* Our statche checking code wants to see the peer outdated. */
-               retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
-                                                     pdsk, D_OUTDATED));
-       } else if (retcode == SS_CW_FAILED_BY_PEER) {
-               /* The peer probably wants to see us outdated. */
-               retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
-                                                       disk, D_OUTDATED),
-                                             CS_ORDERED);
-               if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) {
-                       drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-                       retcode = SS_SUCCESS;
-               }
+       /* re-sync running */
+       rsr = conn_resync_running(tconn);
+       if (rsr && strcmp(new_conf->csums_alg, old_conf->csums_alg)) {
+               retcode = ERR_CSUMS_RESYNC_RUNNING;
+               goto fail;
        }
  
-       if (retcode < SS_SUCCESS)
+       /* online verify running */
+       ovr = conn_ov_running(tconn);
+       if (ovr && strcmp(new_conf->verify_alg, old_conf->verify_alg)) {
+               retcode = ERR_VERIFY_RUNNING;
                goto fail;
+       }
  
-       if (wait_event_interruptible(mdev->state_wait,
-                                    mdev->state.conn != C_DISCONNECTING)) {
-               /* Do not test for mdev->state.conn == C_STANDALONE, since
-                  someone else might connect us in the mean time! */
-               retcode = ERR_INTR;
+       retcode = alloc_crypto(&crypto, new_conf);
+       if (retcode != NO_ERROR)
                goto fail;
+       rcu_assign_pointer(tconn->net_conf, new_conf);
+       if (!rsr) {
+               crypto_free_hash(tconn->csums_tfm);
+               tconn->csums_tfm = crypto.csums_tfm;
+               crypto.csums_tfm = NULL;
+       }
+       if (!ovr) {
+               crypto_free_hash(tconn->verify_tfm);
+               tconn->verify_tfm = crypto.verify_tfm;
+               crypto.verify_tfm = NULL;
        }
  
-  done:
-       retcode = NO_ERROR;
+       crypto_free_hash(tconn->integrity_tfm);
+       tconn->integrity_tfm = crypto.integrity_tfm;
+       if (tconn->cstate >= C_WF_REPORT_PARAMS && tconn->agreed_pro_version >= 100)
+               /* Do this without trying to take tconn->data.mutex again.  */
+               __drbd_send_protocol(tconn, P_PROTOCOL_UPDATE);
+       crypto_free_hash(tconn->cram_hmac_tfm);
+       tconn->cram_hmac_tfm = crypto.cram_hmac_tfm;
+       mutex_unlock(&tconn->conf_update);
+       mutex_unlock(&tconn->data.mutex);
+       synchronize_rcu();
+       kfree(old_conf);
+       if (tconn->cstate >= C_WF_REPORT_PARAMS)
+               drbd_send_sync_param(minor_to_mdev(conn_lowest_minor(tconn)));
+       goto done;
   fail:
-       drbd_md_sync(mdev);
-       reply->ret_code = retcode;
+       mutex_unlock(&tconn->conf_update);
+       mutex_unlock(&tconn->data.mutex);
+       free_crypto(&crypto);
+       kfree(new_conf);
+  done:
+       conn_reconfig_done(tconn);
+  out:
+       drbd_adm_finish(info, retcode);
        return 0;
  }
  
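The update path in drbd_adm_net_opts() is a textbook RCU pointer swap: copy the old conf, modify the copy, publish it with rcu_assign_pointer(), and free the old copy only after synchronize_rcu() guarantees no reader still sees it. A single-threaded sketch of that ordering, using a C11 release store where the kernel uses rcu_assign_pointer() (all names hypothetical):

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct conf { int timeout; };          /* hypothetical payload */

    static _Atomic(struct conf *) live_conf;

    /* Copy, modify, publish, then reclaim. */
    static int update_conf(int new_timeout)
    {
            struct conf *old_c = atomic_load(&live_conf);
            struct conf *new_c = malloc(sizeof(*new_c));

            if (!new_c)
                    return -1;
            *new_c = *old_c;               /* *new_conf = *old_conf */
            new_c->timeout = new_timeout;
            atomic_store_explicit(&live_conf, new_c,
                                  memory_order_release); /* rcu_assign_pointer() */
            /* synchronize_rcu() would wait here until no reader can
             * still hold old_c ... */
            free(old_c);                   /* ... only then kfree(old_conf) */
            return 0;
    }

    int main(void)
    {
            struct conf *c = malloc(sizeof(*c));

            if (!c)
                    return 1;
            c->timeout = 60;
            atomic_store(&live_conf, c);
            update_conf(6);
            printf("timeout is now %d\n", atomic_load(&live_conf)->timeout);
            free(atomic_load(&live_conf));
            return 0;
    }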
- void resync_after_online_grow(struct drbd_conf *mdev)
+ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
  {
-       int iass; /* I am sync source */
+       struct drbd_conf *mdev;
+       struct net_conf *old_conf, *new_conf = NULL;
+       struct crypto crypto = { };
+       struct drbd_tconn *tconn;
+       enum drbd_ret_code retcode;
+       int i;
+       int err;
  
-       dev_info(DEV, "Resync of new storage after online grow\n");
-       if (mdev->state.role != mdev->state.peer)
-               iass = (mdev->state.role == R_PRIMARY);
-       else
-               iass = drbd_test_flag(mdev, DISCARD_CONCURRENT);
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
  
-       if (iass)
-               drbd_start_resync(mdev, C_SYNC_SOURCE);
-       else
-               _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
- }
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
+       if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) {
+               drbd_msg_put_info("connection endpoint(s) missing");
+               retcode = ERR_INVALID_REQUEST;
+               goto out;
+       }
  
- static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                         struct drbd_nl_cfg_reply *reply)
- {
-       struct resize rs;
-       int retcode = NO_ERROR;
-       enum determine_dev_size dd;
-       enum dds_flags ddsf;
+       /* No need for _rcu here. All reconfiguration is
+        * strictly serialized on genl_lock(). We are protected against
+        * concurrent reconfiguration/addition/deletion */
+       list_for_each_entry(tconn, &drbd_tconns, all_tconn) {
+               if (nla_len(adm_ctx.my_addr) == tconn->my_addr_len &&
+                   !memcmp(nla_data(adm_ctx.my_addr), &tconn->my_addr, tconn->my_addr_len)) {
+                       retcode = ERR_LOCAL_ADDR;
+                       goto out;
+               }
  
-       memset(&rs, 0, sizeof(struct resize));
-       if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
-               retcode = ERR_MANDATORY_TAG;
-               goto fail;
+               if (nla_len(adm_ctx.peer_addr) == tconn->peer_addr_len &&
+                   !memcmp(nla_data(adm_ctx.peer_addr), &tconn->peer_addr, tconn->peer_addr_len)) {
+                       retcode = ERR_PEER_ADDR;
+                       goto out;
+               }
        }
  
-       if (mdev->state.conn > C_CONNECTED) {
-               retcode = ERR_RESIZE_RESYNC;
-               goto fail;
-       }
+       tconn = adm_ctx.tconn;
+       conn_reconfig_start(tconn);
  
-       if (mdev->state.role == R_SECONDARY &&
-           mdev->state.peer == R_SECONDARY) {
-               retcode = ERR_NO_PRIMARY;
+       if (tconn->cstate > C_STANDALONE) {
+               retcode = ERR_NET_CONFIGURED;
                goto fail;
        }
  
-       if (!get_ldev(mdev)) {
-               retcode = ERR_NO_DISK;
+       /* allocation not in the IO path, drbdsetup / netlink process context */
+       new_conf = kzalloc(sizeof(*new_conf), GFP_KERNEL);
+       if (!new_conf) {
+               retcode = ERR_NOMEM;
                goto fail;
        }
  
-       if (rs.no_resync && mdev->agreed_pro_version < 93) {
-               retcode = ERR_NEED_APV_93;
-               goto fail_ldev;
-       }
-       if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
-               mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
+       set_net_conf_defaults(new_conf);
  
-       mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
-       ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
-       dd = drbd_determine_dev_size(mdev, ddsf);
-       drbd_md_sync(mdev);
-       put_ldev(mdev);
-       if (dd == dev_size_error) {
-               retcode = ERR_NOMEM_BITMAP;
+       err = net_conf_from_attrs(new_conf, info);
+       if (err && err != -ENOMSG) {
+               retcode = ERR_MANDATORY_TAG;
+               drbd_msg_put_info(from_attrs_err_to_txt(err));
                goto fail;
        }
  
-       if (mdev->state.conn == C_CONNECTED) {
-               if (dd == grew)
-                       drbd_set_flag(mdev, RESIZE_PENDING);
+       retcode = check_net_options(tconn, new_conf);
+       if (retcode != NO_ERROR)
+               goto fail;
  
-               drbd_send_uuids(mdev);
-               drbd_send_sizes(mdev, 1, ddsf);
-       }
+       retcode = alloc_crypto(&crypto, new_conf);
+       if (retcode != NO_ERROR)
+               goto fail;
  
-  fail:
-       reply->ret_code = retcode;
-       return 0;
+       ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
  
-  fail_ldev:
-       put_ldev(mdev);
-       goto fail;
- }
+       conn_flush_workqueue(tconn);
  
- static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                              struct drbd_nl_cfg_reply *reply)
- {
-       int retcode = NO_ERROR;
-       int err;
-       int ovr; /* online verify running */
-       int rsr; /* re-sync running */
-       struct crypto_hash *verify_tfm = NULL;
-       struct crypto_hash *csums_tfm = NULL;
-       struct syncer_conf sc;
-       cpumask_var_t new_cpu_mask;
-       int *rs_plan_s = NULL;
-       int fifo_size;
-       if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
-               retcode = ERR_NOMEM;
+       mutex_lock(&tconn->conf_update);
+       old_conf = tconn->net_conf;
+       if (old_conf) {
+               retcode = ERR_NET_CONFIGURED;
+               mutex_unlock(&tconn->conf_update);
                goto fail;
        }
+       rcu_assign_pointer(tconn->net_conf, new_conf);
  
-       if (nlp->flags & DRBD_NL_SET_DEFAULTS) {
-               memset(&sc, 0, sizeof(struct syncer_conf));
-               sc.rate       = DRBD_RATE_DEF;
-               sc.after      = DRBD_AFTER_DEF;
-               sc.al_extents = DRBD_AL_EXTENTS_DEF;
-               sc.on_no_data  = DRBD_ON_NO_DATA_DEF;
-               sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF;
-               sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF;
-               sc.c_fill_target = DRBD_C_FILL_TARGET_DEF;
-               sc.c_max_rate = DRBD_C_MAX_RATE_DEF;
-               sc.c_min_rate = DRBD_C_MIN_RATE_DEF;
-       } else
-               memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
+       conn_free_crypto(tconn);
+       tconn->cram_hmac_tfm = crypto.cram_hmac_tfm;
+       tconn->integrity_tfm = crypto.integrity_tfm;
+       tconn->csums_tfm = crypto.csums_tfm;
+       tconn->verify_tfm = crypto.verify_tfm;
  
-       if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) {
-               retcode = ERR_MANDATORY_TAG;
-               goto fail;
-       }
+       tconn->my_addr_len = nla_len(adm_ctx.my_addr);
+       memcpy(&tconn->my_addr, nla_data(adm_ctx.my_addr), tconn->my_addr_len);
+       tconn->peer_addr_len = nla_len(adm_ctx.peer_addr);
+       memcpy(&tconn->peer_addr, nla_data(adm_ctx.peer_addr), tconn->peer_addr_len);
  
-       /* re-sync running */
-       rsr = ( mdev->state.conn == C_SYNC_SOURCE ||
-               mdev->state.conn == C_SYNC_TARGET ||
-               mdev->state.conn == C_PAUSED_SYNC_S ||
-               mdev->state.conn == C_PAUSED_SYNC_T );
+       mutex_unlock(&tconn->conf_update);
  
-       if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) {
-               retcode = ERR_CSUMS_RESYNC_RUNNING;
-               goto fail;
+       rcu_read_lock();
+       idr_for_each_entry(&tconn->volumes, mdev, i) {
+               mdev->send_cnt = 0;
+               mdev->recv_cnt = 0;
        }
+       rcu_read_unlock();
  
-       if (!rsr && sc.csums_alg[0]) {
-               csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC);
-               if (IS_ERR(csums_tfm)) {
-                       csums_tfm = NULL;
-                       retcode = ERR_CSUMS_ALG;
-                       goto fail;
-               }
+       retcode = conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE);
  
-               if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) {
-                       retcode = ERR_CSUMS_ALG_ND;
-                       goto fail;
-               }
-       }
+       conn_reconfig_done(tconn);
+       drbd_adm_finish(info, retcode);
+       return 0;
  
-       /* online verify running */
-       ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T);
+ fail:
+       free_crypto(&crypto);
+       kfree(new_conf);
  
-       if (ovr) {
-               if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) {
-                       retcode = ERR_VERIFY_RUNNING;
-                       goto fail;
+       conn_reconfig_done(tconn);
+ out:
+       drbd_adm_finish(info, retcode);
+       return 0;
+ }
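The endpoint checks in drbd_adm_connect() reject a connect whose local or peer address is already taken by another tconn, comparing the length first and the bytes second. The same (len, memcmp) test, reduced to a hedged standalone sketch:

    #include <stdio.h>
    #include <string.h>

    struct endpoint { unsigned len; unsigned char addr[16]; }; /* invented */

    /* Duplicate only if both length and bytes match. */
    static int addr_in_use(const struct endpoint *tab, int n,
                           const unsigned char *addr, unsigned len)
    {
            int i;

            for (i = 0; i < n; i++)
                    if (tab[i].len == len && !memcmp(tab[i].addr, addr, len))
                            return 1;
            return 0;
    }

    int main(void)
    {
            struct endpoint tab[] = { { 4, { 10, 0, 0, 1 } } };
            unsigned char cand[4] = { 10, 0, 0, 1 };

            puts(addr_in_use(tab, 1, cand, 4) ? "ERR_LOCAL_ADDR" : "free");
            return 0;
    }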
+ 
+ static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool force)
+ {
+       enum drbd_state_rv rv;
+       rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING),
+                       force ? CS_HARD : 0);
+       switch (rv) {
+       case SS_NOTHING_TO_DO:
+               break;
+       case SS_ALREADY_STANDALONE:
+               return SS_SUCCESS;
+       case SS_PRIMARY_NOP:
+               /* Our state checking code wants to see the peer outdated. */
+               rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING,
+                                               pdsk, D_OUTDATED), CS_VERBOSE);
+               break;
+       case SS_CW_FAILED_BY_PEER:
+               /* The peer probably wants to see us outdated. */
+               rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING,
+                                                       disk, D_OUTDATED), 0);
+               if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) {
+                       rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING),
+                                       CS_HARD);
                }
+               break;
+       default:;
+               /* no special handling necessary */
+       }
+       if (rv >= SS_SUCCESS) {
+               enum drbd_state_rv rv2;
+               /* No one else can reconfigure the network while I am here.
+                * The state handling only uses drbd_thread_stop_nowait(),
+                * we want to really wait here until the receiver is no more.
+                */
+               drbd_thread_stop(&adm_ctx.tconn->receiver);
+               /* Race breaker.  This additional state change request may be
+                * necessary, if this was a forced disconnect during a receiver
+                * restart.  We may have "killed" the receiver thread just
+                * after drbdd_init() returned.  Typically, we should be
+                * C_STANDALONE already, now, and this becomes a no-op.
+                */
+               rv2 = conn_request_state(tconn, NS(conn, C_STANDALONE),
+                               CS_VERBOSE | CS_HARD);
+               if (rv2 < SS_SUCCESS)
+                       conn_err(tconn,
+                               "unexpected rv2=%d in conn_try_disconnect()\n",
+                               rv2);
        }
+       return rv;
+ }
  
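conn_try_disconnect() interprets the first state-change verdict and retries with a more specific target (peer outdated, own disk outdated, finally CS_HARD). A toy sketch of that escalation shape, with a stub standing in for conn_request_state() and invented string labels instead of state masks:

    #include <stdio.h>
    #include <string.h>

    enum rv { SS_SUCCESS = 1, SS_ALREADY_STANDALONE,
              SS_PRIMARY_NOP, SS_CW_FAILED_BY_PEER };

    /* Stub: accepts any request that already asks for an "outdated"
     * side, rejects the plain disconnect. */
    static enum rv request(const char *what)
    {
            printf("request: %s\n", what);
            return strstr(what, "outdated") ? SS_SUCCESS : SS_PRIMARY_NOP;
    }

    static enum rv try_disconnect(void)
    {
            enum rv rv = request("disconnect");

            switch (rv) {
            case SS_ALREADY_STANDALONE:
                    return SS_SUCCESS;     /* nothing left to do */
            case SS_PRIMARY_NOP:           /* peer must be marked outdated */
                    return request("disconnect, peer outdated");
            case SS_CW_FAILED_BY_PEER:     /* peer wants to see us outdated */
                    return request("disconnect, my disk outdated");
            default:
                    return rv;
            }
    }

    int main(void)
    {
            printf("result: %d\n", try_disconnect());
            return 0;
    }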
-       if (!ovr && sc.verify_alg[0]) {
-               verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC);
-               if (IS_ERR(verify_tfm)) {
-                       verify_tfm = NULL;
-                       retcode = ERR_VERIFY_ALG;
-                       goto fail;
-               }
+ int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info)
+ {
+       struct disconnect_parms parms;
+       struct drbd_tconn *tconn;
+       enum drbd_state_rv rv;
+       enum drbd_ret_code retcode;
+       int err;
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto fail;
  
-               if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) {
-                       retcode = ERR_VERIFY_ALG_ND;
+       tconn = adm_ctx.tconn;
+       memset(&parms, 0, sizeof(parms));
+       if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) {
+               err = disconnect_parms_from_attrs(&parms, info);
+               if (err) {
+                       retcode = ERR_MANDATORY_TAG;
+                       drbd_msg_put_info(from_attrs_err_to_txt(err));
                        goto fail;
                }
        }
  
-       /* silently ignore cpu mask on UP kernel */
-       if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) {
-               err = bitmap_parse(sc.cpu_mask, 32,
-                               cpumask_bits(new_cpu_mask), nr_cpu_ids);
+       rv = conn_try_disconnect(tconn, parms.force_disconnect);
+       if (rv < SS_SUCCESS)
+               retcode = rv;  /* FIXME: Type mismatch. */
+       else
+               retcode = NO_ERROR;
+  fail:
+       drbd_adm_finish(info, retcode);
+       return 0;
+ }
+ 
+ void resync_after_online_grow(struct drbd_conf *mdev)
+ {
+       int iass; /* I am sync source */
+       dev_info(DEV, "Resync of new storage after online grow\n");
+       if (mdev->state.role != mdev->state.peer)
+               iass = (mdev->state.role == R_PRIMARY);
+       else
+               iass = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags);
+       if (iass)
+               drbd_start_resync(mdev, C_SYNC_SOURCE);
+       else
+               _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
+ }
+ 
+ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
+ {
+       struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
+       struct resize_parms rs;
+       struct drbd_conf *mdev;
+       enum drbd_ret_code retcode;
+       enum determine_dev_size dd;
+       enum dds_flags ddsf;
+       sector_t u_size;
+       int err;
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto fail;
+       memset(&rs, 0, sizeof(struct resize_parms));
+       if (info->attrs[DRBD_NLA_RESIZE_PARMS]) {
+               err = resize_parms_from_attrs(&rs, info);
                if (err) {
-                       dev_warn(DEV, "bitmap_parse() failed with %d\n", err);
-                       retcode = ERR_CPU_MASK_PARSE;
+                       retcode = ERR_MANDATORY_TAG;
+                       drbd_msg_put_info(from_attrs_err_to_txt(err));
                        goto fail;
                }
        }
  
-       ERR_IF (sc.rate < 1) sc.rate = 1;
-       ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */
- #define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT)
-       if (sc.al_extents > AL_MAX) {
-               dev_err(DEV, "sc.al_extents > %d\n", AL_MAX);
-               sc.al_extents = AL_MAX;
+       mdev = adm_ctx.mdev;
+       if (mdev->state.conn > C_CONNECTED) {
+               retcode = ERR_RESIZE_RESYNC;
+               goto fail;
        }
- #undef AL_MAX
  
-       /* to avoid spurious errors when configuring minors before configuring
-        * the minors they depend on: if necessary, first create the minor we
-        * depend on */
-       if (sc.after >= 0)
-               ensure_mdev(sc.after, 1);
+       if (mdev->state.role == R_SECONDARY &&
+           mdev->state.peer == R_SECONDARY) {
+               retcode = ERR_NO_PRIMARY;
+               goto fail;
+       }
  
-       /* most sanity checks done, try to assign the new sync-after
-        * dependency.  need to hold the global lock in there,
-        * to avoid a race in the dependency loop check. */
-       retcode = drbd_alter_sa(mdev, sc.after);
-       if (retcode != NO_ERROR)
+       if (!get_ldev(mdev)) {
+               retcode = ERR_NO_DISK;
                goto fail;
+       }
  
-       fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
-       if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
-               rs_plan_s   = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
-               if (!rs_plan_s) {
-                       dev_err(DEV, "kmalloc of fifo_buffer failed");
+       if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) {
+               retcode = ERR_NEED_APV_93;
+               goto fail_ldev;
+       }
+       rcu_read_lock();
+       u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
+       rcu_read_unlock();
+       if (u_size != (sector_t)rs.resize_size) {
+               new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
+               if (!new_disk_conf) {
                        retcode = ERR_NOMEM;
-                       goto fail;
+                       goto fail_ldev;
                }
        }
  
-       /* ok, assign the rest of it as well.
-        * lock against receive_SyncParam() */
-       spin_lock(&mdev->peer_seq_lock);
-       mdev->sync_conf = sc;
+       if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
+               mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
  
-       if (!rsr) {
-               crypto_free_hash(mdev->csums_tfm);
-               mdev->csums_tfm = csums_tfm;
-               csums_tfm = NULL;
+       if (new_disk_conf) {
+               mutex_lock(&mdev->tconn->conf_update);
+               old_disk_conf = mdev->ldev->disk_conf;
+               *new_disk_conf = *old_disk_conf;
+               new_disk_conf->disk_size = (sector_t)rs.resize_size;
+               rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
+               mutex_unlock(&mdev->tconn->conf_update);
+               synchronize_rcu();
+               kfree(old_disk_conf);
        }
  
-       if (!ovr) {
-               crypto_free_hash(mdev->verify_tfm);
-               mdev->verify_tfm = verify_tfm;
-               verify_tfm = NULL;
+       ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
+       dd = drbd_determine_dev_size(mdev, ddsf);
+       drbd_md_sync(mdev);
+       put_ldev(mdev);
+       if (dd == dev_size_error) {
+               retcode = ERR_NOMEM_BITMAP;
+               goto fail;
        }
  
-       if (fifo_size != mdev->rs_plan_s.size) {
-               kfree(mdev->rs_plan_s.values);
-               mdev->rs_plan_s.values = rs_plan_s;
-               mdev->rs_plan_s.size   = fifo_size;
-               mdev->rs_planed = 0;
-               rs_plan_s = NULL;
+       if (mdev->state.conn == C_CONNECTED) {
+               if (dd == grew)
+                       set_bit(RESIZE_PENDING, &mdev->flags);
+               drbd_send_uuids(mdev);
+               drbd_send_sizes(mdev, 1, ddsf);
        }
  
-       spin_unlock(&mdev->peer_seq_lock);
+  fail:
+       drbd_adm_finish(info, retcode);
+       return 0;
  
-       if (get_ldev(mdev)) {
-               wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
-               drbd_al_shrink(mdev);
-               err = drbd_check_al_size(mdev);
-               lc_unlock(mdev->act_log);
-               wake_up(&mdev->al_wait);
+  fail_ldev:
+       put_ldev(mdev);
+       goto fail;
+ }
  
-               put_ldev(mdev);
-               drbd_md_sync(mdev);
+ int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info)
+ {
+       enum drbd_ret_code retcode;
+       struct drbd_tconn *tconn;
+       struct res_opts res_opts;
+       int err;
  
-               if (err) {
-                       retcode = ERR_NOMEM;
-                       goto fail;
-               }
-       }
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto fail;
+       tconn = adm_ctx.tconn;
  
-       if (mdev->state.conn >= C_CONNECTED)
-               drbd_send_sync_param(mdev, &sc);
+       res_opts = tconn->res_opts;
+       if (should_set_defaults(info))
+               set_res_opts_defaults(&res_opts);
  
-       if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) {
-               cpumask_copy(mdev->cpu_mask, new_cpu_mask);
-               drbd_calc_cpu_mask(mdev);
-               mdev->receiver.reset_cpu_mask = 1;
-               mdev->asender.reset_cpu_mask = 1;
-               mdev->worker.reset_cpu_mask = 1;
+       err = res_opts_from_attrs(&res_opts, info);
+       if (err && err != -ENOMSG) {
+               retcode = ERR_MANDATORY_TAG;
+               drbd_msg_put_info(from_attrs_err_to_txt(err));
+               goto fail;
+       }
+       err = set_resource_options(tconn, &res_opts);
+       if (err) {
+               retcode = ERR_INVALID_REQUEST;
+               if (err == -ENOMEM)
+                       retcode = ERR_NOMEM;
        }
  
-       kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
  fail:
-       kfree(rs_plan_s);
-       free_cpumask_var(new_cpu_mask);
-       crypto_free_hash(csums_tfm);
-       crypto_free_hash(verify_tfm);
-       reply->ret_code = retcode;
+       drbd_adm_finish(info, retcode);
        return 0;
  }
  
- static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                             struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
  {
-       int retcode;
+       struct drbd_conf *mdev;
+       int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
+       mdev = adm_ctx.mdev;
  
        /* If there is still bitmap IO pending, probably because of a previous
         * resync just being finished, wait for it before requesting a new resync.
         * Also wait for its after_state_ch(). */
        drbd_suspend_io(mdev);
-       wait_event(mdev->misc_wait, !drbd_test_flag(mdev, BITMAP_IO));
+       wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
        drbd_flush_workqueue(mdev);
  
        retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
  
        while (retcode == SS_NEED_CONNECTION) {
-               spin_lock_irq(&mdev->req_lock);
+               spin_lock_irq(&mdev->tconn->req_lock);
                if (mdev->state.conn < C_CONNECTED)
                        retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL);
-               spin_unlock_irq(&mdev->req_lock);
+               spin_unlock_irq(&mdev->tconn->req_lock);
  
                if (retcode != SS_NEED_CONNECTION)
                        break;

                retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
        }
        drbd_resume_io(mdev);
  
-       reply->ret_code = retcode;
+ out:
+       drbd_adm_finish(info, retcode);
+       return 0;
+ }
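The retry loop in drbd_adm_invalidate() re-requests C_STARTING_SYNC_T after degrading the local disk to D_INCONSISTENT when no connection is available. A hedged sketch of the loop's shape (stubbed verdicts, not the real state engine):

    #include <stdio.h>

    enum rv { SS_SUCCESS = 1, SS_NEED_CONNECTION };

    static int connected;          /* no peer, as in the interesting case */

    /* Stub: starting a sync needs a connection unless the disk was
     * already marked inconsistent locally. */
    static enum rv request_sync(int disk_inconsistent)
    {
            return (connected || disk_inconsistent) ? SS_SUCCESS
                                                    : SS_NEED_CONNECTION;
    }

    int main(void)
    {
            int disk_inconsistent = 0;
            enum rv rv = request_sync(disk_inconsistent);

            while (rv == SS_NEED_CONNECTION) {
                    if (!connected) {        /* degrade locally instead */
                            disk_inconsistent = 1;
                            rv = SS_SUCCESS; /* _drbd_set_state() verdict */
                    }
                    if (rv != SS_NEED_CONNECTION)
                            break;
                    rv = request_sync(disk_inconsistent);
            }
            printf("rv = %d\n", rv);
            return 0;
    }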
+ 
+ static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info,
+               union drbd_state mask, union drbd_state val)
+ {
+       enum drbd_ret_code retcode;
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
+       retcode = drbd_request_state(adm_ctx.mdev, mask, val);
+ out:
+       drbd_adm_finish(info, retcode);
        return 0;
  }
  
@@@ -2019,29 -2453,36 +2450,36 @@@ static int drbd_bmio_set_susp_al(struc
        return rv;
  }
  
- static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                                  struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
  {
-       int retcode;
+       int retcode; /* drbd_ret_code, drbd_state_rv */
+       struct drbd_conf *mdev;
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
+       mdev = adm_ctx.mdev;
  
        /* If there is still bitmap IO pending, probably because of a previous
         * resync just being finished, wait for it before requesting a new resync.
         * Also wait for its after_state_ch(). */
        drbd_suspend_io(mdev);
-       wait_event(mdev->misc_wait, !drbd_test_flag(mdev, BITMAP_IO));
+       wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
        drbd_flush_workqueue(mdev);
  
        retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
        if (retcode < SS_SUCCESS) {
                if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) {
-                       /* The peer will get a resync upon connect anyways. Just make that
-                          into a full resync. */
+                       /* The peer will get a resync upon connect anyways.
+                        * Just make that into a full resync. */
                        retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
                        if (retcode >= SS_SUCCESS) {
                                if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
-                                       "set_n_write from invalidate_peer",
-                                       BM_LOCKED_SET_ALLOWED))
+                                                  "set_n_write from invalidate_peer",
+                                                  BM_LOCKED_SET_ALLOWED))
                                        retcode = ERR_IO_MD_DISK;
                        }
                } else
        }
        drbd_resume_io(mdev);
  
-       reply->ret_code = retcode;
+ out:
+       drbd_adm_finish(info, retcode);
        return 0;
  }
  
- static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                             struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info)
  {
-       int retcode = NO_ERROR;
+       enum drbd_ret_code retcode;
  
-       if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
-               retcode = ERR_PAUSE_IS_SET;
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
  
-       reply->ret_code = retcode;
+       if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
+               retcode = ERR_PAUSE_IS_SET;
+ out:
+       drbd_adm_finish(info, retcode);
        return 0;
  }
  
- static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                              struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info)
  {
-       int retcode = NO_ERROR;
-       union drbd_state s;
+       union drbd_dev_state s;
+       enum drbd_ret_code retcode;
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
  
-       if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
-               s = mdev->state;
+       if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
+               s = adm_ctx.mdev->state;
                if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) {
                        retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP :
                                  s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR;
                }
        }
  
-       reply->ret_code = retcode;
+ out:
+       drbd_adm_finish(info, retcode);
        return 0;
  }
  
- static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                             struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info)
  {
-       reply->ret_code = drbd_request_state(mdev, NS(susp, 1));
-       return 0;
+       return drbd_adm_simple_request_state(skb, info, NS(susp, 1));
  }
  
- static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                            struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
  {
-       if (drbd_test_flag(mdev, NEW_CUR_UUID)) {
+       struct drbd_conf *mdev;
+       int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
+       mdev = adm_ctx.mdev;
+       if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
                drbd_uuid_new_current(mdev);
-               drbd_clear_flag(mdev, NEW_CUR_UUID);
+               clear_bit(NEW_CUR_UUID, &mdev->flags);
        }
        drbd_suspend_io(mdev);
-       reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
-       if (reply->ret_code == SS_SUCCESS) {
+       retcode = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
+       if (retcode == SS_SUCCESS) {
                if (mdev->state.conn < C_CONNECTED)
-                       tl_clear(mdev);
+                       tl_clear(mdev->tconn);
                if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED)
-                       tl_restart(mdev, fail_frozen_disk_io);
+                       tl_restart(mdev->tconn, FAIL_FROZEN_DISK_IO);
        }
        drbd_resume_io(mdev);
  
+ out:
+       drbd_adm_finish(info, retcode);
        return 0;
  }
  
- static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                          struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info)
  {
-       reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED));
-       return 0;
+       return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED));
  }
  
- static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                          struct drbd_nl_cfg_reply *reply)
+ int nla_put_drbd_cfg_context(struct sk_buff *skb, struct drbd_tconn *tconn, unsigned vnr)
  {
-       unsigned short *tl;
+       struct nlattr *nla;
+       nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT);
+       if (!nla)
+               goto nla_put_failure;
+       if (vnr != VOLUME_UNSPECIFIED &&
+           nla_put_u32(skb, T_ctx_volume, vnr))
+               goto nla_put_failure;
+       if (nla_put_string(skb, T_ctx_resource_name, tconn->name))
+               goto nla_put_failure;
+       if (tconn->my_addr_len &&
+           nla_put(skb, T_ctx_my_addr, tconn->my_addr_len, &tconn->my_addr))
+               goto nla_put_failure;
+       if (tconn->peer_addr_len &&
+           nla_put(skb, T_ctx_peer_addr, tconn->peer_addr_len, &tconn->peer_addr))
+               goto nla_put_failure;
+       nla_nest_end(skb, nla);
+       return 0;
  
-       tl = reply->tag_list;
+ nla_put_failure:
+       if (nla)
+               nla_nest_cancel(skb, nla);
+       return -EMSGSIZE;
+ }
  
-       if (get_ldev(mdev)) {
-               tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl);
-               put_ldev(mdev);
-       }
+ int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev,
+               const struct sib_info *sib)
+ {
+       struct state_info *si = NULL; /* for sizeof(si->member); */
+       struct net_conf *nc;
+       struct nlattr *nla;
+       int got_ldev;
+       int err = 0;
+       int exclude_sensitive;
+       /* If sib != NULL, this is drbd_bcast_event, which anyone can listen
+        * to.  So we better exclude_sensitive information.
+        *
+        * If sib == NULL, this is drbd_adm_get_status, executed synchronously
+        * in the context of the requesting user process. Exclude sensitive
+        * information, unless current has superuser.
+        *
+        * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and
+        * relies on the current implementation of netlink_dump(), which
+        * executes the dump callback successively from netlink_recvmsg(),
+        * always in the context of the receiving process */
+       exclude_sensitive = sib || !capable(CAP_SYS_ADMIN);
+       got_ldev = get_ldev(mdev);
+       /* We need to add connection name and volume number information still.
+        * Minor number is in drbd_genlmsghdr. */
+       if (nla_put_drbd_cfg_context(skb, mdev->tconn, mdev->vnr))
+               goto nla_put_failure;
+       if (res_opts_to_skb(skb, &mdev->tconn->res_opts, exclude_sensitive))
+               goto nla_put_failure;
+       rcu_read_lock();
+       if (got_ldev)
+               if (disk_conf_to_skb(skb, rcu_dereference(mdev->ldev->disk_conf), exclude_sensitive))
+                       goto nla_put_failure;
+       nc = rcu_dereference(mdev->tconn->net_conf);
+       if (nc)
+               err = net_conf_to_skb(skb, nc, exclude_sensitive);
+       rcu_read_unlock();
+       if (err)
+               goto nla_put_failure;
+       nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO);
+       if (!nla)
+               goto nla_put_failure;
+       if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) ||
+           nla_put_u32(skb, T_current_state, mdev->state.i) ||
+           nla_put_u64(skb, T_ed_uuid, mdev->ed_uuid) ||
+           nla_put_u64(skb, T_capacity, drbd_get_capacity(mdev->this_bdev)) ||
+           nla_put_u64(skb, T_send_cnt, mdev->send_cnt) ||
+           nla_put_u64(skb, T_recv_cnt, mdev->recv_cnt) ||
+           nla_put_u64(skb, T_read_cnt, mdev->read_cnt) ||
+           nla_put_u64(skb, T_writ_cnt, mdev->writ_cnt) ||
+           nla_put_u64(skb, T_al_writ_cnt, mdev->al_writ_cnt) ||
+           nla_put_u64(skb, T_bm_writ_cnt, mdev->bm_writ_cnt) ||
+           nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&mdev->ap_bio_cnt)) ||
+           nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&mdev->ap_pending_cnt)) ||
+           nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&mdev->rs_pending_cnt)))
+               goto nla_put_failure;
+       if (got_ldev) {
+               int err;
  
-       if (get_net_conf(mdev)) {
-               tl = net_conf_to_tags(mdev, mdev->net_conf, tl);
-               put_net_conf(mdev);
+               spin_lock_irq(&mdev->ldev->md.uuid_lock);
+               err = nla_put(skb, T_uuids, sizeof(si->uuids), mdev->ldev->md.uuid);
+               spin_unlock_irq(&mdev->ldev->md.uuid_lock);
+               if (err)
+                       goto nla_put_failure;
+               if (nla_put_u32(skb, T_disk_flags, mdev->ldev->md.flags) ||
+                   nla_put_u64(skb, T_bits_total, drbd_bm_bits(mdev)) ||
+                   nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(mdev)))
+                       goto nla_put_failure;
+               if (C_SYNC_SOURCE <= mdev->state.conn &&
+                   C_PAUSED_SYNC_T >= mdev->state.conn) {
+                       if (nla_put_u64(skb, T_bits_rs_total, mdev->rs_total) ||
+                           nla_put_u64(skb, T_bits_rs_failed, mdev->rs_failed))
+                               goto nla_put_failure;
+               }
        }
-       tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl);
  
-       put_unaligned(TT_END, tl++); /* Close the tag list */
+       if (sib) {
+               switch(sib->sib_reason) {
+               case SIB_SYNC_PROGRESS:
+               case SIB_GET_STATUS_REPLY:
+                       break;
+               case SIB_STATE_CHANGE:
+                       if (nla_put_u32(skb, T_prev_state, sib->os.i) ||
+                           nla_put_u32(skb, T_new_state, sib->ns.i))
+                               goto nla_put_failure;
+                       break;
+               case SIB_HELPER_POST:
+                       if (nla_put_u32(skb, T_helper_exit_code,
+                                       sib->helper_exit_code))
+                               goto nla_put_failure;
+                       /* fall through */
+               case SIB_HELPER_PRE:
+                       if (nla_put_string(skb, T_helper, sib->helper_name))
+                               goto nla_put_failure;
+                       break;
+               }
+       }
+       nla_nest_end(skb, nla);
  
-       return (int)((char *)tl - (char *)reply->tag_list);
+       if (0)
+ nla_put_failure:
+               err = -EMSGSIZE;
+       if (got_ldev)
+               put_ldev(mdev);
+       return err;
  }
  
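nla_put_drbd_cfg_context() and nla_put_status_info() both use the nest-start/put/nest-end discipline, cancelling the nest if any put overflows the skb. The userspace analogue with libnl-3 (assuming libnl-3 is available; the attribute ids are invented for the sketch):

    /* build: cc sketch.c $(pkg-config --cflags --libs libnl-3.0) */
    #include <netlink/netlink.h>
    #include <netlink/msg.h>
    #include <netlink/attr.h>

    enum { CTX_NEST = 1, CTX_VOLUME = 1, CTX_NAME = 2 }; /* invented ids */

    int main(void)
    {
            struct nl_msg *msg = nlmsg_alloc();
            struct nlattr *nest;

            if (!msg)
                    return 1;
            /* same open/fill/close shape as nla_put_drbd_cfg_context() */
            nest = nla_nest_start(msg, CTX_NEST);
            if (!nest)
                    goto fail;
            if (nla_put_u32(msg, CTX_VOLUME, 0) ||
                nla_put_string(msg, CTX_NAME, "r0"))
                    goto fail;  /* the kernel side would nla_nest_cancel() */
            nla_nest_end(msg, nest);
            nlmsg_free(msg);
            return 0;
    fail:
            nlmsg_free(msg);
            return 1;
    }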
- static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                            struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info)
  {
-       unsigned short *tl = reply->tag_list;
-       union drbd_state s = mdev->state;
-       unsigned long rs_left;
-       unsigned int res;
+       enum drbd_ret_code retcode;
+       int err;
  
-       tl = get_state_to_tags(mdev, (struct get_state *)&s, tl);
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
  
-       /* no local ref, no bitmap, no syncer progress. */
-       if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) {
-               if (get_ldev(mdev)) {
-                       drbd_get_syncer_progress(mdev, &rs_left, &res);
-                       tl = tl_add_int(tl, T_sync_progress, &res);
-                       put_ldev(mdev);
-               }
+       err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.mdev, NULL);
+       if (err) {
+               nlmsg_free(adm_ctx.reply_skb);
+               return err;
        }
-       put_unaligned(TT_END, tl++); /* Close the tag list */
-       return (int)((char *)tl - (char *)reply->tag_list);
+ out:
+       drbd_adm_finish(info, retcode);
+       return 0;
  }
  
- static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                            struct drbd_nl_cfg_reply *reply)
+ int get_one_status(struct sk_buff *skb, struct netlink_callback *cb)
  {
-       unsigned short *tl;
-       tl = reply->tag_list;
+       struct drbd_conf *mdev;
+       struct drbd_genlmsghdr *dh;
+       struct drbd_tconn *pos = (struct drbd_tconn*)cb->args[0];
+       struct drbd_tconn *tconn = NULL;
+       struct drbd_tconn *tmp;
+       unsigned volume = cb->args[1];
+       /* Open coded, deferred, iteration:
+        * list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) {
+        *      idr_for_each_entry(&tconn->volumes, mdev, i) {
+        *        ...
+        *      }
+        * }
+        * where tconn is cb->args[0];
+        * and i is cb->args[1];
+        *
+        * cb->args[2] indicates if we shall loop over all resources,
+        * or just dump all volumes of a single resource.
+        *
+        * This may miss entries inserted after this dump started,
+        * or entries deleted before they are reached.
+        *
+        * We need to make sure the mdev won't disappear while
+        * we are looking at it, and revalidate our iterators
+        * on each iteration.
+        */
  
-       if (get_ldev(mdev)) {
-               unsigned long flags;
-               spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags);
-               tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64));
-               tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags);
-               spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags);
-               put_ldev(mdev);
+       /* synchronize with conn_create()/conn_destroy() */
+       rcu_read_lock();
+       /* revalidate iterator position */
+       list_for_each_entry_rcu(tmp, &drbd_tconns, all_tconn) {
+               if (pos == NULL) {
+                       /* first iteration */
+                       pos = tmp;
+                       tconn = pos;
+                       break;
+               }
+               if (tmp == pos) {
+                       tconn = pos;
+                       break;
+               }
        }
-       put_unaligned(TT_END, tl++); /* Close the tag list */
+       if (tconn) {
+ next_tconn:
+               mdev = idr_get_next(&tconn->volumes, &volume);
+               if (!mdev) {
+                       /* No more volumes to dump on this tconn.
+                        * Advance tconn iterator. */
+                       pos = list_entry_rcu(tconn->all_tconn.next,
+                                            struct drbd_tconn, all_tconn);
+                       /* Did we dump any volume on this tconn yet? */
+                       if (volume != 0) {
+                               /* If we reached the end of the list,
+                                * or only a single resource dump was requested,
+                                * we are done. */
+                               if (&pos->all_tconn == &drbd_tconns || cb->args[2])
+                                       goto out;
+                               volume = 0;
+                               tconn = pos;
+                               goto next_tconn;
+                       }
+               }
 -              dh = genlmsg_put(skb, NETLINK_CB(cb->skb).pid,
++              dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+                               cb->nlh->nlmsg_seq, &drbd_genl_family,
+                               NLM_F_MULTI, DRBD_ADM_GET_STATUS);
+               if (!dh)
+                       goto out;
+               if (!mdev) {
+                       /* This is a tconn without a single volume.
+                        * Surprisingly enough, it may have a network
+                        * configuration. */
+                       struct net_conf *nc;
+                       dh->minor = -1U;
+                       dh->ret_code = NO_ERROR;
+                       if (nla_put_drbd_cfg_context(skb, tconn, VOLUME_UNSPECIFIED))
+                               goto cancel;
+                       nc = rcu_dereference(tconn->net_conf);
+                       if (nc && net_conf_to_skb(skb, nc, 1) != 0)
+                               goto cancel;
+                       goto done;
+               }
  
-       return (int)((char *)tl - (char *)reply->tag_list);
+               D_ASSERT(mdev->vnr == volume);
+               D_ASSERT(mdev->tconn == tconn);
+               dh->minor = mdev_to_minor(mdev);
+               dh->ret_code = NO_ERROR;
+               if (nla_put_status_info(skb, mdev, NULL)) {
+ cancel:
+                       genlmsg_cancel(skb, dh);
+                       goto out;
+               }
+ done:
+               genlmsg_end(skb, dh);
+         }
+ out:
+       rcu_read_unlock();
+       /* where to start the next iteration */
+         cb->args[0] = (long)pos;
+         cb->args[1] = (pos == tconn) ? volume + 1 : 0;
+       /* No more tconns/volumes/minors found results in an empty skb.
+        * Which will terminate the dump. */
+         return skb->len;
  }
  
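get_one_status() resumes a two-level (resource, volume) walk from a cursor persisted in cb->args[] between netlink_dump() calls. A self-contained sketch of that resumable iteration; unlike the kernel code, it skips volume-less resources for brevity:

    #include <stdio.h>

    static const int nvols[3] = { 2, 0, 1 }; /* volumes per resource (made up) */

    /* One dump pass: emit at most one volume, then record where to resume,
     * as get_one_status() does via cb->args[].  Returns 0 when exhausted. */
    static int dump_one(long args[2])
    {
            int res = (int)args[0], vol = (int)args[1];

            while (res < 3 && vol >= nvols[res]) { /* advance to next resource */
                    res++;
                    vol = 0;
            }
            if (res == 3)
                    return 0;                  /* empty reply ends the dump */
            printf("resource %d, volume %d\n", res, vol);
            args[0] = res;
            args[1] = vol + 1;                 /* resume after this volume */
            return 1;
    }

    int main(void)
    {
            long args[2] = { 0, 0 };

            while (dump_one(args))
                    ;  /* netlink_recvmsg() re-invokes the dump callback */
            return 0;
    }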
- /**
-  * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use
-  * @mdev:     DRBD device.
-  * @nlp:      Netlink/connector packet from drbdsetup
-  * @reply:    Reply packet for drbdsetup
+ /*
+  * Request status of all resources, or of all volumes within a single resource.
+  *
+  * This is a dump, as the answer may not fit in a single reply skb otherwise.
+  * Which means we cannot use the family->attrbuf or other such members, because
+  * dump is NOT protected by the genl_lock().  During dump, we only have access
+  * to the incoming skb, and need to opencode "parsing" of the nlattr payload.
+  *
+  * Once things are setup properly, we call into get_one_status().
   */
- static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                                   struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb)
  {
-       unsigned short *tl;
-       char rv;
+       const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
+       struct nlattr *nla;
+       const char *resource_name;
+       struct drbd_tconn *tconn;
+       int maxtype;
+       /* Is this a followup call? */
+       if (cb->args[0]) {
+               /* ... of a single resource dump,
+                * and the resource iterator has been advanced already? */
+               if (cb->args[2] && cb->args[2] != cb->args[0])
+                       return 0; /* DONE. */
+               goto dump;
+       }
+       /* First call (from netlink_dump_start).  We need to figure out
+        * which resource(s) the user wants us to dump. */
+       nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen),
+                       nlmsg_attrlen(cb->nlh, hdrlen),
+                       DRBD_NLA_CFG_CONTEXT);
+       /* No explicit context given.  Dump all. */
+       if (!nla)
+               goto dump;
+       maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
+       nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name));
+       if (IS_ERR(nla))
+               return PTR_ERR(nla);
+       /* context given, but no name present? */
+       if (!nla)
+               return -EINVAL;
+       resource_name = nla_data(nla);
+       tconn = conn_get_by_name(resource_name);
+       if (!tconn)
+               return -ENODEV;
+       kref_put(&tconn->kref, &conn_destroy); /* get_one_status() (re)validates tconn by itself */
+       /* prime iterators, and set "filter" mode mark:
+        * only dump this tconn. */
+       cb->args[0] = (long)tconn;
+       /* cb->args[1] = 0; passed in this way. */
+       cb->args[2] = (long)tconn;
+ dump:
+       return get_one_status(skb, cb);
+ }
  
-       tl = reply->tag_list;
+ int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info)
+ {
+       enum drbd_ret_code retcode;
+       struct timeout_parms tp;
+       int err;
  
-       rv = mdev->state.pdsk == D_OUTDATED        ? UT_PEER_OUTDATED :
-         drbd_test_flag(mdev, USE_DEGR_WFC_T) ? UT_DEGRADED : UT_DEFAULT;
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
  
-       tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv));
-       put_unaligned(TT_END, tl++); /* Close the tag list */
+       tp.timeout_type =
+               adm_ctx.mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
+               test_bit(USE_DEGR_WFC_T, &adm_ctx.mdev->flags) ? UT_DEGRADED :
+               UT_DEFAULT;
  
-       return (int)((char *)tl - (char *)reply->tag_list);
+       err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp);
+       if (err) {
+               nlmsg_free(adm_ctx.reply_skb);
+               return err;
+       }
+ out:
+       drbd_adm_finish(info, retcode);
+       return 0;
  }
  
- static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                                   struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
  {
-       /* default to resume from last known position, if possible */
-       struct start_ov args = {
-               .start_sector = mdev->ov_start_sector,
-               .stop_sector = ULLONG_MAX,
-       };
+       struct drbd_conf *mdev;
+       enum drbd_ret_code retcode;
+       struct start_ov_parms parms;
  
-       if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) {
-               reply->ret_code = ERR_MANDATORY_TAG;
-               return 0;
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
+       mdev = adm_ctx.mdev;
+       /* resume from last known position, if possible */
+       parms.ov_start_sector = mdev->ov_start_sector;
+       parms.ov_stop_sector = ULLONG_MAX;
+       if (info->attrs[DRBD_NLA_START_OV_PARMS]) {
+               int err = start_ov_parms_from_attrs(&parms, info);
+               if (err) {
+                       retcode = ERR_MANDATORY_TAG;
+                       drbd_msg_put_info(from_attrs_err_to_txt(err));
+                       goto out;
+               }
        }
+       /* w_make_ov_request expects position to be aligned */
+       mdev->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1);
+       mdev->ov_stop_sector = parms.ov_stop_sector;
  
        /* If there is still bitmap IO pending, e.g. previous resync or verify
         * just being finished, wait for it before requesting a new resync. */
        drbd_suspend_io(mdev);
-       wait_event(mdev->misc_wait, !drbd_test_flag(mdev, BITMAP_IO));
-       /* w_make_ov_request expects start position to be aligned */
-       mdev->ov_start_sector = args.start_sector & ~(BM_SECT_PER_BIT-1);
-       mdev->ov_stop_sector = args.stop_sector;
-       reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
+       wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
+       retcode = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
        drbd_resume_io(mdev);
+ out:
+       drbd_adm_finish(info, retcode);
        return 0;
  }
  
  
- static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                             struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info)
  {
-       int retcode = NO_ERROR;
+       struct drbd_conf *mdev;
+       enum drbd_ret_code retcode;
        int skip_initial_sync = 0;
        int err;
+       struct new_c_uuid_parms args;
  
-       struct new_c_uuid args;
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out_nolock;
  
-       memset(&args, 0, sizeof(struct new_c_uuid));
-       if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) {
-               reply->ret_code = ERR_MANDATORY_TAG;
-               return 0;
+       mdev = adm_ctx.mdev;
+       memset(&args, 0, sizeof(args));
+       if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) {
+               err = new_c_uuid_parms_from_attrs(&args, info);
+               if (err) {
+                       retcode = ERR_MANDATORY_TAG;
+                       drbd_msg_put_info(from_attrs_err_to_txt(err));
+                       goto out_nolock;
+               }
        }
  
-       mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */
+       mutex_lock(mdev->state_mutex); /* Protects us against serialized state changes. */
  
        if (!get_ldev(mdev)) {
                retcode = ERR_NO_DISK;
        }
  
        /* this is "skip initial sync", assume to be clean */
-       if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 &&
+       if (mdev->state.conn == C_CONNECTED && mdev->tconn->agreed_pro_version >= 90 &&
            mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
                dev_info(DEV, "Preparing to skip initial sync\n");
                skip_initial_sync = 1;
                        drbd_send_uuids_skip_initial_sync(mdev);
                        _drbd_uuid_set(mdev, UI_BITMAP, 0);
                        drbd_print_uuids(mdev, "cleared bitmap UUID");
-                       spin_lock_irq(&mdev->req_lock);
+                       spin_lock_irq(&mdev->tconn->req_lock);
                        _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
                                        CS_VERBOSE, NULL);
-                       spin_unlock_irq(&mdev->req_lock);
+                       spin_unlock_irq(&mdev->tconn->req_lock);
                }
        }
  
  out_dec:
        put_ldev(mdev);
  out:
-       mutex_unlock(&mdev->state_mutex);
-       reply->ret_code = retcode;
+       mutex_unlock(mdev->state_mutex);
+ out_nolock:
+       drbd_adm_finish(info, retcode);
        return 0;
  }
  
- struct cn_handler_struct {
-       int (*function)(struct drbd_conf *,
-                        struct drbd_nl_cfg_req *,
-                        struct drbd_nl_cfg_reply *);
-       int reply_body_size;
- };
- static struct cn_handler_struct cnd_table[] = {
-       [ P_primary ]           = { &drbd_nl_primary,           0 },
-       [ P_secondary ]         = { &drbd_nl_secondary,         0 },
-       [ P_disk_conf ]         = { &drbd_nl_disk_conf,         0 },
-       [ P_detach ]            = { &drbd_nl_detach,            0 },
-       [ P_net_conf ]          = { &drbd_nl_net_conf,          0 },
-       [ P_disconnect ]        = { &drbd_nl_disconnect,        0 },
-       [ P_resize ]            = { &drbd_nl_resize,            0 },
-       [ P_syncer_conf ]       = { &drbd_nl_syncer_conf,       0 },
-       [ P_invalidate ]        = { &drbd_nl_invalidate,        0 },
-       [ P_invalidate_peer ]   = { &drbd_nl_invalidate_peer,   0 },
-       [ P_pause_sync ]        = { &drbd_nl_pause_sync,        0 },
-       [ P_resume_sync ]       = { &drbd_nl_resume_sync,       0 },
-       [ P_suspend_io ]        = { &drbd_nl_suspend_io,        0 },
-       [ P_resume_io ]         = { &drbd_nl_resume_io,         0 },
-       [ P_outdate ]           = { &drbd_nl_outdate,           0 },
-       [ P_get_config ]        = { &drbd_nl_get_config,
-                                   sizeof(struct syncer_conf_tag_len_struct) +
-                                   sizeof(struct disk_conf_tag_len_struct) +
-                                   sizeof(struct net_conf_tag_len_struct) },
-       [ P_get_state ]         = { &drbd_nl_get_state,
-                                   sizeof(struct get_state_tag_len_struct) +
-                                   sizeof(struct sync_progress_tag_len_struct) },
-       [ P_get_uuids ]         = { &drbd_nl_get_uuids,
-                                   sizeof(struct get_uuids_tag_len_struct) },
-       [ P_get_timeout_flag ]  = { &drbd_nl_get_timeout_flag,
-                                   sizeof(struct get_timeout_flag_tag_len_struct)},
-       [ P_start_ov ]          = { &drbd_nl_start_ov,          0 },
-       [ P_new_c_uuid ]        = { &drbd_nl_new_c_uuid,        0 },
- };
- static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp)
+ static enum drbd_ret_code
+ drbd_check_resource_name(const char *name)
  {
-       struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data;
-       struct cn_handler_struct *cm;
-       struct cn_msg *cn_reply;
-       struct drbd_nl_cfg_reply *reply;
-       struct drbd_conf *mdev;
-       int retcode, rr;
-       int reply_size = sizeof(struct cn_msg)
-               + sizeof(struct drbd_nl_cfg_reply)
-               + sizeof(short int);
-       if (!try_module_get(THIS_MODULE)) {
-               printk(KERN_ERR "drbd: try_module_get() failed!\n");
-               return;
+       if (!name || !name[0]) {
+               drbd_msg_put_info("resource name missing");
+               return ERR_MANDATORY_TAG;
        }
-       if (!capable(CAP_SYS_ADMIN)) {
-               retcode = ERR_PERM;
-               goto fail;
-       }
-       mdev = ensure_mdev(nlp->drbd_minor,
-                       (nlp->flags & DRBD_NL_CREATE_DEVICE));
-       if (!mdev) {
-               retcode = ERR_MINOR_INVALID;
-               goto fail;
+       /* if we want to use these in sysfs/configfs/debugfs some day,
+        * we must not allow slashes */
+       if (strchr(name, '/')) {
+               drbd_msg_put_info("invalid resource name");
+               return ERR_INVALID_REQUEST;
        }
+       return NO_ERROR;
+ }
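The check is deliberately minimal: a resource name must be non-empty and free of '/' so it can later serve as a sysfs/configfs/debugfs directory name. The same policy as a standalone userspace helper, illustrative only:

#include <string.h>

/* Same policy as drbd_check_resource_name(): non-empty, and no '/'
 * so the name can double as a directory entry. */
static int resource_name_ok(const char *name)
{
        if (!name || !name[0])
                return 0;
        return strchr(name, '/') == NULL;
}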
  
-       if (nlp->packet_type >= P_nl_after_last_packet ||
-           nlp->packet_type == P_return_code_only) {
-               retcode = ERR_PACKET_NR;
-               goto fail;
-       }
+ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
+ {
+       enum drbd_ret_code retcode;
+       struct res_opts res_opts;
+       int err;
  
-       cm = cnd_table + nlp->packet_type;
+       retcode = drbd_adm_prepare(skb, info, 0);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
  
-       /* This may happen if packet number is 0: */
-       if (cm->function == NULL) {
-               retcode = ERR_PACKET_NR;
-               goto fail;
+       set_res_opts_defaults(&res_opts);
+       err = res_opts_from_attrs(&res_opts, info);
+       if (err && err != -ENOMSG) {
+               retcode = ERR_MANDATORY_TAG;
+               drbd_msg_put_info(from_attrs_err_to_txt(err));
+               goto out;
        }
  
-       reply_size += cm->reply_body_size;
+       retcode = drbd_check_resource_name(adm_ctx.resource_name);
+       if (retcode != NO_ERROR)
+               goto out;
  
-       /* allocation not in the IO path, cqueue thread context */
-       cn_reply = kzalloc(reply_size, GFP_KERNEL);
-       if (!cn_reply) {
-               retcode = ERR_NOMEM;
-               goto fail;
+       if (adm_ctx.tconn) {
+               if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) {
+                       retcode = ERR_INVALID_REQUEST;
+                       drbd_msg_put_info("resource exists");
+               }
+               /* else: still NO_ERROR */
+               goto out;
        }
-       reply = (struct drbd_nl_cfg_reply *) cn_reply->data;
-       reply->packet_type =
-               cm->reply_body_size ? nlp->packet_type : P_return_code_only;
-       reply->minor = nlp->drbd_minor;
-       reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */
-       /* reply->tag_list; might be modified by cm->function. */
-       rr = cm->function(mdev, nlp, reply);
  
-       cn_reply->id = req->id;
-       cn_reply->seq = req->seq;
-       cn_reply->ack = req->ack  + 1;
-       cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr;
-       cn_reply->flags = 0;
-       rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL);
-       if (rr && rr != -ESRCH)
-               printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
-       kfree(cn_reply);
-       module_put(THIS_MODULE);
-       return;
-  fail:
-       drbd_nl_send_reply(req, retcode);
-       module_put(THIS_MODULE);
+       if (!conn_create(adm_ctx.resource_name, &res_opts))
+               retcode = ERR_NOMEM;
+ out:
+       drbd_adm_finish(info, retcode);
+       return 0;
  }
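Note the NLM_F_EXCL handling above: an already existing resource is only an error when the caller explicitly asked for exclusive creation, so a plain new-resource request stays idempotent. This mirrors the familiar open(2) O_CREAT|O_EXCL convention; a sketch with an illustrative path:

#include <fcntl.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
        /* With O_EXCL, "already exists" is an error; without it the
         * call quietly succeeds - the same convention NLM_F_EXCL
         * gives the create path above. Path is illustrative. */
        int fd = open("/tmp/resource.lock", O_CREAT | O_EXCL, 0600);

        if (fd < 0 && errno == EEXIST)
                fprintf(stderr, "exists, and exclusive creation was asked for\n");
        return 0;
}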
  
- static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */
- static unsigned short *
- __tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data,
-       unsigned short len, int nul_terminated)
+ int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info)
  {
-       unsigned short l = tag_descriptions[tag_number(tag)].max_len;
-       len = (len < l) ? len :  l;
-       put_unaligned(tag, tl++);
-       put_unaligned(len, tl++);
-       memcpy(tl, data, len);
-       tl = (unsigned short*)((char*)tl + len);
-       if (nul_terminated)
-               *((char*)tl - 1) = 0;
-       return tl;
- }
+       struct drbd_genlmsghdr *dh = info->userhdr;
+       enum drbd_ret_code retcode;
  
- static unsigned short *
- tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len)
- {
-       return __tl_add_blob(tl, tag, data, len, 0);
- }
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
  
- static unsigned short *
- tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str)
- {
-       return __tl_add_blob(tl, tag, str, strlen(str)+1, 0);
- }
+       if (dh->minor > MINORMASK) {
+               drbd_msg_put_info("requested minor out of range");
+               retcode = ERR_INVALID_REQUEST;
+               goto out;
+       }
+       if (adm_ctx.volume > DRBD_VOLUME_MAX) {
+               drbd_msg_put_info("requested volume id out of range");
+               retcode = ERR_INVALID_REQUEST;
+               goto out;
+       }
  
- static unsigned short *
- tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val)
- {
-       put_unaligned(tag, tl++);
-       switch(tag_type(tag)) {
-       case TT_INTEGER:
-               put_unaligned(sizeof(int), tl++);
-               put_unaligned(*(int *)val, (int *)tl);
-               tl = (unsigned short*)((char*)tl+sizeof(int));
-               break;
-       case TT_INT64:
-               put_unaligned(sizeof(u64), tl++);
-               put_unaligned(*(u64 *)val, (u64 *)tl);
-               tl = (unsigned short*)((char*)tl+sizeof(u64));
-               break;
-       default:
-               /* someone did something stupid. */
-               ;
+       /* drbd_adm_prepare made sure already
+        * that mdev->tconn and mdev->vnr match the request. */
+       if (adm_ctx.mdev) {
+               if (info->nlhdr->nlmsg_flags & NLM_F_EXCL)
+                       retcode = ERR_MINOR_EXISTS;
+               /* else: still NO_ERROR */
+               goto out;
        }
-       return tl;
+       retcode = conn_new_minor(adm_ctx.tconn, dh->minor, adm_ctx.volume);
+ out:
+       drbd_adm_finish(info, retcode);
+       return 0;
  }
  
- void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state)
+ static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev)
  {
-       char buffer[sizeof(struct cn_msg)+
-                   sizeof(struct drbd_nl_cfg_reply)+
-                   sizeof(struct get_state_tag_len_struct)+
-                   sizeof(short int)];
-       struct cn_msg *cn_reply = (struct cn_msg *) buffer;
-       struct drbd_nl_cfg_reply *reply =
-               (struct drbd_nl_cfg_reply *)cn_reply->data;
-       unsigned short *tl = reply->tag_list;
-       /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
-       tl = get_state_to_tags(mdev, (struct get_state *)&state, tl);
-       put_unaligned(TT_END, tl++); /* Close the tag list */
-       cn_reply->id.idx = CN_IDX_DRBD;
-       cn_reply->id.val = CN_VAL_DRBD;
-       cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
-       cn_reply->ack = 0; /* not used here. */
-       cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
-               (int)((char *)tl - (char *)reply->tag_list);
-       cn_reply->flags = 0;
-       reply->packet_type = P_get_state;
-       reply->minor = mdev_to_minor(mdev);
-       reply->ret_code = NO_ERROR;
-       cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+       if (mdev->state.disk == D_DISKLESS &&
+           /* no need to be mdev->state.conn == C_STANDALONE &&
+            * we may want to delete a minor from a live replication group.
+            */
+           mdev->state.role == R_SECONDARY) {
+               _drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS),
+                                   CS_VERBOSE + CS_WAIT_COMPLETE);
+               idr_remove(&mdev->tconn->volumes, mdev->vnr);
+               idr_remove(&minors, mdev_to_minor(mdev));
+               del_gendisk(mdev->vdisk);
+               synchronize_rcu();
+               kref_put(&mdev->kref, &drbd_minor_destroy);
+               return NO_ERROR;
+       } else
+               return ERR_MINOR_CONFIGURED;
  }
  
- void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name)
+ int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info)
  {
-       char buffer[sizeof(struct cn_msg)+
-                   sizeof(struct drbd_nl_cfg_reply)+
-                   sizeof(struct call_helper_tag_len_struct)+
-                   sizeof(short int)];
-       struct cn_msg *cn_reply = (struct cn_msg *) buffer;
-       struct drbd_nl_cfg_reply *reply =
-               (struct drbd_nl_cfg_reply *)cn_reply->data;
-       unsigned short *tl = reply->tag_list;
-       /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
-       tl = tl_add_str(tl, T_helper, helper_name);
-       put_unaligned(TT_END, tl++); /* Close the tag list */
-       cn_reply->id.idx = CN_IDX_DRBD;
-       cn_reply->id.val = CN_VAL_DRBD;
-       cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
-       cn_reply->ack = 0; /* not used here. */
-       cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
-               (int)((char *)tl - (char *)reply->tag_list);
-       cn_reply->flags = 0;
+       enum drbd_ret_code retcode;
  
-       reply->packet_type = P_call_helper;
-       reply->minor = mdev_to_minor(mdev);
-       reply->ret_code = NO_ERROR;
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
  
-       cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+       retcode = adm_delete_minor(adm_ctx.mdev);
+ out:
+       drbd_adm_finish(info, retcode);
+       return 0;
  }
  
- void drbd_bcast_ee(struct drbd_conf *mdev,
-               const char *reason, const int dgs,
-               const char* seen_hash, const char* calc_hash,
-               const struct drbd_epoch_entry* e)
+ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
  {
-       struct cn_msg *cn_reply;
-       struct drbd_nl_cfg_reply *reply;
-       unsigned short *tl;
-       struct page *page;
-       unsigned len;
+       int retcode; /* enum drbd_ret_code resp. enum drbd_state_rv */
+       struct drbd_conf *mdev;
+       unsigned i;
  
-       if (!e)
-               return;
-       if (!reason || !reason[0])
-               return;
+       retcode = drbd_adm_prepare(skb, info, 0);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
  
-       /* apparently we have to memcpy twice, first to prepare the data for the
-        * struct cn_msg, then within cn_netlink_send from the cn_msg to the
-        * netlink skb. */
-       /* receiver thread context, which is not in the writeout path (of this node),
-        * but may be in the writeout path of the _other_ node.
-        * GFP_NOIO to avoid potential "distributed deadlock". */
-       cn_reply = kzalloc(
-               sizeof(struct cn_msg)+
-               sizeof(struct drbd_nl_cfg_reply)+
-               sizeof(struct dump_ee_tag_len_struct)+
-               sizeof(short int),
-               GFP_NOIO);
-       if (!cn_reply) {
-               dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n",
-                               (unsigned long long)e->sector, e->size);
-               return;
+       if (!adm_ctx.tconn) {
+               retcode = ERR_RES_NOT_KNOWN;
+               goto out;
        }
  
-       reply = (struct drbd_nl_cfg_reply*)cn_reply->data;
-       tl = reply->tag_list;
-       tl = tl_add_str(tl, T_dump_ee_reason, reason);
-       tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs);
-       tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs);
-       tl = tl_add_int(tl, T_ee_sector, &e->sector);
-       tl = tl_add_int(tl, T_ee_block_id, &e->block_id);
-       /* dump the first 32k */
-       len = min_t(unsigned, e->size, 32 << 10);
-       put_unaligned(T_ee_data, tl++);
-       put_unaligned(len, tl++);
-       page = e->pages;
-       page_chain_for_each(page) {
-               void *d = kmap_atomic(page);
-               unsigned l = min_t(unsigned, len, PAGE_SIZE);
-               memcpy(tl, d, l);
-               kunmap_atomic(d);
-               tl = (unsigned short*)((char*)tl + l);
-               len -= l;
-               if (len == 0)
-                       break;
+       /* demote */
+       idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) {
+               retcode = drbd_set_role(mdev, R_SECONDARY, 0);
+               if (retcode < SS_SUCCESS) {
+                       drbd_msg_put_info("failed to demote");
+                       goto out;
+               }
        }
-       put_unaligned(TT_END, tl++); /* Close the tag list */
-       cn_reply->id.idx = CN_IDX_DRBD;
-       cn_reply->id.val = CN_VAL_DRBD;
-       cn_reply->seq = atomic_add_return(1,&drbd_nl_seq);
-       cn_reply->ack = 0; // not used here.
-       cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
-               (int)((char*)tl - (char*)reply->tag_list);
-       cn_reply->flags = 0;
-       reply->packet_type = P_dump_ee;
-       reply->minor = mdev_to_minor(mdev);
-       reply->ret_code = NO_ERROR;
  
-       cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
-       kfree(cn_reply);
- }
- 
- void drbd_bcast_sync_progress(struct drbd_conf *mdev)
- {
-       char buffer[sizeof(struct cn_msg)+
-                   sizeof(struct drbd_nl_cfg_reply)+
-                   sizeof(struct sync_progress_tag_len_struct)+
-                   sizeof(short int)];
-       struct cn_msg *cn_reply = (struct cn_msg *) buffer;
-       struct drbd_nl_cfg_reply *reply =
-               (struct drbd_nl_cfg_reply *)cn_reply->data;
-       unsigned short *tl = reply->tag_list;
-       unsigned long rs_left;
-       unsigned int res;
+       retcode = conn_try_disconnect(adm_ctx.tconn, 0);
+       if (retcode < SS_SUCCESS) {
+               drbd_msg_put_info("failed to disconnect");
+               goto out;
+       }
  
-       /* no local ref, no bitmap, no syncer progress, no broadcast. */
-       if (!get_ldev(mdev))
-               return;
-       drbd_get_syncer_progress(mdev, &rs_left, &res);
-       put_ldev(mdev);
+       /* detach */
+       idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) {
+               retcode = adm_detach(mdev, 0);
+               if (retcode < SS_SUCCESS || retcode > NO_ERROR) {
+                       drbd_msg_put_info("failed to detach");
+                       goto out;
+               }
+       }
  
-       tl = tl_add_int(tl, T_sync_progress, &res);
-       put_unaligned(TT_END, tl++); /* Close the tag list */
+       /* If we reach this, all volumes (of this tconn) are Secondary,
+        * Disconnected, Diskless, aka Unconfigured. Make sure all threads have
+        * actually stopped, state handling only does drbd_thread_stop_nowait(). */
+       drbd_thread_stop(&adm_ctx.tconn->worker);
  
-       cn_reply->id.idx = CN_IDX_DRBD;
-       cn_reply->id.val = CN_VAL_DRBD;
+       /* Now, nothing can fail anymore */
  
-       cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
-       cn_reply->ack = 0; /* not used here. */
-       cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
-               (int)((char *)tl - (char *)reply->tag_list);
-       cn_reply->flags = 0;
+       /* delete volumes */
+       idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) {
+               retcode = adm_delete_minor(mdev);
+               if (retcode != NO_ERROR) {
+                       /* "can not happen" */
+                       drbd_msg_put_info("failed to delete volume");
+                       goto out;
+               }
+       }
  
-       reply->packet_type = P_sync_progress;
-       reply->minor = mdev_to_minor(mdev);
-       reply->ret_code = NO_ERROR;
+       /* delete connection */
+       if (conn_lowest_minor(adm_ctx.tconn) < 0) {
+               list_del_rcu(&adm_ctx.tconn->all_tconn);
+               synchronize_rcu();
+               kref_put(&adm_ctx.tconn->kref, &conn_destroy);
  
-       cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+               retcode = NO_ERROR;
+       } else {
+               /* "can not happen" */
+               retcode = ERR_RES_IN_USE;
+               drbd_msg_put_info("failed to delete connection");
+       }
+       goto out;
+ out:
+       drbd_adm_finish(info, retcode);
+       return 0;
  }
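drbd_adm_down() is a phased teardown: demote every volume, then disconnect, then detach and delete each volume, aborting on the first failure so a later phase never touches a volume stuck in an earlier state. A compressed sketch of that phase loop, with invented stand-in functions:

#include <stdio.h>

#define NVOL 3

/* Invented stand-ins for the per-volume steps of the teardown. */
static int demote(int vnr)  { (void)vnr; return 0; }
static int detach(int vnr)  { (void)vnr; return 0; }
static int destroy(int vnr) { (void)vnr; return 0; }

static int (* const phases[])(int) = { demote, detach, destroy };

int main(void)
{
        for (unsigned p = 0; p < sizeof(phases) / sizeof(phases[0]); p++) {
                for (int vnr = 0; vnr < NVOL; vnr++) {
                        if (phases[p](vnr)) {
                                fprintf(stderr, "phase %u failed on %d\n", p, vnr);
                                return 1;       /* abort: later phases never run */
                        }
                }
        }
        return 0;
}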
  
- int __init drbd_nl_init(void)
+ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
  {
-       static struct cb_id cn_id_drbd;
-       int err, try=10;
+       enum drbd_ret_code retcode;
  
-       cn_id_drbd.val = CN_VAL_DRBD;
-       do {
-               cn_id_drbd.idx = cn_idx;
-               err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback);
-               if (!err)
-                       break;
-               cn_idx = (cn_idx + CN_IDX_STEP);
-       } while (try--);
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
  
-       if (err) {
-               printk(KERN_ERR "drbd: cn_drbd failed to register\n");
-               return err;
+       if (conn_lowest_minor(adm_ctx.tconn) < 0) {
+               list_del_rcu(&adm_ctx.tconn->all_tconn);
+               synchronize_rcu();
+               kref_put(&adm_ctx.tconn->kref, &conn_destroy);
+               retcode = NO_ERROR;
+       } else {
+               retcode = ERR_RES_IN_USE;
        }
  
+       if (retcode == NO_ERROR)
+               drbd_thread_stop(&adm_ctx.tconn->worker);
+ out:
+       drbd_adm_finish(info, retcode);
        return 0;
  }
  
- void drbd_nl_cleanup(void)
- {
-       static struct cb_id cn_id_drbd;
-       cn_id_drbd.idx = cn_idx;
-       cn_id_drbd.val = CN_VAL_DRBD;
-       cn_del_callback(&cn_id_drbd);
- }
- 
- void drbd_nl_send_reply(struct cn_msg *req, int ret_code)
+ void drbd_bcast_event(struct drbd_conf *mdev, const struct sib_info *sib)
  {
-       char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)];
-       struct cn_msg *cn_reply = (struct cn_msg *) buffer;
-       struct drbd_nl_cfg_reply *reply =
-               (struct drbd_nl_cfg_reply *)cn_reply->data;
-       int rr;
-       memset(buffer, 0, sizeof(buffer));
-       cn_reply->id = req->id;
+       static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
+       struct sk_buff *msg;
+       struct drbd_genlmsghdr *d_out;
+       unsigned seq;
+       int err = -ENOMEM;
+ 
+       if (sib->sib_reason == SIB_SYNC_PROGRESS) {
+               /* throttle sync-progress events to one per second;
+                * all other event types are always broadcast */
+               if (time_after(jiffies, mdev->rs_last_bcast + HZ))
+                       mdev->rs_last_bcast = jiffies;
+               else
+                       return;
+       }
  
-       cn_reply->seq = req->seq;
-       cn_reply->ack = req->ack  + 1;
-       cn_reply->len = sizeof(struct drbd_nl_cfg_reply);
-       cn_reply->flags = 0;
+       seq = atomic_inc_return(&drbd_genl_seq);
+       msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+       if (!msg)
+               goto failed;
+       err = -EMSGSIZE;
+       d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT);
+       if (!d_out) /* cannot happen, but anyways. */
+               goto nla_put_failure;
+       d_out->minor = mdev_to_minor(mdev);
+       d_out->ret_code = NO_ERROR;
+       if (nla_put_status_info(msg, mdev, sib))
+               goto nla_put_failure;
+       genlmsg_end(msg, d_out);
+       err = drbd_genl_multicast_events(msg, 0);
+       /* msg has been consumed or freed in netlink_broadcast() */
+       if (err && err != -ESRCH)
+               goto failed;
  
-       reply->packet_type = P_return_code_only;
-       reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor;
-       reply->ret_code = ret_code;
+       return;
  
-       rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
-       if (rr && rr != -ESRCH)
-               printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
+ nla_put_failure:
+       nlmsg_free(msg);
+ failed:
+       dev_err(DEV, "Error %d while broadcasting event. "
+                       "Event seq:%u sib_reason:%u\n",
+                       err, seq, sib->sib_reason);
  }
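Sync-progress events are throttled to at most one broadcast per second (rs_last_bcast plus HZ jiffies), while other event types always go out. A userspace sketch of the same throttle using CLOCK_MONOTONIC, at whole-second granularity:

#include <stdbool.h>
#include <time.h>

/* Allow at most one broadcast per second, like the rs_last_bcast
 * check above; whole-second granularity is enough for a sketch. */
static bool bcast_allowed(void)
{
        static time_t last;
        struct timespec now;

        clock_gettime(CLOCK_MONOTONIC, &now);
        if (now.tv_sec == last)
                return false;           /* too soon: drop this event */
        last = now.tv_sec;
        return true;
}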
index eb0cafea142323794e0f6a9bfee167624fc36d05,813759f1b6adada6c520a5eeda9056c4396f3aa9..0331ad0b61e18583fb8f5fc5b768134eba0fd6ad
  
  #include "drbd_vli.h"
  
+ struct packet_info {
+       enum drbd_packet cmd;
+       unsigned int size;
+       unsigned int vnr;
+       void *data;
+ };
  enum finish_epoch {
        FE_STILL_LIVE,
        FE_DESTROYED,
        FE_RECYCLED,
  };
  
- static int drbd_do_handshake(struct drbd_conf *mdev);
- static int drbd_do_auth(struct drbd_conf *mdev);
+ static int drbd_do_features(struct drbd_tconn *tconn);
+ static int drbd_do_auth(struct drbd_tconn *tconn);
+ static int drbd_disconnected(struct drbd_conf *mdev);
  
- static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
- static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
+ static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *, struct drbd_epoch *, enum epoch_event);
+ static int e_end_block(struct drbd_work *, int);
  
  
  #define GFP_TRY       (__GFP_HIGHMEM | __GFP_NOWARN)
@@@ -142,11 -150,12 +150,12 @@@ static void page_chain_add(struct page 
        *head = chain_first;
  }
  
- static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number)
+ static struct page *__drbd_alloc_pages(struct drbd_conf *mdev,
+                                      unsigned int number)
  {
        struct page *page = NULL;
        struct page *tmp = NULL;
-       int i = 0;
+       unsigned int i = 0;
  
        /* Yes, testing drbd_pp_vacant outside the lock is racy.
         * So what. It saves a spin_lock. */
                return page;
  
        /* Not enough pages immediately available this time.
-        * No need to jump around here, drbd_pp_alloc will retry this
+        * No need to jump around here, drbd_alloc_pages will retry this
         * function "soon". */
        if (page) {
                tmp = page_chain_tail(page, NULL);
        return NULL;
  }
  
- static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
+ static void reclaim_finished_net_peer_reqs(struct drbd_conf *mdev,
+                                          struct list_head *to_be_freed)
  {
-       struct drbd_epoch_entry *e;
+       struct drbd_peer_request *peer_req;
        struct list_head *le, *tle;
  
        /* The EEs are always appended to the end of the list. Since
           stop to examine the list... */
  
        list_for_each_safe(le, tle, &mdev->net_ee) {
-               e = list_entry(le, struct drbd_epoch_entry, w.list);
-               if (drbd_ee_has_active_page(e))
+               peer_req = list_entry(le, struct drbd_peer_request, w.list);
+               if (drbd_peer_req_has_active_page(peer_req))
                        break;
                list_move(le, to_be_freed);
        }
  static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
  {
        LIST_HEAD(reclaimed);
-       struct drbd_epoch_entry *e, *t;
+       struct drbd_peer_request *peer_req, *t;
  
-       spin_lock_irq(&mdev->req_lock);
-       reclaim_net_ee(mdev, &reclaimed);
-       spin_unlock_irq(&mdev->req_lock);
+       spin_lock_irq(&mdev->tconn->req_lock);
+       reclaim_finished_net_peer_reqs(mdev, &reclaimed);
+       spin_unlock_irq(&mdev->tconn->req_lock);
  
-       list_for_each_entry_safe(e, t, &reclaimed, w.list)
-               drbd_free_net_ee(mdev, e);
+       list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
+               drbd_free_net_peer_req(mdev, peer_req);
  }
  
  /**
-  * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled)
+  * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
   * @mdev:     DRBD device.
   * @number:   number of pages requested
   * @retry:    whether to retry, if not enough pages are available right now
   *
   * Returns a page chain linked via page->private.
   */
- static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry)
+ struct page *drbd_alloc_pages(struct drbd_conf *mdev, unsigned int number,
+                             bool retry)
  {
        struct page *page = NULL;
+       struct net_conf *nc;
        DEFINE_WAIT(wait);
+       int mxb;
  
        /* Yes, we may run up to @number over max_buffers. If we
         * follow it strictly, the admin will get it wrong anyways. */
-       if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers)
-               page = drbd_pp_first_pages_or_try_alloc(mdev, number);
+       rcu_read_lock();
+       nc = rcu_dereference(mdev->tconn->net_conf);
+       mxb = nc ? nc->max_buffers : 1000000;
+       rcu_read_unlock();
+       if (atomic_read(&mdev->pp_in_use) < mxb)
+               page = __drbd_alloc_pages(mdev, number);
  
        while (page == NULL) {
                prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
  
                drbd_kick_lo_and_reclaim_net(mdev);
  
-               if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
-                       page = drbd_pp_first_pages_or_try_alloc(mdev, number);
+               if (atomic_read(&mdev->pp_in_use) < mxb) {
+                       page = __drbd_alloc_pages(mdev, number);
                        if (page)
                                break;
                }
                        break;
  
                if (signal_pending(current)) {
-                       dev_warn(DEV, "drbd_pp_alloc interrupted!\n");
+                       dev_warn(DEV, "drbd_alloc_pages interrupted!\n");
                        break;
                }
  
        return page;
  }
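As the kernel-doc above says, the pages come back as a chain linked through page->private, so no external list node is needed. A userspace sketch of such an intrusive chain with a stand-in structure; it assumes a pointer fits in unsigned long, as the kernel does for page->private:

#include <stdlib.h>

/* Stand-in for struct page: the link lives inside the object itself
 * ("intrusive"), stored in an unsigned long like page->private. */
struct fake_page {
        unsigned long private;          /* next page in the chain */
        char data[64];
};

static struct fake_page *chain_alloc(unsigned int number)
{
        struct fake_page *head = NULL;

        while (number--) {
                struct fake_page *p = calloc(1, sizeof(*p));

                if (!p) {               /* undo the partial chain */
                        while (head) {
                                struct fake_page *n = (struct fake_page *)head->private;
                                free(head);
                                head = n;
                        }
                        return NULL;
                }
                p->private = (unsigned long)head;
                head = p;
        }
        return head;
}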
  
- /* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
-  * Is also used from inside an other spin_lock_irq(&mdev->req_lock);
+ /* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
+  * Is also used from inside an other spin_lock_irq(&mdev->tconn->req_lock);
   * Either links the page chain back to the global pool,
   * or returns all pages to the system. */
- static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
+ static void drbd_free_pages(struct drbd_conf *mdev, struct page *page, int is_net)
  {
        atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
        int i;
        if (page == NULL)
                return;
  
-       if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE)*minor_count)
+       if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count)
                i = page_chain_free(page);
        else {
                struct page *tmp;
@@@ -302,127 -320,130 +320,130 @@@ You need to hold the req_lock
   _drbd_wait_ee_list_empty()
  
  You must not have the req_lock:
-  drbd_free_ee()
-  drbd_alloc_ee()
-  drbd_init_ee()
-  drbd_release_ee()
+  drbd_free_peer_req()
+  drbd_alloc_peer_req()
+  drbd_free_peer_reqs()
   drbd_ee_fix_bhs()
-  drbd_process_done_ee()
+  drbd_finish_peer_reqs()
   drbd_clear_done_ee()
   drbd_wait_ee_list_empty()
  */
  
- struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
-                                    u64 id,
-                                    sector_t sector,
-                                    unsigned int data_size,
-                                    gfp_t gfp_mask) __must_hold(local)
+ struct drbd_peer_request *
+ drbd_alloc_peer_req(struct drbd_conf *mdev, u64 id, sector_t sector,
+                   unsigned int data_size, gfp_t gfp_mask) __must_hold(local)
  {
-       struct drbd_epoch_entry *e;
+       struct drbd_peer_request *peer_req;
        struct page *page = NULL;
        unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
  
        if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE))
                return NULL;
  
-       e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
-       if (!e) {
+       peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
+       if (!peer_req) {
                if (!(gfp_mask & __GFP_NOWARN))
-                       dev_err(DEV, "alloc_ee: Allocation of an EE failed\n");
+                       dev_err(DEV, "%s: allocation failed\n", __func__);
                return NULL;
        }
  
        if (data_size) {
-               page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
+               page = drbd_alloc_pages(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
                if (!page)
                        goto fail;
        }
  
-       INIT_HLIST_NODE(&e->collision);
-       e->epoch = NULL;
-       e->mdev = mdev;
-       e->pages = page;
-       atomic_set(&e->pending_bios, 0);
-       e->size = data_size;
-       e->flags = 0;
-       e->sector = sector;
-       e->block_id = id;
+       drbd_clear_interval(&peer_req->i);
+       peer_req->i.size = data_size;
+       peer_req->i.sector = sector;
+       peer_req->i.local = false;
+       peer_req->i.waiting = false;
+       peer_req->epoch = NULL;
+       peer_req->w.mdev = mdev;
+       peer_req->pages = page;
+       atomic_set(&peer_req->pending_bios, 0);
+       peer_req->flags = 0;
+       /*
+        * The block_id is opaque to the receiver.  It is not endianness
+        * converted, and sent back to the sender unchanged.
+        */
+       peer_req->block_id = id;
  
-       return e;
+       return peer_req;
  
   fail:
-       mempool_free(e, drbd_ee_mempool);
+       mempool_free(peer_req, drbd_ee_mempool);
        return NULL;
  }
  
- void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net)
+ void __drbd_free_peer_req(struct drbd_conf *mdev, struct drbd_peer_request *peer_req,
+                      int is_net)
  {
-       if (e->flags & EE_HAS_DIGEST)
-               kfree(e->digest);
-       drbd_pp_free(mdev, e->pages, is_net);
-       D_ASSERT(atomic_read(&e->pending_bios) == 0);
-       D_ASSERT(hlist_unhashed(&e->collision));
-       mempool_free(e, drbd_ee_mempool);
+       if (peer_req->flags & EE_HAS_DIGEST)
+               kfree(peer_req->digest);
+       drbd_free_pages(mdev, peer_req->pages, is_net);
+       D_ASSERT(atomic_read(&peer_req->pending_bios) == 0);
+       D_ASSERT(drbd_interval_empty(&peer_req->i));
+       mempool_free(peer_req, drbd_ee_mempool);
  }
  
- int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
+ int drbd_free_peer_reqs(struct drbd_conf *mdev, struct list_head *list)
  {
        LIST_HEAD(work_list);
-       struct drbd_epoch_entry *e, *t;
+       struct drbd_peer_request *peer_req, *t;
        int count = 0;
        int is_net = list == &mdev->net_ee;
  
-       spin_lock_irq(&mdev->req_lock);
+       spin_lock_irq(&mdev->tconn->req_lock);
        list_splice_init(list, &work_list);
-       spin_unlock_irq(&mdev->req_lock);
+       spin_unlock_irq(&mdev->tconn->req_lock);
  
-       list_for_each_entry_safe(e, t, &work_list, w.list) {
-               drbd_free_some_ee(mdev, e, is_net);
+       list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
+               __drbd_free_peer_req(mdev, peer_req, is_net);
                count++;
        }
        return count;
  }
  
  /*
-  * This function is called from _asender only_
-  * but see also comments in _req_mod(,barrier_acked)
-  * and receive_Barrier.
-  *
-  * Move entries from net_ee to done_ee, if ready.
-  * Grab done_ee, call all callbacks, free the entries.
-  * The callbacks typically send out ACKs.
+  * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
   */
- static int drbd_process_done_ee(struct drbd_conf *mdev)
+ static int drbd_finish_peer_reqs(struct drbd_conf *mdev)
  {
        LIST_HEAD(work_list);
        LIST_HEAD(reclaimed);
-       struct drbd_epoch_entry *e, *t;
-       int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS);
+       struct drbd_peer_request *peer_req, *t;
+       int err = 0;
  
-       spin_lock_irq(&mdev->req_lock);
-       reclaim_net_ee(mdev, &reclaimed);
+       spin_lock_irq(&mdev->tconn->req_lock);
+       reclaim_finished_net_peer_reqs(mdev, &reclaimed);
        list_splice_init(&mdev->done_ee, &work_list);
-       spin_unlock_irq(&mdev->req_lock);
+       spin_unlock_irq(&mdev->tconn->req_lock);
  
-       list_for_each_entry_safe(e, t, &reclaimed, w.list)
-               drbd_free_net_ee(mdev, e);
+       list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
+               drbd_free_net_peer_req(mdev, peer_req);
  
        /* possible callbacks here:
-        * e_end_block, and e_end_resync_block, e_send_discard_ack.
+        * e_end_block, and e_end_resync_block, e_send_superseded.
         * all ignore the last argument.
         */
-       list_for_each_entry_safe(e, t, &work_list, w.list) {
+       list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
+               int err2;
                /* list_del not necessary, next/prev members not touched */
-               ok = e->w.cb(mdev, &e->w, !ok) && ok;
-               drbd_free_ee(mdev, e);
+               err2 = peer_req->w.cb(&peer_req->w, !!err);
+               if (!err)
+                       err = err2;
+               drbd_free_peer_req(mdev, peer_req);
        }
        wake_up(&mdev->ee_wait);
  
-       return ok;
+       return err;
  }
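The error handling here is worth noting: every completion callback still runs even after one has failed, and only the first error is reported back (the err/err2 pair). That idiom, condensed:

/* Run every callback, but report only the first failure - the
 * err/err2 idiom from drbd_finish_peer_reqs() above. */
static int run_all(int (* const cbs[])(void), int n)
{
        int err = 0;

        for (int i = 0; i < n; i++) {
                int err2 = cbs[i]();

                if (!err)
                        err = err2;
        }
        return err;
}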
  
- void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
+ static void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
+                                    struct list_head *head)
  {
        DEFINE_WAIT(wait);
  
         * and calling prepare_to_wait in the fast path */
        while (!list_empty(head)) {
                prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
-               spin_unlock_irq(&mdev->req_lock);
+               spin_unlock_irq(&mdev->tconn->req_lock);
                io_schedule();
                finish_wait(&mdev->ee_wait, &wait);
-               spin_lock_irq(&mdev->req_lock);
+               spin_lock_irq(&mdev->tconn->req_lock);
        }
  }
  
- void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
+ static void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
+                                   struct list_head *head)
  {
-       spin_lock_irq(&mdev->req_lock);
+       spin_lock_irq(&mdev->tconn->req_lock);
        _drbd_wait_ee_list_empty(mdev, head);
-       spin_unlock_irq(&mdev->req_lock);
+       spin_unlock_irq(&mdev->tconn->req_lock);
  }
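This pair follows the kernel's leading-underscore convention: _drbd_wait_ee_list_empty() must be called with the req_lock already held, and drbd_wait_ee_list_empty() is the self-locking wrapper. A sketch of the convention with a pthread mutex standing in for the spinlock:

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int counter;

/* Leading underscore: the caller must already hold 'lock'. */
static void _bump(void)
{
        counter++;
}

/* Public form: takes and drops the lock around the _locked helper. */
static void bump(void)
{
        pthread_mutex_lock(&lock);
        _bump();
        pthread_mutex_unlock(&lock);
}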
  
- /* see also kernel_accept; which is only present since 2.6.18.
-  * also we want to log which part of it failed, exactly */
- static int drbd_accept(struct drbd_conf *mdev, const char **what,
-               struct socket *sock, struct socket **newsock)
- {
-       struct sock *sk = sock->sk;
-       int err = 0;
-       *what = "listen";
-       err = sock->ops->listen(sock, 5);
-       if (err < 0)
-               goto out;
-       *what = "sock_create_lite";
-       err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
-                              newsock);
-       if (err < 0)
-               goto out;
-       *what = "accept";
-       err = sock->ops->accept(sock, *newsock, 0);
-       if (err < 0) {
-               sock_release(*newsock);
-               *newsock = NULL;
-               goto out;
-       }
-       (*newsock)->ops  = sock->ops;
-       __module_get((*newsock)->ops->owner);
- out:
-       return err;
- }
- static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock,
-                   void *buf, size_t size, int flags)
+ static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
  {
        mm_segment_t oldfs;
        struct kvec iov = {
        return rv;
  }
  
- static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
+ static int drbd_recv(struct drbd_tconn *tconn, void *buf, size_t size)
  {
-       mm_segment_t oldfs;
-       struct kvec iov = {
-               .iov_base = buf,
-               .iov_len = size,
-       };
-       struct msghdr msg = {
-               .msg_iovlen = 1,
-               .msg_iov = (struct iovec *)&iov,
-               .msg_flags = MSG_WAITALL | MSG_NOSIGNAL
-       };
        int rv;
  
-       oldfs = get_fs();
-       set_fs(KERNEL_DS);
-       rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags);
-       set_fs(oldfs);
+       rv = drbd_recv_short(tconn->data.socket, buf, size, 0);
  
        if (rv < 0) {
                if (rv == -ECONNRESET)
-                       dev_info(DEV, "sock was reset by peer\n");
+                       conn_info(tconn, "sock was reset by peer\n");
                else if (rv != -ERESTARTSYS)
-                       dev_err(DEV, "sock_recvmsg returned %d\n", rv);
+                       conn_err(tconn, "sock_recvmsg returned %d\n", rv);
        } else if (rv == 0) {
-               if (drbd_test_flag(mdev, DISCONNECT_SENT)) {
-                       long t; /* time_left */
-                       t = wait_event_timeout(mdev->state_wait, mdev->state.conn < C_CONNECTED,
-                                              mdev->net_conf->ping_timeo * HZ/10);
+               if (test_bit(DISCONNECT_SENT, &tconn->flags)) {
+                       long t;
+                       rcu_read_lock();
+                       t = rcu_dereference(tconn->net_conf)->ping_timeo * HZ/10;
+                       rcu_read_unlock();
+                       t = wait_event_timeout(tconn->ping_wait, tconn->cstate < C_WF_REPORT_PARAMS, t);
                        if (t)
                                goto out;
                }
-               dev_info(DEV, "sock was shut down by peer\n");
+               conn_info(tconn, "sock was shut down by peer\n");
        }
  
        if (rv != size)
-               drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
+               conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD);
  
  out:
        return rv;
  }
  
+ static int drbd_recv_all(struct drbd_tconn *tconn, void *buf, size_t size)
+ {
+       int err;
+       err = drbd_recv(tconn, buf, size);
+       if (err != size) {
+               if (err >= 0)
+                       err = -EIO;
+       } else
+               err = 0;
+       return err;
+ }
+ 
+ static int drbd_recv_all_warn(struct drbd_tconn *tconn, void *buf, size_t size)
+ {
+       int err;
+       err = drbd_recv_all(tconn, buf, size);
+       if (err && !signal_pending(current))
+               conn_warn(tconn, "short read (expected size %d)\n", (int)size);
+       return err;
+ }
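drbd_recv_all() collapses the result into exactly two outcomes: 0 when all requested bytes arrived, and an error otherwise, with a short read normalized to -EIO. A userspace equivalent over a plain socket, using MSG_WAITALL as drbd_recv() now does:

#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>

/* Read exactly 'size' bytes: 0 on success, -EIO on EOF or a short
 * read, -errno on a socket error - the drbd_recv_all() contract. */
static int recv_all(int fd, void *buf, size_t size)
{
        ssize_t rv = recv(fd, buf, size, MSG_WAITALL);

        if (rv < 0)
                return -errno;
        if ((size_t)rv != size)
                return -EIO;
        return 0;
}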
  /* quoting tcp(7):
   *   On individual connections, the socket buffer size must be set prior to the
   *   listen(2) or connect(2) calls in order to have it take effect.
@@@ -561,29 -563,50 +563,50 @@@ static void drbd_setbufsize(struct sock
        }
  }
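Per the tcp(7) quote above, the send/receive buffer sizes must be applied before connect(2) or listen(2), which is why drbd_setbufsize() is called right after socket creation on both the connecting and the listening path. The same ordering in a userspace sketch:

#include <sys/socket.h>

/* Apply SO_SNDBUF/SO_RCVBUF before connect()/listen(); setting them
 * afterwards has no effect on the connection (tcp(7)). Sketch only. */
static int make_tcp_socket(int sndbuf, int rcvbuf)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;
        if (sndbuf)
                setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf));
        if (rcvbuf)
                setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));
        /* ... only now bind() and connect() or listen() ... */
        return fd;
}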
  
- static struct socket *drbd_try_connect(struct drbd_conf *mdev)
+ static struct socket *drbd_try_connect(struct drbd_tconn *tconn)
  {
        const char *what;
        struct socket *sock;
        struct sockaddr_in6 src_in6;
-       int err;
+       struct sockaddr_in6 peer_in6;
+       struct net_conf *nc;
+       int err, peer_addr_len, my_addr_len;
+       int sndbuf_size, rcvbuf_size, connect_int;
        int disconnect_on_error = 1;
  
-       if (!get_net_conf(mdev))
+       rcu_read_lock();
+       nc = rcu_dereference(tconn->net_conf);
+       if (!nc) {
+               rcu_read_unlock();
                return NULL;
+       }
+       sndbuf_size = nc->sndbuf_size;
+       rcvbuf_size = nc->rcvbuf_size;
+       connect_int = nc->connect_int;
+       rcu_read_unlock();
+       my_addr_len = min_t(int, tconn->my_addr_len, sizeof(src_in6));
+       memcpy(&src_in6, &tconn->my_addr, my_addr_len);
+       if (((struct sockaddr *)&tconn->my_addr)->sa_family == AF_INET6)
+               src_in6.sin6_port = 0;
+       else
+               ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
+       peer_addr_len = min_t(int, tconn->peer_addr_len, sizeof(src_in6));
+       memcpy(&peer_in6, &tconn->peer_addr, peer_addr_len);
  
        what = "sock_create_kern";
-       err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
-               SOCK_STREAM, IPPROTO_TCP, &sock);
+       err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family,
+                              SOCK_STREAM, IPPROTO_TCP, &sock);
        if (err < 0) {
                sock = NULL;
                goto out;
        }
  
        sock->sk->sk_rcvtimeo =
-       sock->sk->sk_sndtimeo =  mdev->net_conf->try_connect_int*HZ;
-       drbd_setbufsize(sock, mdev->net_conf->sndbuf_size,
-                       mdev->net_conf->rcvbuf_size);
+       sock->sk->sk_sndtimeo = connect_int * HZ;
+       drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);
  
         /* explicitly bind to the configured IP as source IP
        *  for the outgoing connections.
        * Make sure to use 0 as port number, so linux selects
        *  a free one dynamically.
        */
-       memcpy(&src_in6, mdev->net_conf->my_addr,
-              min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6)));
-       if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6)
-               src_in6.sin6_port = 0;
-       else
-               ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
        what = "bind before connect";
-       err = sock->ops->bind(sock,
-                             (struct sockaddr *) &src_in6,
-                             mdev->net_conf->my_addr_len);
+       err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
        if (err < 0)
                goto out;
  
         * stay C_WF_CONNECTION, don't go Disconnecting! */
        disconnect_on_error = 0;
        what = "connect";
-       err = sock->ops->connect(sock,
-                                (struct sockaddr *)mdev->net_conf->peer_addr,
-                                mdev->net_conf->peer_addr_len, 0);
+       err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
  
  out:
        if (err < 0) {
                        disconnect_on_error = 0;
                        break;
                default:
-                       dev_err(DEV, "%s failed, err = %d\n", what, err);
+                       conn_err(tconn, "%s failed, err = %d\n", what, err);
                }
                if (disconnect_on_error)
-                       drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+                       conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
        }
-       put_net_conf(mdev);
        return sock;
  }
  
- static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
+ struct accept_wait_data {
+       struct drbd_tconn *tconn;
+       struct socket *s_listen;
+       struct completion door_bell;
+       void (*original_sk_state_change)(struct sock *sk);
+ };
+ 
+ static void drbd_incoming_connection(struct sock *sk)
+ {
+       struct accept_wait_data *ad = sk->sk_user_data;
+       void (*state_change)(struct sock *sk);
+       state_change = ad->original_sk_state_change;
+       if (sk->sk_state == TCP_ESTABLISHED)
+               complete(&ad->door_bell);
+       state_change(sk);
+ }
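drbd_incoming_connection() is classic callback interposition: the original sk_state_change handler is saved in the accept_wait_data, a wrapper is installed that completes the door_bell once the socket reaches TCP_ESTABLISHED, and the wrapper always chains to the original. The bare pattern, stripped of the socket details, with all names invented for the sketch:

/* Callback interposition: save the old handler, install a wrapper,
 * always chain through to the original. */
struct interposer {
        void (*original)(void *ctx);
        int fired;                      /* stands in for the completion */
};

static struct interposer ad;

static void wrapped(void *ctx)
{
        ad.fired = 1;                   /* "complete(&ad.door_bell)" */
        if (ad.original)
                ad.original(ctx);       /* chain to the original handler */
}

static void install(void (**slot)(void *ctx))
{
        ad.original = *slot;            /* like original_sk_state_change */
        *slot = wrapped;
}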
+ static int prepare_listen_socket(struct drbd_tconn *tconn, struct accept_wait_data *ad)
  {
-       int timeo, err;
-       struct socket *s_estab = NULL, *s_listen;
+       int err, sndbuf_size, rcvbuf_size, my_addr_len;
+       struct sockaddr_in6 my_addr;
+       struct socket *s_listen;
+       struct net_conf *nc;
        const char *what;
  
-       if (!get_net_conf(mdev))
-               return NULL;
+       rcu_read_lock();
+       nc = rcu_dereference(tconn->net_conf);
+       if (!nc) {
+               rcu_read_unlock();
+               return -EIO;
+       }
+       sndbuf_size = nc->sndbuf_size;
+       rcvbuf_size = nc->rcvbuf_size;
+       rcu_read_unlock();
+       my_addr_len = min_t(int, tconn->my_addr_len, sizeof(struct sockaddr_in6));
+       memcpy(&my_addr, &tconn->my_addr, my_addr_len);
  
        what = "sock_create_kern";
-       err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
-               SOCK_STREAM, IPPROTO_TCP, &s_listen);
+       err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family,
+                              SOCK_STREAM, IPPROTO_TCP, &s_listen);
        if (err) {
                s_listen = NULL;
                goto out;
        }
  
-       timeo = mdev->net_conf->try_connect_int * HZ;
-       timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
-       s_listen->sk->sk_reuse    = SK_CAN_REUSE; /* SO_REUSEADDR */
-       s_listen->sk->sk_rcvtimeo = timeo;
-       s_listen->sk->sk_sndtimeo = timeo;
-       drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size,
-                       mdev->net_conf->rcvbuf_size);
 -      s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */
++      s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+       drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
  
        what = "bind before listen";
-       err = s_listen->ops->bind(s_listen,
-                             (struct sockaddr *) mdev->net_conf->my_addr,
-                             mdev->net_conf->my_addr_len);
+       err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
        if (err < 0)
                goto out;
  
-       err = drbd_accept(mdev, &what, s_listen, &s_estab);
+       ad->s_listen = s_listen;
+       write_lock_bh(&s_listen->sk->sk_callback_lock);
+       ad->original_sk_state_change = s_listen->sk->sk_state_change;
+       s_listen->sk->sk_state_change = drbd_incoming_connection;
+       s_listen->sk->sk_user_data = ad;
+       write_unlock_bh(&s_listen->sk->sk_callback_lock);
  
+       what = "listen";
+       err = s_listen->ops->listen(s_listen, 5);
+       if (err < 0)
+               goto out;
+       return 0;
  out:
        if (s_listen)
                sock_release(s_listen);
        if (err < 0) {
                if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
-                       dev_err(DEV, "%s failed, err = %d\n", what, err);
-                       drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+                       conn_err(tconn, "%s failed, err = %d\n", what, err);
+                       conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
                }
        }
-       put_net_conf(mdev);
  
-       return s_estab;
+       return -EIO;
  }
  
- static int drbd_send_fp(struct drbd_conf *mdev,
-       struct socket *sock, enum drbd_packets cmd)
+ static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad)
  {
-       struct p_header80 *h = &mdev->data.sbuf.header.h80;
-       return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
+       write_lock_bh(&sk->sk_callback_lock);
+       sk->sk_state_change = ad->original_sk_state_change;
+       sk->sk_user_data = NULL;
+       write_unlock_bh(&sk->sk_callback_lock);
  }
  
- static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
+ static struct socket *drbd_wait_for_connect(struct drbd_tconn *tconn, struct accept_wait_data *ad)
  {
-       struct p_header80 *h = &mdev->data.rbuf.header.h80;
-       int rr;
+       int timeo, connect_int, err = 0;
+       struct socket *s_estab = NULL;
+       struct net_conf *nc;
+       rcu_read_lock();
+       nc = rcu_dereference(tconn->net_conf);
+       if (!nc) {
+               rcu_read_unlock();
+               return NULL;
+       }
+       connect_int = nc->connect_int;
+       rcu_read_unlock();
+       timeo = connect_int * HZ;
+       timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
+       err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
+       if (err <= 0)
+               return NULL;
+       err = kernel_accept(ad->s_listen, &s_estab, 0);
+       if (err < 0) {
+               if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
+                       conn_err(tconn, "accept failed, err = %d\n", err);
+                       conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+               }
+       }
  
-       rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
+       if (s_estab)
+               unregister_state_change(s_estab->sk, ad);
  
-       if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC)
-               return be16_to_cpu(h->command);
+       return s_estab;
+ }
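
As a worked example of the jitter above: with connect_int = 10 seconds and HZ = 1000, timeo starts at 10000 jiffies; random32() & 1 then adds or subtracts timeo / 7 ~= 1428 jiffies, so the wait lands at roughly 8.57 s or 11.43 s. The two endpoints differ by about 28.5% of the nominal value, which is what the "28.5% random jitter" comment refers to; the randomization keeps both peers from retrying connection setup in lock-step.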
  
-       return 0xffff;
+ static int decode_header(struct drbd_tconn *, void *, struct packet_info *);
+ static int send_first_packet(struct drbd_tconn *tconn, struct drbd_socket *sock,
+                            enum drbd_packet cmd)
+ {
+       if (!conn_prepare_command(tconn, sock))
+               return -EIO;
+       return conn_send_command(tconn, sock, cmd, 0, NULL, 0);
+ }
+ static int receive_first_packet(struct drbd_tconn *tconn, struct socket *sock)
+ {
+       unsigned int header_size = drbd_header_size(tconn);
+       struct packet_info pi;
+       int err;
+       err = drbd_recv_short(sock, tconn->data.rbuf, header_size, 0);
+       if (err != header_size) {
+               if (err >= 0)
+                       err = -EIO;
+               return err;
+       }
+       err = decode_header(tconn, tconn->data.rbuf, &pi);
+       if (err)
+               return err;
+       return pi.cmd;
  }
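
receive_first_packet() and the receive handlers below hand results around in a struct packet_info; a sketch of its layout, as implied by the assignments in decode_header() further down:

/* Sketch, matching the fields decode_header() fills in. */
struct packet_info {
	enum drbd_packet cmd;	/* decoded command code */
	unsigned int size;	/* payload length, excluding the header */
	unsigned int vnr;	/* volume number (protocol 100+), else 0 */
	void *data;		/* points just past the header in the rx buffer */
};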
  
  /**
   * drbd_socket_okay() - Free the socket if its connection is not okay
-  * @mdev:     DRBD device.
   * @sock:     pointer to the pointer to the socket.
   */
- static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
+ static int drbd_socket_okay(struct socket **sock)
  {
        int rr;
        char tb[4];
        if (!*sock)
                return false;
  
-       rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
+       rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
  
        if (rr > 0 || rr == -EAGAIN) {
                return true;
        } else {
                sock_release(*sock);
                *sock = NULL;
                return false;
        }
  }
+ /* Gets called if a connection is established, or if a new minor gets created
+    in a connection */
+ int drbd_connected(struct drbd_conf *mdev)
+ {
+       int err;
+       atomic_set(&mdev->packet_seq, 0);
+       mdev->peer_seq = 0;
+       mdev->state_mutex = mdev->tconn->agreed_pro_version < 100 ?
+               &mdev->tconn->cstate_mutex :
+               &mdev->own_state_mutex;
+       err = drbd_send_sync_param(mdev);
+       if (!err)
+               err = drbd_send_sizes(mdev, 0, 0);
+       if (!err)
+               err = drbd_send_uuids(mdev);
+       if (!err)
+               err = drbd_send_current_state(mdev);
+       clear_bit(USE_DEGR_WFC_T, &mdev->flags);
+       clear_bit(RESIZE_PENDING, &mdev->flags);
+       mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
+       return err;
+ }
  
  /*
   * return values:
   *   1 yes, we have a valid connection
   *   0 oh no, ouch, a broken socket broken somewhere. Retry.
   *  -1 peer talks different language,
   *     no point in trying again, please go standalone.
   *  -2 We do not have a network config...
   */
- static int drbd_connect(struct drbd_conf *mdev)
+ static int conn_connect(struct drbd_tconn *tconn)
  {
-       struct socket *s, *sock, *msock;
-       int try, h, ok;
+       struct drbd_socket sock, msock;
+       struct drbd_conf *mdev;
+       struct net_conf *nc;
+       int vnr, timeout, h, ok;
+       bool discard_my_data;
        enum drbd_state_rv rv;
+       struct accept_wait_data ad = {
+               .tconn = tconn,
+               .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
+       };
  
-       D_ASSERT(!mdev->data.socket);
-       drbd_clear_flag(mdev, DISCONNECT_SENT);
-       if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
+       clear_bit(DISCONNECT_SENT, &tconn->flags);
+       if (conn_request_state(tconn, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
                return -2;
  
-       sock  = NULL;
-       msock = NULL;
+       mutex_init(&sock.mutex);
+       sock.sbuf = tconn->data.sbuf;
+       sock.rbuf = tconn->data.rbuf;
+       sock.socket = NULL;
+       mutex_init(&msock.mutex);
+       msock.sbuf = tconn->meta.sbuf;
+       msock.rbuf = tconn->meta.rbuf;
+       msock.socket = NULL;
+       /* Assume that the peer only understands protocol 80 until we know better.  */
+       tconn->agreed_pro_version = 80;
+       if (prepare_listen_socket(tconn, &ad))
+               return 0;
  
        do {
-               for (try = 0;;) {
-                       /* 3 tries, this should take less than a second! */
-                       s = drbd_try_connect(mdev);
-                       if (s || ++try >= 3)
-                               break;
-                       /* give the other side time to call bind() & listen() */
-                       schedule_timeout_interruptible(HZ / 10);
-               }
+               struct socket *s;
  
+               s = drbd_try_connect(tconn);
                if (s) {
-                       if (!sock) {
-                               drbd_send_fp(mdev, s, P_HAND_SHAKE_S);
-                               sock = s;
-                               s = NULL;
-                       } else if (!msock) {
-                               drbd_clear_flag(mdev, DISCARD_CONCURRENT);
-                               drbd_send_fp(mdev, s, P_HAND_SHAKE_M);
-                               msock = s;
-                               s = NULL;
+                       if (!sock.socket) {
+                               sock.socket = s;
+                               send_first_packet(tconn, &sock, P_INITIAL_DATA);
+                       } else if (!msock.socket) {
+                               clear_bit(RESOLVE_CONFLICTS, &tconn->flags);
+                               msock.socket = s;
+                               send_first_packet(tconn, &msock, P_INITIAL_META);
                        } else {
-                               dev_err(DEV, "Logic error in drbd_connect()\n");
+                               conn_err(tconn, "Logic error in conn_connect()\n");
                                goto out_release_sockets;
                        }
                }
  
-               if (sock && msock) {
-                       schedule_timeout_interruptible(mdev->net_conf->ping_timeo*HZ/10);
-                       ok = drbd_socket_okay(mdev, &sock);
-                       ok = drbd_socket_okay(mdev, &msock) && ok;
+               if (sock.socket && msock.socket) {
+                       rcu_read_lock();
+                       nc = rcu_dereference(tconn->net_conf);
+                       timeout = nc->ping_timeo * HZ / 10;
+                       rcu_read_unlock();
+                       schedule_timeout_interruptible(timeout);
+                       ok = drbd_socket_okay(&sock.socket);
+                       ok = drbd_socket_okay(&msock.socket) && ok;
                        if (ok)
                                break;
                }
  
  retry:
-               s = drbd_wait_for_connect(mdev);
+               s = drbd_wait_for_connect(tconn, &ad);
                if (s) {
-                       try = drbd_recv_fp(mdev, s);
-                       drbd_socket_okay(mdev, &sock);
-                       drbd_socket_okay(mdev, &msock);
-                       switch (try) {
-                       case P_HAND_SHAKE_S:
-                               if (sock) {
-                                       dev_warn(DEV, "initial packet S crossed\n");
-                                       sock_release(sock);
+                       int fp = receive_first_packet(tconn, s);
+                       drbd_socket_okay(&sock.socket);
+                       drbd_socket_okay(&msock.socket);
+                       switch (fp) {
+                       case P_INITIAL_DATA:
+                               if (sock.socket) {
+                                       conn_warn(tconn, "initial packet S crossed\n");
+                                       sock_release(sock.socket);
+                                       sock.socket = s;
+                                       goto randomize;
                                }
-                               sock = s;
+                               sock.socket = s;
                                break;
-                       case P_HAND_SHAKE_M:
-                               if (msock) {
-                                       dev_warn(DEV, "initial packet M crossed\n");
-                                       sock_release(msock);
+                       case P_INITIAL_META:
+                               set_bit(RESOLVE_CONFLICTS, &tconn->flags);
+                               if (msock.socket) {
+                                       conn_warn(tconn, "initial packet M crossed\n");
+                                       sock_release(msock.socket);
+                                       msock.socket = s;
+                                       goto randomize;
                                }
-                               msock = s;
-                               drbd_set_flag(mdev, DISCARD_CONCURRENT);
+                               msock.socket = s;
                                break;
                        default:
-                               dev_warn(DEV, "Error receiving initial packet\n");
+                               conn_warn(tconn, "Error receiving initial packet\n");
                                sock_release(s);
+ randomize:
                                if (random32() & 1)
                                        goto retry;
                        }
                }
  
-               if (mdev->state.conn <= C_DISCONNECTING)
+               if (tconn->cstate <= C_DISCONNECTING)
                        goto out_release_sockets;
                if (signal_pending(current)) {
                        flush_signals(current);
                        smp_rmb();
-                       if (get_t_state(&mdev->receiver) == Exiting)
+                       if (get_t_state(&tconn->receiver) == EXITING)
                                goto out_release_sockets;
                }
  
-               if (sock && msock) {
-                       ok = drbd_socket_okay(mdev, &sock);
-                       ok = drbd_socket_okay(mdev, &msock) && ok;
-                       if (ok)
-                               break;
-               }
-       } while (1);
+               ok = drbd_socket_okay(&sock.socket);
+               ok = drbd_socket_okay(&msock.socket) && ok;
+       } while (!ok);
+       if (ad.s_listen)
+               sock_release(ad.s_listen);
  
-       msock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
-       sock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
 -      sock.socket->sk->sk_reuse = 1; /* SO_REUSEADDR */
 -      msock.socket->sk->sk_reuse = 1; /* SO_REUSEADDR */
++      sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
++      msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
  
-       sock->sk->sk_allocation = GFP_NOIO;
-       msock->sk->sk_allocation = GFP_NOIO;
+       sock.socket->sk->sk_allocation = GFP_NOIO;
+       msock.socket->sk->sk_allocation = GFP_NOIO;
  
-       sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
-       msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
+       sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
+       msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
  
        /* NOT YET ...
-        * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
-        * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
-        * first set it to the P_HAND_SHAKE timeout,
+        * sock.socket->sk->sk_sndtimeo = tconn->net_conf->timeout*HZ/10;
+        * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+        * first set it to the P_CONNECTION_FEATURES timeout,
         * which we set to 4x the configured ping_timeout. */
-       sock->sk->sk_sndtimeo =
-       sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;
+       rcu_read_lock();
+       nc = rcu_dereference(tconn->net_conf);
+       sock.socket->sk->sk_sndtimeo =
+       sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;
  
-       msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
-       msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
+       msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
+       timeout = nc->timeout * HZ / 10;
+       discard_my_data = nc->discard_my_data;
+       rcu_read_unlock();
+       msock.socket->sk->sk_sndtimeo = timeout;
  
        /* we don't want delays.
         * we use TCP_CORK where appropriate, though */
-       drbd_tcp_nodelay(sock);
-       drbd_tcp_nodelay(msock);
-       mdev->data.socket = sock;
-       mdev->meta.socket = msock;
-       mdev->last_received = jiffies;
+       drbd_tcp_nodelay(sock.socket);
+       drbd_tcp_nodelay(msock.socket);
  
-       D_ASSERT(mdev->asender.task == NULL);
+       tconn->data.socket = sock.socket;
+       tconn->meta.socket = msock.socket;
+       tconn->last_received = jiffies;
  
-       h = drbd_do_handshake(mdev);
+       h = drbd_do_features(tconn);
        if (h <= 0)
                return h;
  
-       if (mdev->cram_hmac_tfm) {
+       if (tconn->cram_hmac_tfm) {
                /* drbd_request_state(mdev, NS(conn, WFAuth)); */
-               switch (drbd_do_auth(mdev)) {
+               switch (drbd_do_auth(tconn)) {
                case -1:
-                       dev_err(DEV, "Authentication of peer failed\n");
+                       conn_err(tconn, "Authentication of peer failed\n");
                        return -1;
                case 0:
-                       dev_err(DEV, "Authentication of peer failed, trying again.\n");
+                       conn_err(tconn, "Authentication of peer failed, trying again.\n");
                        return 0;
                }
        }
  
-       sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
-       sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+       tconn->data.socket->sk->sk_sndtimeo = timeout;
+       tconn->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
  
-       atomic_set(&mdev->packet_seq, 0);
-       mdev->peer_seq = 0;
-       if (drbd_send_protocol(mdev) == -1)
+       if (drbd_send_protocol(tconn) == -EOPNOTSUPP)
                return -1;
-       drbd_set_flag(mdev, STATE_SENT);
-       drbd_send_sync_param(mdev, &mdev->sync_conf);
-       drbd_send_sizes(mdev, 0, 0);
-       drbd_send_uuids(mdev);
-       drbd_send_current_state(mdev);
-       drbd_clear_flag(mdev, USE_DEGR_WFC_T);
-       drbd_clear_flag(mdev, RESIZE_PENDING);
-       spin_lock_irq(&mdev->req_lock);
-       rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL);
-       if (mdev->state.conn != C_WF_REPORT_PARAMS)
-               drbd_clear_flag(mdev, STATE_SENT);
-       spin_unlock_irq(&mdev->req_lock);
-       if (rv < SS_SUCCESS)
+       set_bit(STATE_SENT, &tconn->flags);
+       rcu_read_lock();
+       idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+               kref_get(&mdev->kref);
+               rcu_read_unlock();
+               if (discard_my_data)
+                       set_bit(DISCARD_MY_DATA, &mdev->flags);
+               else
+                       clear_bit(DISCARD_MY_DATA, &mdev->flags);
+               drbd_connected(mdev);
+               kref_put(&mdev->kref, &drbd_minor_destroy);
+               rcu_read_lock();
+       }
+       rcu_read_unlock();
+       rv = conn_request_state(tconn, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
+       if (rv < SS_SUCCESS) {
+               clear_bit(STATE_SENT, &tconn->flags);
                return 0;
+       }
  
-       drbd_thread_start(&mdev->asender);
-       mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
+       drbd_thread_start(&tconn->asender);
  
-       return 1;
+       mutex_lock(&tconn->conf_update);
+       /* The discard_my_data flag is a single-shot modifier to the next
+        * connection attempt, the handshake of which is now well underway.
+        * No need for rcu style copying of the whole struct
+        * just to clear a single value. */
+       tconn->net_conf->discard_my_data = 0;
+       mutex_unlock(&tconn->conf_update);
+       return h;
  
  out_release_sockets:
-       if (sock)
-               sock_release(sock);
-       if (msock)
-               sock_release(msock);
+       if (ad.s_listen)
+               sock_release(ad.s_listen);
+       if (sock.socket)
+               sock_release(sock.socket);
+       if (msock.socket)
+               sock_release(msock.socket);
        return -1;
  }
  
- static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size)
+ static int decode_header(struct drbd_tconn *tconn, void *header, struct packet_info *pi)
  {
-       union p_header *h = &mdev->data.rbuf.header;
-       int r;
-       r = drbd_recv(mdev, h, sizeof(*h));
-       if (unlikely(r != sizeof(*h))) {
-               if (!signal_pending(current))
-                       dev_warn(DEV, "short read expecting header on sock: r=%d\n", r);
-               return false;
-       }
-       if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
-               *cmd = be16_to_cpu(h->h80.command);
-               *packet_size = be16_to_cpu(h->h80.length);
-       } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) {
-               *cmd = be16_to_cpu(h->h95.command);
-               *packet_size = be32_to_cpu(h->h95.length);
+       unsigned int header_size = drbd_header_size(tconn);
+       if (header_size == sizeof(struct p_header100) &&
+           *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
+               struct p_header100 *h = header;
+               if (h->pad != 0) {
+                       conn_err(tconn, "Header padding is not zero\n");
+                       return -EINVAL;
+               }
+               pi->vnr = be16_to_cpu(h->volume);
+               pi->cmd = be16_to_cpu(h->command);
+               pi->size = be32_to_cpu(h->length);
+       } else if (header_size == sizeof(struct p_header95) &&
+                  *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
+               struct p_header95 *h = header;
+               pi->cmd = be16_to_cpu(h->command);
+               pi->size = be32_to_cpu(h->length);
+               pi->vnr = 0;
+       } else if (header_size == sizeof(struct p_header80) &&
+                  *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
+               struct p_header80 *h = header;
+               pi->cmd = be16_to_cpu(h->command);
+               pi->size = be16_to_cpu(h->length);
+               pi->vnr = 0;
        } else {
-               dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n",
-                   be32_to_cpu(h->h80.magic),
-                   be16_to_cpu(h->h80.command),
-                   be16_to_cpu(h->h80.length));
-               return false;
+               conn_err(tconn, "Wrong magic value 0x%08x in protocol version %d\n",
+                        be32_to_cpu(*(__be32 *)header),
+                        tconn->agreed_pro_version);
+               return -EINVAL;
        }
-       mdev->last_received = jiffies;
+       pi->data = header + header_size;
+       return 0;
+ }
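
decode_header() tells the three on-wire header generations apart by the negotiated header size plus a magic check. A sketch of the layouts this implies (fields are big-endian on the wire; the authoritative definitions live in the DRBD protocol headers):

struct p_header80 {		/* agreed_pro_version < 95 */
	u32 magic;		/* DRBD_MAGIC */
	u16 command;
	u16 length;		/* bytes of data after this header */
} __packed;

struct p_header95 {		/* protocol 95..99 */
	u16 magic;		/* DRBD_MAGIC_BIG */
	u16 command;
	u32 length;
} __packed;

struct p_header100 {		/* protocol >= 100, multi-volume aware */
	u32 magic;		/* DRBD_MAGIC_100 */
	u16 volume;		/* which minor within the connection */
	u16 command;
	u32 length;
	u32 pad;		/* must be zero, checked above */
} __packed;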
  
-       return true;
+ static int drbd_recv_header(struct drbd_tconn *tconn, struct packet_info *pi)
+ {
+       void *buffer = tconn->data.rbuf;
+       int err;
+       err = drbd_recv_all_warn(tconn, buffer, drbd_header_size(tconn));
+       if (err)
+               return err;
+       err = decode_header(tconn, buffer, pi);
+       tconn->last_received = jiffies;
+       return err;
  }
  
- static void drbd_flush(struct drbd_conf *mdev)
+ static void drbd_flush(struct drbd_tconn *tconn)
  {
        int rv;
+       struct drbd_conf *mdev;
+       int vnr;
+       if (tconn->write_ordering >= WO_bdev_flush) {
+               rcu_read_lock();
+               idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+                       if (!get_ldev(mdev))
+                               continue;
+                       kref_get(&mdev->kref);
+                       rcu_read_unlock();
+                       rv = blkdev_issue_flush(mdev->ldev->backing_bdev,
+                                       GFP_NOIO, NULL);
+                       if (rv) {
+                               dev_info(DEV, "local disk flush failed with status %d\n", rv);
+                               /* would rather check on EOPNOTSUPP, but that is not reliable.
+                                * don't try again for ANY return value != 0
+                                * if (rv == -EOPNOTSUPP) */
+                               drbd_bump_write_ordering(tconn, WO_drain_io);
+                       }
+                       put_ldev(mdev);
+                       kref_put(&mdev->kref, &drbd_minor_destroy);
  
-       if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
-               rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_NOIO,
-                                       NULL);
-               if (rv) {
-                       dev_info(DEV, "local disk flush failed with status %d\n", rv);
-                       /* would rather check on EOPNOTSUPP, but that is not reliable.
-                        * don't try again for ANY return value != 0
-                        * if (rv == -EOPNOTSUPP) */
-                       drbd_bump_write_ordering(mdev, WO_drain_io);
+                       rcu_read_lock();
+                       if (rv)
+                               break;
                }
-               put_ldev(mdev);
+               rcu_read_unlock();
        }
  }
  
  /**
   * drbd_may_finish_epoch() - Applies an epoch event to the epoch's state, eventually finishes it
   * @tconn:    DRBD connection.
   * @epoch:    Epoch object.
   * @ev:               Epoch event.
   */
- static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
+ static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *tconn,
                                               struct drbd_epoch *epoch,
                                               enum epoch_event ev)
  {
        int epoch_size;
        struct drbd_epoch *next_epoch;
        enum finish_epoch rv = FE_STILL_LIVE;
  
-       spin_lock(&mdev->epoch_lock);
+       spin_lock(&tconn->epoch_lock);
        do {
                next_epoch = NULL;
  
                    atomic_read(&epoch->active) == 0 &&
                    (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
                        if (!(ev & EV_CLEANUP)) {
-                               spin_unlock(&mdev->epoch_lock);
-                               drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
-                               spin_lock(&mdev->epoch_lock);
+                               spin_unlock(&tconn->epoch_lock);
+                               drbd_send_b_ack(epoch->tconn, epoch->barrier_nr, epoch_size);
+                               spin_lock(&tconn->epoch_lock);
                        }
+ #if 0
+                       /* FIXME: dec unacked on connection, once we have
+                        * something to count pending connection packets in. */
                        if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
-                               dec_unacked(mdev);
+                               dec_unacked(epoch->tconn);
+ #endif
  
-                       if (mdev->current_epoch != epoch) {
+                       if (tconn->current_epoch != epoch) {
                                next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
                                list_del(&epoch->list);
                                ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
-                               mdev->epochs--;
+                               tconn->epochs--;
                                kfree(epoch);
  
                                if (rv == FE_STILL_LIVE)
                                        rv = FE_DESTROYED;
                        } else {
                                epoch->flags = 0;
                                atomic_set(&epoch->epoch_size, 0);
                                /* atomic_set(&epoch->active, 0); is already zero */
                                if (rv == FE_STILL_LIVE)
                                        rv = FE_RECYCLED;
-                               wake_up(&mdev->ee_wait);
                        }
                }
  
                epoch = next_epoch;
        } while (1);
  
-       spin_unlock(&mdev->epoch_lock);
+       spin_unlock(&tconn->epoch_lock);
  
        return rv;
  }
  
  /**
  * drbd_bump_write_ordering() - Fall back to another write ordering method
-  * @mdev:     DRBD device.
+  * @tconn:    DRBD connection.
   * @wo:               Write ordering method to try.
   */
- void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
+ void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo)
  {
+       struct disk_conf *dc;
+       struct drbd_conf *mdev;
        enum write_ordering_e pwo;
+       int vnr;
        static char *write_ordering_str[] = {
                [WO_none] = "none",
                [WO_drain_io] = "drain",
                [WO_bdev_flush] = "flush",
        };
  
-       pwo = mdev->write_ordering;
+       pwo = tconn->write_ordering;
        wo = min(pwo, wo);
-       if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
-               wo = WO_drain_io;
-       if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
-               wo = WO_none;
-       mdev->write_ordering = wo;
-       if (pwo != mdev->write_ordering || wo == WO_bdev_flush)
-               dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
+       rcu_read_lock();
+       idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+               if (!get_ldev_if_state(mdev, D_ATTACHING))
+                       continue;
+               dc = rcu_dereference(mdev->ldev->disk_conf);
+               if (wo == WO_bdev_flush && !dc->disk_flushes)
+                       wo = WO_drain_io;
+               if (wo == WO_drain_io && !dc->disk_drain)
+                       wo = WO_none;
+               put_ldev(mdev);
+       }
+       rcu_read_unlock();
+       tconn->write_ordering = wo;
+       if (pwo != tconn->write_ordering || wo == WO_bdev_flush)
+               conn_info(tconn, "Method to ensure write ordering: %s\n", write_ordering_str[tconn->write_ordering]);
  }
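
The min(pwo, wo) above works because the write-ordering methods are declared in increasing order of strength (matching the write_ordering_str indices), so a method can only ever be downgraded at runtime, never upgraded. A sketch of the assumed enum:

/* Sketch: declaration order matters, min() relies on it. */
enum write_ordering_e {
	WO_none,	/* rely on nothing */
	WO_drain_io,	/* wait for all pending local I/O to drain */
	WO_bdev_flush,	/* additionally issue explicit block-device flushes */
};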
  
  /**
-  * drbd_submit_ee()
+  * drbd_submit_peer_request()
   * @mdev:     DRBD device.
-  * @e:                epoch entry
+  * @peer_req: peer request
   * @rw:               flag field, see bio->bi_rw
   *
   * May spread the pages to multiple bios,
   *  on certain Xen deployments.
   */
  /* TODO allocate from our own bio_set. */
- int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
-               const unsigned rw, const int fault_type)
+ int drbd_submit_peer_request(struct drbd_conf *mdev,
+                            struct drbd_peer_request *peer_req,
+                            const unsigned rw, const int fault_type)
  {
        struct bio *bios = NULL;
        struct bio *bio;
-       struct page *page = e->pages;
-       sector_t sector = e->sector;
-       unsigned ds = e->size;
+       struct page *page = peer_req->pages;
+       sector_t sector = peer_req->i.sector;
+       unsigned ds = peer_req->i.size;
        unsigned n_bios = 0;
        unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
        int err = -ENOMEM;
@@@ -1111,12 -1320,12 +1320,12 @@@ next_bio
                dev_err(DEV, "submit_ee: Allocation of a bio failed\n");
                goto fail;
        }
-       /* > e->sector, unless this is the first bio */
+       /* > peer_req->i.sector, unless this is the first bio */
        bio->bi_sector = sector;
        bio->bi_bdev = mdev->ldev->backing_bdev;
        bio->bi_rw = rw;
-       bio->bi_private = e;
-       bio->bi_end_io = drbd_endio_sec;
+       bio->bi_private = peer_req;
+       bio->bi_end_io = drbd_peer_request_endio;
  
        bio->bi_next = bios;
        bios = bio;
        D_ASSERT(page == NULL);
        D_ASSERT(ds == 0);
  
-       atomic_set(&e->pending_bios, n_bios);
+       atomic_set(&peer_req->pending_bios, n_bios);
        do {
                bio = bios;
                bios = bios->bi_next;
@@@ -1164,26 -1373,57 +1373,57 @@@ fail
        return err;
  }
  
- static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+ static void drbd_remove_epoch_entry_interval(struct drbd_conf *mdev,
+                                            struct drbd_peer_request *peer_req)
+ {
+       struct drbd_interval *i = &peer_req->i;
+       drbd_remove_interval(&mdev->write_requests, i);
+       drbd_clear_interval(i);
+       /* Wake up any processes waiting for this peer request to complete.  */
+       if (i->waiting)
+               wake_up(&mdev->misc_wait);
+ }
+ void conn_wait_active_ee_empty(struct drbd_tconn *tconn)
+ {
+       struct drbd_conf *mdev;
+       int vnr;
+       rcu_read_lock();
+       idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+               kref_get(&mdev->kref);
+               rcu_read_unlock();
+               drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
+               kref_put(&mdev->kref, &drbd_minor_destroy);
+               rcu_read_lock();
+       }
+       rcu_read_unlock();
+ }
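
conn_wait_active_ee_empty() shows the iteration idiom this patch uses wherever per-volume work may sleep: pin the device with a kref, drop the RCU read lock, do the work, then re-lock before the idr iterator advances. A condensed sketch (the helper name is illustrative only):

static void for_each_volume_sleeping(struct drbd_tconn *tconn)
{
	struct drbd_conf *mdev;
	int vnr;

	rcu_read_lock();
	idr_for_each_entry(&tconn->volumes, mdev, vnr) {
		kref_get(&mdev->kref);	/* keep mdev alive across the unlock */
		rcu_read_unlock();	/* so the work below may sleep */
		/* ... per-volume work that may sleep ... */
		kref_put(&mdev->kref, &drbd_minor_destroy);
		rcu_read_lock();	/* re-lock before touching the idr again */
	}
	rcu_read_unlock();
}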
+ static int receive_Barrier(struct drbd_tconn *tconn, struct packet_info *pi)
  {
        int rv;
-       struct p_barrier *p = &mdev->data.rbuf.barrier;
+       struct p_barrier *p = pi->data;
        struct drbd_epoch *epoch;
  
-       inc_unacked(mdev);
-       mdev->current_epoch->barrier_nr = p->barrier;
-       rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
+       /* FIXME these are unacked on connection,
+        * not a specific (peer)device.
+        */
+       tconn->current_epoch->barrier_nr = p->barrier;
+       tconn->current_epoch->tconn = tconn;
+       rv = drbd_may_finish_epoch(tconn, tconn->current_epoch, EV_GOT_BARRIER_NR);
  
        /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
         * the activity log, which means it would not be resynced in case the
         * R_PRIMARY crashes now.
         * Therefore we must send the barrier_ack after the barrier request was
         * completed. */
-       switch (mdev->write_ordering) {
+       switch (tconn->write_ordering) {
        case WO_none:
                if (rv == FE_RECYCLED)
-                       return true;
+                       return 0;
  
                /* receiver context, in the writeout path of the other node.
                 * avoid potential distributed deadlock */
                epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
                if (epoch)
                        break;
                else
-                       dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
+                       conn_warn(tconn, "Allocation of an epoch failed, slowing down\n");
                        /* Fall through */
  
        case WO_bdev_flush:
        case WO_drain_io:
-               drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
-               drbd_flush(mdev);
+               conn_wait_active_ee_empty(tconn);
+               drbd_flush(tconn);
  
-               if (atomic_read(&mdev->current_epoch->epoch_size)) {
+               if (atomic_read(&tconn->current_epoch->epoch_size)) {
                        epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
                        if (epoch)
                                break;
                }
  
-               epoch = mdev->current_epoch;
-               wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0);
-               D_ASSERT(atomic_read(&epoch->active) == 0);
-               D_ASSERT(epoch->flags == 0);
-               return true;
+               return 0;
        default:
-               dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering);
-               return false;
+               conn_err(tconn, "Strangeness in tconn->write_ordering %d\n", tconn->write_ordering);
+               return -EIO;
        }
  
        epoch->flags = 0;
        atomic_set(&epoch->epoch_size, 0);
        atomic_set(&epoch->active, 0);
  
-       spin_lock(&mdev->epoch_lock);
-       if (atomic_read(&mdev->current_epoch->epoch_size)) {
-               list_add(&epoch->list, &mdev->current_epoch->list);
-               mdev->current_epoch = epoch;
-               mdev->epochs++;
+       spin_lock(&tconn->epoch_lock);
+       if (atomic_read(&tconn->current_epoch->epoch_size)) {
+               list_add(&epoch->list, &tconn->current_epoch->list);
+               tconn->current_epoch = epoch;
+               tconn->epochs++;
        } else {
                /* The current_epoch got recycled while we allocated this one... */
                kfree(epoch);
        }
-       spin_unlock(&mdev->epoch_lock);
+       spin_unlock(&tconn->epoch_lock);
  
-       return true;
+       return 0;
  }
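
receive_Barrier() manipulates tconn->current_epoch and the epoch counters; a sketch of the epoch object this implies (the patch adds the tconn back-pointer so drbd_may_finish_epoch() can send the barrier ack on the right connection):

/* Sketch of struct drbd_epoch as used in this file. */
struct drbd_epoch {
	struct drbd_tconn *tconn;	/* connection to send P_BARRIER_ACK on */
	struct list_head list;
	unsigned int barrier_nr;
	atomic_t epoch_size;		/* increased for every request added */
	atomic_t active;		/* increased on add, decreased on completion */
	unsigned long flags;		/* DE_* bits, e.g. DE_HAVE_BARRIER_NUMBER */
};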
  
  /* used from receive_RSDataReply (recv_resync_read)
   * and from receive_Data */
- static struct drbd_epoch_entry *
- read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
+ static struct drbd_peer_request *
+ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector,
+             int data_size) __must_hold(local)
  {
        const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
-       struct drbd_epoch_entry *e;
+       struct drbd_peer_request *peer_req;
        struct page *page;
-       int dgs, ds, rr;
-       void *dig_in = mdev->int_dig_in;
-       void *dig_vv = mdev->int_dig_vv;
+       int dgs, ds, err;
+       void *dig_in = mdev->tconn->int_dig_in;
+       void *dig_vv = mdev->tconn->int_dig_vv;
        unsigned long *data;
  
-       dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
-               crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
-       if (dgs) {
-               rr = drbd_recv(mdev, dig_in, dgs);
-               if (rr != dgs) {
-                       if (!signal_pending(current))
-                               dev_warn(DEV,
-                                       "short read receiving data digest: read %d expected %d\n",
-                                       rr, dgs);
+       dgs = 0;
+       if (mdev->tconn->peer_integrity_tfm) {
+               dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm);
+               /*
+                * FIXME: Receive the incoming digest into the receive buffer
+                *        here, together with its struct p_data?
+                */
+               err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs);
+               if (err)
                        return NULL;
-               }
+               data_size -= dgs;
        }
  
-       data_size -= dgs;
-       ERR_IF(data_size &  0x1ff) return NULL;
-       ERR_IF(data_size >  DRBD_MAX_BIO_SIZE) return NULL;
+       if (!expect(IS_ALIGNED(data_size, 512)))
+               return NULL;
+       if (!expect(data_size <= DRBD_MAX_BIO_SIZE))
+               return NULL;
  
        /* even though we trust our peer,
         * we sometimes have to double check. */
        /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
         * "criss-cross" setup, that might cause write-out on some other DRBD,
         * which in turn might block on the other node at this very place.  */
-       e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
-       if (!e)
+       peer_req = drbd_alloc_peer_req(mdev, id, sector, data_size, GFP_NOIO);
+       if (!peer_req)
                return NULL;
  
        if (!data_size)
-               return e;
+               return peer_req;
  
        ds = data_size;
-       page = e->pages;
+       page = peer_req->pages;
        page_chain_for_each(page) {
                unsigned len = min_t(int, ds, PAGE_SIZE);
                data = kmap(page);
-               rr = drbd_recv(mdev, data, len);
+               err = drbd_recv_all_warn(mdev->tconn, data, len);
                if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) {
                        dev_err(DEV, "Fault injection: Corrupting data on receive\n");
                        data[0] = data[0] ^ (unsigned long)-1;
                }
                kunmap(page);
-               if (rr != len) {
-                       drbd_free_ee(mdev, e);
-                       if (!signal_pending(current))
-                               dev_warn(DEV, "short read receiving data: read %d expected %d\n",
-                               rr, len);
+               if (err) {
+                       drbd_free_peer_req(mdev, peer_req);
                        return NULL;
                }
-               ds -= rr;
+               ds -= len;
        }
  
        if (dgs) {
-               drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv);
+               drbd_csum_ee(mdev, mdev->tconn->peer_integrity_tfm, peer_req, dig_vv);
                if (memcmp(dig_in, dig_vv, dgs)) {
                        dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n",
                                (unsigned long long)sector, data_size);
-                       drbd_bcast_ee(mdev, "digest failed",
-                                       dgs, dig_in, dig_vv, e);
-                       drbd_free_ee(mdev, e);
+                       drbd_free_peer_req(mdev, peer_req);
                        return NULL;
                }
        }
        mdev->recv_cnt += data_size>>9;
-       return e;
+       return peer_req;
  }
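
Note that the data_size passed into read_in_block() still includes the integrity digest when a peer_integrity_tfm is configured. For example, assuming the peers negotiated sha1 (20-byte digest) for a 4 KiB write, 4096 + 20 bytes follow the data header on the wire; the function first drains the 20 digest bytes into dig_in, leaving data_size = 4096, which then satisfies the IS_ALIGNED(data_size, 512) sanity check on the sector-sized payload.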
  
  /* drbd_drain_block() just takes a data block
   * out of the socket input buffer, and discards it.
   */
  static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
  {
        struct page *page;
-       int rr, rv = 1;
+       int err = 0;
        void *data;
  
        if (!data_size)
-               return true;
+               return 0;
  
-       page = drbd_pp_alloc(mdev, 1, 1);
+       page = drbd_alloc_pages(mdev, 1, 1);
  
        data = kmap(page);
        while (data_size) {
-               rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
-               if (rr != min_t(int, data_size, PAGE_SIZE)) {
-                       rv = 0;
-                       if (!signal_pending(current))
-                               dev_warn(DEV,
-                                       "short read receiving data: read %d expected %d\n",
-                                       rr, min_t(int, data_size, PAGE_SIZE));
+               unsigned int len = min_t(int, data_size, PAGE_SIZE);
+               err = drbd_recv_all_warn(mdev->tconn, data, len);
+               if (err)
                        break;
-               }
-               data_size -= rr;
+               data_size -= len;
        }
        kunmap(page);
-       drbd_pp_free(mdev, page, 0);
-       return rv;
+       drbd_free_pages(mdev, page, 0);
+       return err;
  }
  
  static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
                             sector_t sector, int data_size)
  {
        struct bio_vec *bvec;
        struct bio *bio;
-       int dgs, rr, i, expect;
-       void *dig_in = mdev->int_dig_in;
-       void *dig_vv = mdev->int_dig_vv;
-       dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
-               crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
+       int dgs, err, i, expect;
+       void *dig_in = mdev->tconn->int_dig_in;
+       void *dig_vv = mdev->tconn->int_dig_vv;
  
-       if (dgs) {
-               rr = drbd_recv(mdev, dig_in, dgs);
-               if (rr != dgs) {
-                       if (!signal_pending(current))
-                               dev_warn(DEV,
-                                       "short read receiving data reply digest: read %d expected %d\n",
-                                       rr, dgs);
-                       return 0;
-               }
+       dgs = 0;
+       if (mdev->tconn->peer_integrity_tfm) {
+               dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm);
+               err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs);
+               if (err)
+                       return err;
+               data_size -= dgs;
        }
  
-       data_size -= dgs;
        /* optimistically update recv_cnt.  if receiving fails below,
         * we disconnect anyways, and counters will be reset. */
        mdev->recv_cnt += data_size>>9;
        bio = req->master_bio;
        D_ASSERT(sector == bio->bi_sector);
  
        bio_for_each_segment(bvec, bio, i) {
+               void *mapped = kmap(bvec->bv_page) + bvec->bv_offset;
                expect = min_t(int, data_size, bvec->bv_len);
-               rr = drbd_recv(mdev,
-                            kmap(bvec->bv_page)+bvec->bv_offset,
-                            expect);
+               err = drbd_recv_all_warn(mdev->tconn, mapped, expect);
                kunmap(bvec->bv_page);
-               if (rr != expect) {
-                       if (!signal_pending(current))
-                               dev_warn(DEV, "short read receiving data reply: "
-                                       "read %d expected %d\n",
-                                       rr, expect);
-                       return 0;
-               }
-               data_size -= rr;
+               if (err)
+                       return err;
+               data_size -= expect;
        }
  
        if (dgs) {
-               drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv);
+               drbd_csum_bio(mdev, mdev->tconn->peer_integrity_tfm, bio, dig_vv);
                if (memcmp(dig_in, dig_vv, dgs)) {
                        dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
-                       return 0;
+                       return -EINVAL;
                }
        }
  
        D_ASSERT(data_size == 0);
-       return 1;
+       return 0;
  }
  
- /* e_end_resync_block() is called via
-  * drbd_process_done_ee() by asender only */
- static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+ /*
+  * e_end_resync_block() is called in asender context via
+  * drbd_finish_peer_reqs().
+  */
+ static int e_end_resync_block(struct drbd_work *w, int unused)
  {
-       struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
-       sector_t sector = e->sector;
-       int ok;
+       struct drbd_peer_request *peer_req =
+               container_of(w, struct drbd_peer_request, w);
+       struct drbd_conf *mdev = w->mdev;
+       sector_t sector = peer_req->i.sector;
+       int err;
  
-       D_ASSERT(hlist_unhashed(&e->collision));
+       D_ASSERT(drbd_interval_empty(&peer_req->i));
  
-       if (likely((e->flags & EE_WAS_ERROR) == 0)) {
-               drbd_set_in_sync(mdev, sector, e->size);
-               ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
+       if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+               drbd_set_in_sync(mdev, sector, peer_req->i.size);
+               err = drbd_send_ack(mdev, P_RS_WRITE_ACK, peer_req);
        } else {
                /* Record failure to sync */
-               drbd_rs_failed_io(mdev, sector, e->size);
+               drbd_rs_failed_io(mdev, sector, peer_req->i.size);
  
-               ok  = drbd_send_ack(mdev, P_NEG_ACK, e);
+               err  = drbd_send_ack(mdev, P_NEG_ACK, peer_req);
        }
        dec_unacked(mdev);
  
-       return ok;
+       return err;
  }
  
  static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
  {
-       struct drbd_epoch_entry *e;
+       struct drbd_peer_request *peer_req;
  
-       e = read_in_block(mdev, ID_SYNCER, sector, data_size);
-       if (!e)
+       peer_req = read_in_block(mdev, ID_SYNCER, sector, data_size);
+       if (!peer_req)
                goto fail;
  
        dec_rs_pending(mdev);
        /* corresponding dec_unacked() in e_end_resync_block()
         * respective _drbd_clear_done_ee */
  
-       e->w.cb = e_end_resync_block;
+       peer_req->w.cb = e_end_resync_block;
  
-       spin_lock_irq(&mdev->req_lock);
-       list_add(&e->w.list, &mdev->sync_ee);
-       spin_unlock_irq(&mdev->req_lock);
+       spin_lock_irq(&mdev->tconn->req_lock);
+       list_add(&peer_req->w.list, &mdev->sync_ee);
+       spin_unlock_irq(&mdev->tconn->req_lock);
  
        atomic_add(data_size >> 9, &mdev->rs_sect_ev);
-       if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
-               return true;
+       if (drbd_submit_peer_request(mdev, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
+               return 0;
  
        /* don't care for the reason here */
        dev_err(DEV, "submit failed, triggering re-connect\n");
-       spin_lock_irq(&mdev->req_lock);
-       list_del(&e->w.list);
-       spin_unlock_irq(&mdev->req_lock);
+       spin_lock_irq(&mdev->tconn->req_lock);
+       list_del(&peer_req->w.list);
+       spin_unlock_irq(&mdev->tconn->req_lock);
  
-       drbd_free_ee(mdev, e);
+       drbd_free_peer_req(mdev, peer_req);
  fail:
        put_ldev(mdev);
-       return false;
+       return -EIO;
+ }
+ static struct drbd_request *
+ find_request(struct drbd_conf *mdev, struct rb_root *root, u64 id,
+            sector_t sector, bool missing_ok, const char *func)
+ {
+       struct drbd_request *req;
+       /* Request object according to our peer */
+       req = (struct drbd_request *)(unsigned long)id;
+       if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
+               return req;
+       if (!missing_ok) {
+               dev_err(DEV, "%s: failed to find request 0x%lx, sector %llus\n", func,
+                       (unsigned long)id, (unsigned long long)sector);
+       }
+       return NULL;
  }
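
find_request() relies on DRBD's convention of using the kernel address of the request as the opaque block_id: the peer echoes it back in its reply, and the interval-tree lookup merely verifies that the echoed pointer still denotes a live local request covering that sector. A hedged sketch of the sending side of the round-trip (simplified from the data-packet send path):

/* Sketch only: how block_id is populated before a data packet goes out. */
p->sector   = cpu_to_be64(req->i.sector);
p->block_id = (unsigned long)req;	/* opaque cookie, echoed by the peer */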
  
- static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+ static int receive_DataReply(struct drbd_tconn *tconn, struct packet_info *pi)
  {
+       struct drbd_conf *mdev;
        struct drbd_request *req;
        sector_t sector;
-       int ok;
-       struct p_data *p = &mdev->data.rbuf.data;
+       int err;
+       struct p_data *p = pi->data;
+       mdev = vnr_to_mdev(tconn, pi->vnr);
+       if (!mdev)
+               return -EIO;
  
        sector = be64_to_cpu(p->sector);
  
-       spin_lock_irq(&mdev->req_lock);
-       req = _ar_id_to_req(mdev, p->block_id, sector);
-       spin_unlock_irq(&mdev->req_lock);
-       if (unlikely(!req)) {
-               dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
-               return false;
-       }
+       spin_lock_irq(&mdev->tconn->req_lock);
+       req = find_request(mdev, &mdev->read_requests, p->block_id, sector, false, __func__);
+       spin_unlock_irq(&mdev->tconn->req_lock);
+       if (unlikely(!req))
+               return -EIO;
  
        /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
         * special casing it there for the various failure cases.
         * still no race with drbd_fail_pending_reads */
-       ok = recv_dless_read(mdev, req, sector, data_size);
-       if (ok)
-               req_mod(req, data_received);
+       err = recv_dless_read(mdev, req, sector, pi->size);
+       if (!err)
+               req_mod(req, DATA_RECEIVED);
        /* else: nothing. handled from drbd_disconnect...
         * I don't think we may complete this just yet
         * in case we are "on-disconnect: freeze" */
  
-       return ok;
+       return err;
  }
  
- static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+ static int receive_RSDataReply(struct drbd_tconn *tconn, struct packet_info *pi)
  {
+       struct drbd_conf *mdev;
        sector_t sector;
-       int ok;
-       struct p_data *p = &mdev->data.rbuf.data;
+       int err;
+       struct p_data *p = pi->data;
+       mdev = vnr_to_mdev(tconn, pi->vnr);
+       if (!mdev)
+               return -EIO;
  
        sector = be64_to_cpu(p->sector);
        D_ASSERT(p->block_id == ID_SYNCER);
        if (get_ldev(mdev)) {
                /* data is submitted to disk within recv_resync_read.
                 * corresponding put_ldev done below on error,
-                * or in drbd_endio_write_sec. */
-               ok = recv_resync_read(mdev, sector, data_size);
+                * or in drbd_peer_request_endio. */
+               err = recv_resync_read(mdev, sector, pi->size);
        } else {
                if (__ratelimit(&drbd_ratelimit_state))
                        dev_err(DEV, "Can not write resync data to local disk.\n");
  
-               ok = drbd_drain_block(mdev, data_size);
+               err = drbd_drain_block(mdev, pi->size);
  
-               drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
+               drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size);
        }
  
-       atomic_add(data_size >> 9, &mdev->rs_sect_in);
+       atomic_add(pi->size >> 9, &mdev->rs_sect_in);
  
-       return ok;
+       return err;
  }
  
- /* e_end_block() is called via drbd_process_done_ee().
-  * this means this function only runs in the asender thread
-  */
- static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+ static void restart_conflicting_writes(struct drbd_conf *mdev,
+                                      sector_t sector, int size)
  {
-       struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
-       sector_t sector = e->sector;
-       int ok = 1, pcmd;
+       struct drbd_interval *i;
+       struct drbd_request *req;
+       drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
+               if (!i->local)
+                       continue;
+               req = container_of(i, struct drbd_request, i);
+               if (req->rq_state & RQ_LOCAL_PENDING ||
+                   !(req->rq_state & RQ_POSTPONED))
+                       continue;
+               /* as it is RQ_POSTPONED, this will cause it to
+                * be queued on the retry workqueue. */
+               __req_mod(req, CONFLICT_RESOLVED, NULL);
+       }
+ }
  
-       if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
-               if (likely((e->flags & EE_WAS_ERROR) == 0)) {
+ /*
+  * e_end_block() is called in asender context via drbd_finish_peer_reqs().
+  */
+ static int e_end_block(struct drbd_work *w, int cancel)
+ {
+       struct drbd_peer_request *peer_req =
+               container_of(w, struct drbd_peer_request, w);
+       struct drbd_conf *mdev = w->mdev;
+       sector_t sector = peer_req->i.sector;
+       int err = 0, pcmd;
+       if (peer_req->flags & EE_SEND_WRITE_ACK) {
+               if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
                        pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
                                mdev->state.conn <= C_PAUSED_SYNC_T &&
-                               e->flags & EE_MAY_SET_IN_SYNC) ?
+                               peer_req->flags & EE_MAY_SET_IN_SYNC) ?
                                P_RS_WRITE_ACK : P_WRITE_ACK;
-                       ok &= drbd_send_ack(mdev, pcmd, e);
+                       err = drbd_send_ack(mdev, pcmd, peer_req);
                        if (pcmd == P_RS_WRITE_ACK)
-                               drbd_set_in_sync(mdev, sector, e->size);
+                               drbd_set_in_sync(mdev, sector, peer_req->i.size);
                } else {
-                       ok  = drbd_send_ack(mdev, P_NEG_ACK, e);
+                       err = drbd_send_ack(mdev, P_NEG_ACK, peer_req);
                        /* we expect it to be marked out of sync anyways...
                         * maybe assert this?  */
                }
        }
        /* we delete from the conflict detection hash _after_ we sent out the
         * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
-       if (mdev->net_conf->two_primaries) {
-               spin_lock_irq(&mdev->req_lock);
-               D_ASSERT(!hlist_unhashed(&e->collision));
-               hlist_del_init(&e->collision);
-               spin_unlock_irq(&mdev->req_lock);
-       } else {
-               D_ASSERT(hlist_unhashed(&e->collision));
-       }
+       if (peer_req->flags & EE_IN_INTERVAL_TREE) {
+               spin_lock_irq(&mdev->tconn->req_lock);
+               D_ASSERT(!drbd_interval_empty(&peer_req->i));
+               drbd_remove_epoch_entry_interval(mdev, peer_req);
+               if (peer_req->flags & EE_RESTART_REQUESTS)
+                       restart_conflicting_writes(mdev, sector, peer_req->i.size);
+               spin_unlock_irq(&mdev->tconn->req_lock);
+       } else
+               D_ASSERT(drbd_interval_empty(&peer_req->i));
+       drbd_may_finish_epoch(mdev->tconn, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
+       return err;
+ }
+ static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
+ {
+       struct drbd_conf *mdev = w->mdev;
+       struct drbd_peer_request *peer_req =
+               container_of(w, struct drbd_peer_request, w);
+       int err;
+       err = drbd_send_ack(mdev, ack, peer_req);
+       dec_unacked(mdev);
+       return err;
+ }
+ static int e_send_superseded(struct drbd_work *w, int unused)
+ {
+       return e_send_ack(w, P_SUPERSEDED);
+ }
+ static int e_send_retry_write(struct drbd_work *w, int unused)
+ {
+       struct drbd_tconn *tconn = w->mdev->tconn;
+       return e_send_ack(w, tconn->agreed_pro_version >= 100 ?
+                            P_RETRY_WRITE : P_SUPERSEDED);
+ }
  
-       drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
+ static bool seq_greater(u32 a, u32 b)
+ {
+       /*
+        * We assume 32-bit wrap-around here.
+        * For 24-bit wrap-around, we would have to shift:
+        *  a <<= 8; b <<= 8;
+        */
+       return (s32)a - (s32)b > 0;
+ }
  
-       return ok;
+ static u32 seq_max(u32 a, u32 b)
+ {
+       return seq_greater(a, b) ? a : b;
  }
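
The signed subtraction in seq_greater() handles 32-bit wrap-around: for example, seq_greater(1, 0xffffffff) evaluates (s32)1 - (s32)0xffffffff = 1 - (-1) = 2 > 0, so sequence number 1 is correctly considered newer than 0xffffffff even though it is numerically smaller. A few illustrative checks one could write (hypothetical test snippet):

BUG_ON(!seq_greater(1, 0xffffffffU));	/* 1 comes "after" the wrap */
BUG_ON(seq_greater(0xffffffffU, 1));	/* and not the other way around */
BUG_ON(seq_max(0xffffffffU, 1) != 1);	/* seq_max() follows the same ordering */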
  
- static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+ static bool need_peer_seq(struct drbd_conf *mdev)
  {
-       struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
-       int ok = 1;
+       struct drbd_tconn *tconn = mdev->tconn;
+       int tp;
  
-       D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
-       ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
+       /*
+        * We only need to keep track of the last packet_seq number of our peer
+        * if we are in dual-primary mode and we have the resolve-conflicts flag set; see
+        * handle_write_conflicts().
+        */
  
-       spin_lock_irq(&mdev->req_lock);
-       D_ASSERT(!hlist_unhashed(&e->collision));
-       hlist_del_init(&e->collision);
-       spin_unlock_irq(&mdev->req_lock);
+       rcu_read_lock();
+       tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries;
+       rcu_read_unlock();
  
-       dec_unacked(mdev);
+       return tp && test_bit(RESOLVE_CONFLICTS, &tconn->flags);
+ }
  
-       return ok;
+ static void update_peer_seq(struct drbd_conf *mdev, unsigned int peer_seq)
+ {
+       unsigned int newest_peer_seq;
+       if (need_peer_seq(mdev)) {
+               spin_lock(&mdev->peer_seq_lock);
+               newest_peer_seq = seq_max(mdev->peer_seq, peer_seq);
+               mdev->peer_seq = newest_peer_seq;
+               spin_unlock(&mdev->peer_seq_lock);
+               /* wake up only if we actually changed mdev->peer_seq */
+               if (peer_seq == newest_peer_seq)
+                       wake_up(&mdev->seq_wait);
+       }
  }
  
- static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e)
+ static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
  {
+       return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
+ }
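
overlaps() mixes units: s1 and s2 are 512-byte sector numbers while l1 and l2 are byte lengths, hence the >> 9 shifts. For example, a 4 KiB request at sector 0 covers sectors [0, 8), so overlaps(0, 4096, 8, 4096) is false (the ranges merely touch), while overlaps(0, 4096, 7, 4096) is true.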
  
-       struct drbd_epoch_entry *rs_e;
+ /* maybe change sync_ee into interval trees as well? */
+ static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_peer_request *peer_req)
+ {
+       struct drbd_peer_request *rs_req;
        bool rv = 0;
  
-       spin_lock_irq(&mdev->req_lock);
-       list_for_each_entry(rs_e, &mdev->sync_ee, w.list) {
-               if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) {
+       spin_lock_irq(&mdev->tconn->req_lock);
+       list_for_each_entry(rs_req, &mdev->sync_ee, w.list) {
+               if (overlaps(peer_req->i.sector, peer_req->i.size,
+                            rs_req->i.sector, rs_req->i.size)) {
                        rv = 1;
                        break;
                }
        }
-       spin_unlock_irq(&mdev->req_lock);
+       spin_unlock_irq(&mdev->tconn->req_lock);
  
        return rv;
  }
   *
   * returns 0 if we may process the packet,
   * -ERESTARTSYS if we were interrupted (by disconnect signal). */
- static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
+ static int wait_for_and_update_peer_seq(struct drbd_conf *mdev, const u32 peer_seq)
  {
        DEFINE_WAIT(wait);
-       unsigned int p_seq;
        long timeout;
-       int ret = 0;
+       int ret;
+
+       if (!need_peer_seq(mdev))
+               return 0;
+
        spin_lock(&mdev->peer_seq_lock);
        for (;;) {
-               prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
-               if (seq_le(packet_seq, mdev->peer_seq+1))
+               if (!seq_greater(peer_seq - 1, mdev->peer_seq)) {
+                       mdev->peer_seq = seq_max(mdev->peer_seq, peer_seq);
+                       ret = 0;
                        break;
+               }
                if (signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        break;
                }
-               p_seq = mdev->peer_seq;
+               prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
                spin_unlock(&mdev->peer_seq_lock);
-               timeout = schedule_timeout(30*HZ);
+               rcu_read_lock();
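+               /* ping_timeo is configured in tenths of a second; convert to jiffies */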
+               timeout = rcu_dereference(mdev->tconn->net_conf)->ping_timeo*HZ/10;
+               rcu_read_unlock();
+               timeout = schedule_timeout(timeout);
                spin_lock(&mdev->peer_seq_lock);
-               if (timeout == 0 && p_seq == mdev->peer_seq) {
+               if (!timeout) {
                        ret = -ETIMEDOUT;
-                       dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n");
+                       dev_err(DEV, "Timed out waiting for missing ack packets; disconnecting\n");
                        break;
                }
        }
-       finish_wait(&mdev->seq_wait, &wait);
-       if (mdev->peer_seq+1 == packet_seq)
-               mdev->peer_seq++;
        spin_unlock(&mdev->peer_seq_lock);
+       finish_wait(&mdev->seq_wait, &wait);
        return ret;
  }
  
@@@ -1675,233 -2005,277 +2005,277 @@@ static unsigned long wire_flags_to_bio(
                (dpf & DP_DISCARD ? REQ_DISCARD : 0);
  }
  
+ static void fail_postponed_requests(struct drbd_conf *mdev, sector_t sector,
+                                   unsigned int size)
+ {
+       struct drbd_interval *i;
+
+     repeat:
+       drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
+               struct drbd_request *req;
+               struct bio_and_error m;
+
+               if (!i->local)
+                       continue;
+               req = container_of(i, struct drbd_request, i);
+               if (!(req->rq_state & RQ_POSTPONED))
+                       continue;
+               req->rq_state &= ~RQ_POSTPONED;
+               __req_mod(req, NEG_ACKED, &m);
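+               /* complete_master_bio() runs without req_lock held; the
+                * interval tree may change while the lock is dropped, so
+                * rescan from the top once it is re-acquired. */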
+               spin_unlock_irq(&mdev->tconn->req_lock);
+               if (m.bio)
+                       complete_master_bio(mdev, &m);
+               spin_lock_irq(&mdev->tconn->req_lock);
+               goto repeat;
+       }
+ }
+
+ static int handle_write_conflicts(struct drbd_conf *mdev,
+                                 struct drbd_peer_request *peer_req)
+ {
+       struct drbd_tconn *tconn = mdev->tconn;
+       bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &tconn->flags);
+       sector_t sector = peer_req->i.sector;
+       const unsigned int size = peer_req->i.size;
+       struct drbd_interval *i;
+       bool equal;
+       int err;
+
+       /*
+        * Inserting the peer request into the write_requests tree will prevent
+        * new conflicting local requests from being added.
+        */
+       drbd_insert_interval(&mdev->write_requests, &peer_req->i);
+
+     repeat:
+       drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
+               if (i == &peer_req->i)
+                       continue;
+               if (!i->local) {
+                       /*
+                        * Our peer has sent a conflicting remote request; this
+                        * should not happen in a two-node setup.  Wait for the
+                        * earlier peer request to complete.
+                        */
+                       err = drbd_wait_misc(mdev, i);
+                       if (err)
+                               goto out;
+                       goto repeat;
+               }
+               equal = i->sector == sector && i->size == size;
+               if (resolve_conflicts) {
+                       /*
+                        * If the peer request is fully contained within the
+                        * overlapping request, it can be considered overwritten
+                        * and thus superseded; otherwise, it will be retried
+                        * once all overlapping requests have completed.
+                        */
+                       bool superseded = i->sector <= sector && i->sector +
+                                      (i->size >> 9) >= sector + (size >> 9);
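+                       /* e.g. a local write covering sectors [0, 12) fully
+                        * contains a peer write [4, 8); the peer write is
+                        * then superseded */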
+                       if (!equal)
+                               dev_alert(DEV, "Concurrent writes detected: "
+                                              "local=%llus +%u, remote=%llus +%u, "
+                                              "assuming %s came first\n",
+                                         (unsigned long long)i->sector, i->size,
+                                         (unsigned long long)sector, size,
+                                         superseded ? "local" : "remote");
+                       inc_unacked(mdev);
+                       peer_req->w.cb = superseded ? e_send_superseded :
+                                                  e_send_retry_write;
+                       list_add_tail(&peer_req->w.list, &mdev->done_ee);
+                       wake_asender(mdev->tconn);
+                       err = -ENOENT;
+                       goto out;
+               } else {
+                       struct drbd_request *req =
+                               container_of(i, struct drbd_request, i);
+                       if (!equal)
+                               dev_alert(DEV, "Concurrent writes detected: "
+                                              "local=%llus +%u, remote=%llus +%u\n",
+                                         (unsigned long long)i->sector, i->size,
+                                         (unsigned long long)sector, size);
+                       if (req->rq_state & RQ_LOCAL_PENDING ||
+                           !(req->rq_state & RQ_POSTPONED)) {
+                               /*
+                                * Wait for the node with the discard flag to
+                                * decide if this request has been superseded
+                                * or needs to be retried.
+                                * Requests that have been superseded will
+                                * disappear from the write_requests tree.
+                                *
+                                * In addition, wait for the conflicting
+                                * request to finish locally before submitting
+                                * the conflicting peer request.
+                                */
+                               err = drbd_wait_misc(mdev, &req->i);
+                               if (err) {
+                                       _conn_request_state(mdev->tconn,
+                                                           NS(conn, C_TIMEOUT),
+                                                           CS_HARD);
+                                       fail_postponed_requests(mdev, sector, size);
+                                       goto out;
+                               }
+                               goto repeat;
+                       }
+                       /*
+                        * Remember to restart the conflicting requests after
+                        * the new peer request has completed.
+                        */
+                       peer_req->flags |= EE_RESTART_REQUESTS;
+               }
+       }
+       err = 0;
+     out:
+       if (err)
+               drbd_remove_epoch_entry_interval(mdev, peer_req);
+       return err;
+ }
+
  /* mirrored write */
- static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+ static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi)
  {
+       struct drbd_conf *mdev;
        sector_t sector;
-       struct drbd_epoch_entry *e;
-       struct p_data *p = &mdev->data.rbuf.data;
+       struct drbd_peer_request *peer_req;
+       struct p_data *p = pi->data;
+       u32 peer_seq = be32_to_cpu(p->seq_num);
        int rw = WRITE;
        u32 dp_flags;
+       int err, tp;
  
-       if (!get_ldev(mdev)) {
-               spin_lock(&mdev->peer_seq_lock);
-               if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
-                       mdev->peer_seq++;
-               spin_unlock(&mdev->peer_seq_lock);
+       mdev = vnr_to_mdev(tconn, pi->vnr);
+       if (!mdev)
+               return -EIO;
  
-               drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
-               atomic_inc(&mdev->current_epoch->epoch_size);
-               return drbd_drain_block(mdev, data_size);
+       if (!get_ldev(mdev)) {
+               int err2;
+
+               err = wait_for_and_update_peer_seq(mdev, peer_seq);
+               drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size);
+               atomic_inc(&tconn->current_epoch->epoch_size);
+               err2 = drbd_drain_block(mdev, pi->size);
+               if (!err)
+                       err = err2;
+               return err;
        }
  
-       /* get_ldev(mdev) successful.
-        * Corresponding put_ldev done either below (on various errors),
-        * or in drbd_endio_write_sec, if we successfully submit the data at
-        * the end of this function. */
+       /*
+        * Corresponding put_ldev done either below (on various errors), or in
+        * drbd_peer_request_endio, if we successfully submit the data at the
+        * end of this function.
+        */
  
        sector = be64_to_cpu(p->sector);
-       e = read_in_block(mdev, p->block_id, sector, data_size);
-       if (!e) {
+       peer_req = read_in_block(mdev, p->block_id, sector, pi->size);
+       if (!peer_req) {
                put_ldev(mdev);
-               return false;
+               return -EIO;
        }
  
-       e->w.cb = e_end_block;
+       peer_req->w.cb = e_end_block;
  
        dp_flags = be32_to_cpu(p->dp_flags);
        rw |= wire_flags_to_bio(mdev, dp_flags);
-       if (e->pages == NULL) {
-               D_ASSERT(e->size == 0);
+       if (peer_req->pages == NULL) {
+               D_ASSERT(peer_req->i.size == 0);
                D_ASSERT(dp_flags & DP_FLUSH);
        }
  
        if (dp_flags & DP_MAY_SET_IN_SYNC)
-               e->flags |= EE_MAY_SET_IN_SYNC;
-       spin_lock(&mdev->epoch_lock);
-       e->epoch = mdev->current_epoch;
-       atomic_inc(&e->epoch->epoch_size);
-       atomic_inc(&e->epoch->active);
-       spin_unlock(&mdev->epoch_lock);
-       /* I'm the receiver, I do hold a net_cnt reference. */
-       if (!mdev->net_conf->two_primaries) {
-               spin_lock_irq(&mdev->req_lock);
-       } else {
-               /* don't get the req_lock yet,
-                * we may sleep in drbd_wait_peer_seq */
-               const int size = e->size;
-               const int discard = drbd_test_flag(mdev, DISCARD_CONCURRENT);
-               DEFINE_WAIT(wait);
-               struct drbd_request *i;
-               struct hlist_node *n;
-               struct hlist_head *slot;
-               int first;
-               D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
-               BUG_ON(mdev->ee_hash == NULL);
-               BUG_ON(mdev->tl_hash == NULL);
-               /* conflict detection and handling:
-                * 1. wait on the sequence number,
-                *    in case this data packet overtook ACK packets.
-                * 2. check our hash tables for conflicting requests.
-                *    we only need to walk the tl_hash, since an ee can not
-                *    have a conflict with an other ee: on the submitting
-                *    node, the corresponding req had already been conflicting,
-                *    and a conflicting req is never sent.
-                *
-                * Note: for two_primaries, we are protocol C,
-                * so there cannot be any request that is DONE
-                * but still on the transfer log.
-                *
-                * unconditionally add to the ee_hash.
-                *
-                * if no conflicting request is found:
-                *    submit.
-                *
-                * if any conflicting request is found
-                * that has not yet been acked,
-                * AND I have the "discard concurrent writes" flag:
-                *       queue (via done_ee) the P_DISCARD_ACK; OUT.
-                *
-                * if any conflicting request is found:
-                *       block the receiver, waiting on misc_wait
-                *       until no more conflicting requests are there,
-                *       or we get interrupted (disconnect).
-                *
-                *       we do not just write after local io completion of those
-                *       requests, but only after req is done completely, i.e.
-                *       we wait for the P_DISCARD_ACK to arrive!
-                *
-                *       then proceed normally, i.e. submit.
-                */
-               if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num)))
+               peer_req->flags |= EE_MAY_SET_IN_SYNC;
+       spin_lock(&tconn->epoch_lock);
+       peer_req->epoch = tconn->current_epoch;
+       atomic_inc(&peer_req->epoch->epoch_size);
+       atomic_inc(&peer_req->epoch->active);
+       spin_unlock(&tconn->epoch_lock);
+       rcu_read_lock();
+       tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries;
+       rcu_read_unlock();
+       if (tp) {
+               peer_req->flags |= EE_IN_INTERVAL_TREE;
+               err = wait_for_and_update_peer_seq(mdev, peer_seq);
+               if (err)
                        goto out_interrupted;
-               spin_lock_irq(&mdev->req_lock);
-               hlist_add_head(&e->collision, ee_hash_slot(mdev, sector));
- #define OVERLAPS overlaps(i->sector, i->size, sector, size)
-               slot = tl_hash_slot(mdev, sector);
-               first = 1;
-               for (;;) {
-                       int have_unacked = 0;
-                       int have_conflict = 0;
-                       prepare_to_wait(&mdev->misc_wait, &wait,
-                               TASK_INTERRUPTIBLE);
-                       hlist_for_each_entry(i, n, slot, collision) {
-                               if (OVERLAPS) {
-                                       /* only ALERT on first iteration,
-                                        * we may be woken up early... */
-                                       if (first)
-                                               dev_alert(DEV, "%s[%u] Concurrent local write detected!"
-                                                     " new: %llus +%u; pending: %llus +%u\n",
-                                                     current->comm, current->pid,
-                                                     (unsigned long long)sector, size,
-                                                     (unsigned long long)i->sector, i->size);
-                                       if (i->rq_state & RQ_NET_PENDING)
-                                               ++have_unacked;
-                                       ++have_conflict;
-                               }
-                       }
- #undef OVERLAPS
-                       if (!have_conflict)
-                               break;
-                       /* Discard Ack only for the _first_ iteration */
-                       if (first && discard && have_unacked) {
-                               dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n",
-                                    (unsigned long long)sector);
-                               inc_unacked(mdev);
-                               e->w.cb = e_send_discard_ack;
-                               list_add_tail(&e->w.list, &mdev->done_ee);
-                               spin_unlock_irq(&mdev->req_lock);
-                               /* we could probably send that P_DISCARD_ACK ourselves,
-                                * but I don't like the receiver using the msock */
+               spin_lock_irq(&mdev->tconn->req_lock);
+               err = handle_write_conflicts(mdev, peer_req);
+               if (err) {
+                       spin_unlock_irq(&mdev->tconn->req_lock);
+                       if (err == -ENOENT) {
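+                               /* the conflicting write was superseded or will
+                                * be retried; handle_write_conflicts() already
+                                * queued the corresponding ack */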
                                put_ldev(mdev);
-                               wake_asender(mdev);
-                               finish_wait(&mdev->misc_wait, &wait);
-                               return true;
+                               return 0;
                        }
+                       goto out_interrupted;
+               }
+       } else
+               spin_lock_irq(&mdev->tconn->req_lock);
+       list_add(&peer_req->w.list, &mdev->active_ee);
+       spin_unlock_irq(&mdev->tconn->req_lock);
  
-                       if (signal_pending(current)) {
-                               hlist_del_init(&e->collision);
-                               spin_unlock_irq(&mdev->req_lock);
-                               finish_wait(&mdev->misc_wait, &wait);
-                               goto out_interrupted;
-                       }
+       if (mdev->state.conn == C_SYNC_TARGET)
+               wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, peer_req));
  
-                       spin_unlock_irq(&mdev->req_lock);
-                       if (first) {
-                               first = 0;
-                               dev_alert(DEV, "Concurrent write! [W AFTERWARDS] "
-                                    "sec=%llus\n", (unsigned long long)sector);
-                       } else if (discard) {
-                               /* we had none on the first iteration.
-                                * there must be none now. */
-                               D_ASSERT(have_unacked == 0);
-                       }
-                       schedule();
-                       spin_lock_irq(&mdev->req_lock);
+       if (mdev->tconn->agreed_pro_version < 100) {
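+               /*
+                * dp_flags from peers older than protocol 100 do not carry
+                * the ack mode, so derive it from the configured wire
+                * protocol: C expects a write ack, B a receive ack, A none.
+                */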
+               rcu_read_lock();
+               switch (rcu_dereference(mdev->tconn->net_conf)->wire_protocol) {
+               case DRBD_PROT_C:
+                       dp_flags |= DP_SEND_WRITE_ACK;
+                       break;
+               case DRBD_PROT_B:
+                       dp_flags |= DP_SEND_RECEIVE_ACK;
+                       break;
                }
-               finish_wait(&mdev->misc_wait, &wait);
+               rcu_read_unlock();
        }
  
-       list_add(&e->w.list, &mdev->active_ee);
-       spin_unlock_irq(&mdev->req_lock);
-       if (mdev->state.conn == C_SYNC_TARGET)
-               wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e));
-       switch (mdev->net_conf->wire_protocol) {
-       case DRBD_PROT_C:
+       if (dp_flags & DP_SEND_WRITE_ACK) {
+               peer_req->flags |= EE_SEND_WRITE_ACK;
                inc_unacked(mdev);
                /* corresponding dec_unacked() in e_end_block()
                 * respective _drbd_clear_done_ee */
-               break;
-       case DRBD_PROT_B:
+       }
+       if (dp_flags & DP_SEND_RECEIVE_ACK) {
                /* I really don't like it that the receiver thread
                 * sends on the msock, but anyway */
-               drbd_send_ack(mdev, P_RECV_ACK, e);
-               break;
-       case DRBD_PROT_A:
-               /* nothing to do */
-               break;
+               drbd_send_ack(mdev, P_RECV_ACK, peer_req);
        }
  
        if (mdev->state.pdsk < D_INCONSISTENT) {
                /* In case we have the only disk of the cluster, mark the
                 * written blocks out of sync so the peer resyncs them once
                 * its disk becomes available again. */
-               drbd_set_out_of_sync(mdev, e->sector, e->size);
-               e->flags |= EE_CALL_AL_COMPLETE_IO;
-               e->flags &= ~EE_MAY_SET_IN_SYNC;
-               drbd_al_begin_io(mdev, e->sector);
+               drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size);
+               peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
+               peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
+               drbd_al_begin_io(mdev, &peer_req->i);
        }
  
-       if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
-               return true;
+       err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR);
+       if (!err)
+               return 0;
  
        /* don't care for the reason here */
        dev_err(DEV, "submit failed, triggering re-connect\n");
-       spin_lock_irq(&mdev->req_lock);
-       list_del(&e->w.list);
-       hlist_del_init(&e->collision);
-       spin_unlock_irq(&mdev->req_lock);
-       if (e->flags & EE_CALL_AL_COMPLETE_IO)
-               drbd_al_complete_io(mdev, e->sector);
+       spin_lock_irq(&mdev->tconn->req_lock);
+       list_del(&peer_req->w.list);
+       drbd_remove_epoch_entry_interval(mdev, peer_req);
+       spin_unlock_irq(&mdev->tconn->req_lock);
+       if (peer_req->flags & EE_CALL_AL_COMPLETE_IO)
+               drbd_al_complete_io(mdev, &peer_req->i);
  
  out_interrupted:
-       drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + EV_CLEANUP);
+       drbd_may_finish_epoch(tconn, peer_req->epoch, EV_PUT + EV_CLEANUP);
        put_ldev(mdev);
-       drbd_free_ee(mdev, e);
-       return false;
+       drbd_free_peer_req(mdev, peer_req);
+       return err;
  }
  
  /* We may throttle resync, if the lower device seems to be busy,
@@@ -1922,9 -2296,14 +2296,14 @@@ int drbd_rs_should_slow_down(struct drb
        struct lc_element *tmp;
        int curr_events;
        int throttle = 0;
+       unsigned int c_min_rate;
+
+       rcu_read_lock();
+       c_min_rate = rcu_dereference(mdev->ldev->disk_conf)->c_min_rate;
+       rcu_read_unlock();
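+       /* c_min_rate is a floor: resync is only throttled while it still
+        * progresses faster than this rate */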
  
        /* feature disabled? */
-       if (mdev->sync_conf.c_min_rate == 0)
+       if (c_min_rate == 0)
                return 0;
  
        spin_lock_irq(&mdev->al_lock);
                db = mdev->rs_mark_left[i] - rs_left;
                dbdt = Bit2KB(db/dt);
  
-               if (dbdt > mdev->sync_conf.c_min_rate)
+               if (dbdt > c_min_rate)
                        throttle = 1;
        }
        return throttle;
  }
  
  
- static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size)
+ static int receive_DataRequest(struct drbd_tconn *tconn, struct packet_info *pi)
  {
+       struct drbd_conf *mdev;
        sector_t sector;
-       const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
-       struct drbd_epoch_entry *e;
+       sector_t capacity;
+       struct drbd_peer_request *peer_req;
        struct digest_info *di = NULL;
        int size, verb;
        unsigned int fault_type;
-       struct p_block_req *p = &mdev->data.rbuf.block_req;
+       struct p_block_req *p = pi->data;
+
+       mdev = vnr_to_mdev(tconn, pi->vnr);
+       if (!mdev)
+               return -EIO;
+       capacity = drbd_get_capacity(mdev->this_bdev);
  
        sector = be64_to_cpu(p->sector);
        size   = be32_to_cpu(p->blksize);
  
-       if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
+       if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
                dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
                                (unsigned long long)sector, size);
-               return false;
+               return -EINVAL;
        }
        if (sector + (size>>9) > capacity) {
                dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
                                (unsigned long long)sector, size);
-               return false;
+               return -EINVAL;
        }
  
        if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
                verb = 1;
-               switch (cmd) {
+               switch (pi->cmd) {
                case P_DATA_REQUEST:
                        drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
                        break;
                        drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
                        break;
                default:
-                       dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
-                               cmdname(cmd));
+                       BUG();
                }
                if (verb && __ratelimit(&drbd_ratelimit_state))
                        dev_err(DEV, "Can not satisfy peer's read request, "
                            "no local data.\n");
  
                /* drain the possibly remaining payload */
-               return drbd_drain_block(mdev, digest_size);
+               return drbd_drain_block(mdev, pi->size);
        }
  
        /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
         * "criss-cross" setup, that might cause write-out on some other DRBD,
         * which in turn might block on the other node at this very place.  */
-       e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
-       if (!e) {
+       peer_req = drbd_alloc_peer_req(mdev, p->block_id, sector, size, GFP_NOIO);
+       if (!peer_req) {
                put_ldev(mdev);
-               return false;
+               return -ENOMEM;
        }
  
-       switch (cmd) {
+       switch (pi->cmd) {
        case P_DATA_REQUEST:
-               e->w.cb = w_e_end_data_req;
+               peer_req->w.cb = w_e_end_data_req;
                fault_type = DRBD_FAULT_DT_RD;
                /* application IO, don't drbd_rs_begin_io */
                goto submit;
  
        case P_RS_DATA_REQUEST:
-               e->w.cb = w_e_end_rsdata_req;
+               peer_req->w.cb = w_e_end_rsdata_req;
                fault_type = DRBD_FAULT_RS_RD;
                /* used in the sector offset progress display */
                mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
        case P_OV_REPLY:
        case P_CSUM_RS_REQUEST:
                fault_type = DRBD_FAULT_RS_RD;
-               di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
+               di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
                if (!di)
                        goto out_free_e;
  
-               di->digest_size = digest_size;
+               di->digest_size = pi->size;
                di->digest = (((char *)di)+sizeof(struct digest_info));
  
-               e->digest = di;
-               e->flags |= EE_HAS_DIGEST;
+               peer_req->digest = di;
+               peer_req->flags |= EE_HAS_DIGEST;
  
-               if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
+               if (drbd_recv_all(mdev->tconn, di->digest, pi->size))
                        goto out_free_e;
  
-               if (cmd == P_CSUM_RS_REQUEST) {
-                       D_ASSERT(mdev->agreed_pro_version >= 89);
-                       e->w.cb = w_e_end_csum_rs_req;
+               if (pi->cmd == P_CSUM_RS_REQUEST) {
+                       D_ASSERT(mdev->tconn->agreed_pro_version >= 89);
+                       peer_req->w.cb = w_e_end_csum_rs_req;
                        /* used in the sector offset progress display */
                        mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
-               } else if (cmd == P_OV_REPLY) {
+               } else if (pi->cmd == P_OV_REPLY) {
                        /* track progress, we may need to throttle */
                        atomic_add(size >> 9, &mdev->rs_sect_in);
-                       e->w.cb = w_e_end_ov_reply;
+                       peer_req->w.cb = w_e_end_ov_reply;
                        dec_rs_pending(mdev);
                        /* drbd_rs_begin_io done when we sent this request,
                         * but accounting still needs to be done. */
  
        case P_OV_REQUEST:
                if (mdev->ov_start_sector == ~(sector_t)0 &&
-                   mdev->agreed_pro_version >= 90) {
+                   mdev->tconn->agreed_pro_version >= 90) {
                        unsigned long now = jiffies;
                        int i;
                        mdev->ov_start_sector = sector;
                        dev_info(DEV, "Online Verify start sector: %llu\n",
                                        (unsigned long long)sector);
                }
-               e->w.cb = w_e_end_ov_req;
+               peer_req->w.cb = w_e_end_ov_req;
                fault_type = DRBD_FAULT_RS_RD;
                break;
  
        default:
-               dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
-                   cmdname(cmd));
-               fault_type = DRBD_FAULT_MAX;
-               goto out_free_e;
+               BUG();
        }
  
        /* Throttle, drbd_rs_begin_io and submit should become asynchronous
@@@ -2137,30 -2518,31 +2518,31 @@@ submit_for_resync
  
  submit:
        inc_unacked(mdev);
-       spin_lock_irq(&mdev->req_lock);
-       list_add_tail(&e->w.list, &mdev->read_ee);
-       spin_unlock_irq(&mdev->req_lock);
+       spin_lock_irq(&mdev->tconn->req_lock);
+       list_add_tail(&peer_req->w.list, &mdev->read_ee);
+       spin_unlock_irq(&mdev->tconn->req_lock);
  
-       if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
-               return true;
+       if (drbd_submit_peer_request(mdev, peer_req, READ, fault_type) == 0)
+               return 0;
  
        /* don't care for the reason here */
        dev_err(DEV, "submit failed, triggering re-connect\n");
-       spin_lock_irq(&mdev->req_lock);
-       list_del(&e->w.list);
-       spin_unlock_irq(&mdev->req_lock);
+       spin_lock_irq(&mdev->tconn->req_lock);
+       list_del(&peer_req->w.list);
+       spin_unlock_irq(&mdev->tconn->req_lock);
        /* no drbd_rs_complete_io(), we are dropping the connection anyways */
  
  out_free_e:
        put_ldev(mdev);
-       drbd_free_ee(mdev, e);
-       return false;
+       drbd_free_peer_req(mdev, peer_req);
+       return -EIO;
  }
  
  static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
  {
        int self, peer, rv = -100;
        unsigned long ch_self, ch_peer;
+       enum drbd_after_sb_p after_sb_0p;
  
        self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
        peer = mdev->p_uuid[UI_BITMAP] & 1;
        ch_peer = mdev->p_uuid[UI_SIZE];
        ch_self = mdev->comm_bm_set;
  
-       switch (mdev->net_conf->after_sb_0p) {
+       rcu_read_lock();
+       after_sb_0p = rcu_dereference(mdev->tconn->net_conf)->after_sb_0p;
+       rcu_read_unlock();
+       switch (after_sb_0p) {
        case ASB_CONSENSUS:
        case ASB_DISCARD_SECONDARY:
        case ASB_CALL_HELPER:
+       case ASB_VIOLENTLY:
                dev_err(DEV, "Configuration error.\n");
                break;
        case ASB_DISCONNECT:
                     "Using discard-least-changes instead\n");
        case ASB_DISCARD_ZERO_CHG:
                if (ch_peer == 0 && ch_self == 0) {
-                       rv = drbd_test_flag(mdev, DISCARD_CONCURRENT)
+                       rv = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags)
                                ? -1 : 1;
                        break;
                } else {
                        if (ch_peer == 0) { rv =  1; break; }
                        if (ch_self == 0) { rv = -1; break; }
                }
-               if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG)
+               if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
                        break;
        case ASB_DISCARD_LEAST_CHG:
                if      (ch_self < ch_peer)
                        rv =  1;
                else /* ( ch_self == ch_peer ) */
                     /* Well, then use something else. */
-                       rv = drbd_test_flag(mdev, DISCARD_CONCURRENT)
+                       rv = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags)
                                ? -1 : 1;
                break;
        case ASB_DISCARD_LOCAL:
  static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
  {
        int hg, rv = -100;
+       enum drbd_after_sb_p after_sb_1p;
  
-       switch (mdev->net_conf->after_sb_1p) {
+       rcu_read_lock();
+       after_sb_1p = rcu_dereference(mdev->tconn->net_conf)->after_sb_1p;
+       rcu_read_unlock();
+       switch (after_sb_1p) {
        case ASB_DISCARD_YOUNGER_PRI:
        case ASB_DISCARD_OLDER_PRI:
        case ASB_DISCARD_LEAST_CHG:
        case ASB_DISCARD_LOCAL:
        case ASB_DISCARD_REMOTE:
+       case ASB_DISCARD_ZERO_CHG:
                dev_err(DEV, "Configuration error.\n");
                break;
        case ASB_DISCONNECT:
  static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
  {
        int hg, rv = -100;
+       enum drbd_after_sb_p after_sb_2p;
  
-       switch (mdev->net_conf->after_sb_2p) {
+       rcu_read_lock();
+       after_sb_2p = rcu_dereference(mdev->tconn->net_conf)->after_sb_2p;
+       rcu_read_unlock();
+       switch (after_sb_2p) {
        case ASB_DISCARD_YOUNGER_PRI:
        case ASB_DISCARD_OLDER_PRI:
        case ASB_DISCARD_LEAST_CHG:
        case ASB_DISCARD_REMOTE:
        case ASB_CONSENSUS:
        case ASB_DISCARD_SECONDARY:
+       case ASB_DISCARD_ZERO_CHG:
                dev_err(DEV, "Configuration error.\n");
                break;
        case ASB_VIOLENTLY:
@@@ -2375,7 -2771,7 +2771,7 @@@ static int drbd_uuid_compare(struct drb
  
                if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
  
-                       if (mdev->agreed_pro_version < 91)
+                       if (mdev->tconn->agreed_pro_version < 91)
                                return -1091;
  
                        if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
  
                if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
  
-                       if (mdev->agreed_pro_version < 91)
+                       if (mdev->tconn->agreed_pro_version < 91)
                                return -1091;
  
                        if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
                }
  
                /* Common power [off|failure] */
-               rct = (drbd_test_flag(mdev, CRASHED_PRIMARY) ? 1 : 0) +
+               rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
                        (mdev->p_uuid[UI_FLAGS] & 2);
                /* lowest bit is set when we were primary,
                 * next bit (weight 2) is set when peer was primary */
                case 1: /*  self_pri && !peer_pri */ return 1;
                case 2: /* !self_pri &&  peer_pri */ return -1;
                case 3: /*  self_pri &&  peer_pri */
-                       dc = drbd_test_flag(mdev, DISCARD_CONCURRENT);
+                       dc = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags);
                        return dc ? -1 : 1;
                }
        }
        *rule_nr = 51;
        peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
        if (self == peer) {
-               if (mdev->agreed_pro_version < 96 ?
+               if (mdev->tconn->agreed_pro_version < 96 ?
                    (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
                    (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
                    peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) {
                        /* The last P_SYNC_UUID did not get through. Undo the
                           modifications the peer made to its UUIDs at its last
                           start of resync as sync source. */
  
-                       if (mdev->agreed_pro_version < 91)
+                       if (mdev->tconn->agreed_pro_version < 91)
                                return -1091;
  
                        mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
        *rule_nr = 71;
        self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
        if (self == peer) {
-               if (mdev->agreed_pro_version < 96 ?
+               if (mdev->tconn->agreed_pro_version < 96 ?
                    (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
                    (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
                    self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
                        /* The last P_SYNC_UUID did not get through. Undo the
                           modifications we made to our UUIDs at our last start
                           of resync as sync source. */
  
-                       if (mdev->agreed_pro_version < 91)
+                       if (mdev->tconn->agreed_pro_version < 91)
                                return -1091;
  
                        __drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
  static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
                                           enum drbd_disk_state peer_disk) __must_hold(local)
  {
-       int hg, rule_nr;
        enum drbd_conns rv = C_MASK;
        enum drbd_disk_state mydisk;
+       struct net_conf *nc;
+       int hg, rule_nr, rr_conflict, tentative;
  
        mydisk = mdev->state.disk;
        if (mydisk == D_NEGOTIATING)
        if (abs(hg) == 100)
                drbd_khelper(mdev, "initial-split-brain");
  
-       if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
+       rcu_read_lock();
+       nc = rcu_dereference(mdev->tconn->net_conf);
+       if (hg == 100 || (hg == -100 && nc->always_asbp)) {
                int pcount = (mdev->state.role == R_PRIMARY)
                           + (peer_role == R_PRIMARY);
                int forced = (hg == -100);
        }
  
        if (hg == -100) {
-               if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1))
+               if (test_bit(DISCARD_MY_DATA, &mdev->flags) && !(mdev->p_uuid[UI_FLAGS]&1))
                        hg = -1;
-               if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1))
+               if (!test_bit(DISCARD_MY_DATA, &mdev->flags) && (mdev->p_uuid[UI_FLAGS]&1))
                        hg = 1;
  
                if (abs(hg) < 100)
                             "Sync from %s node\n",
                             (hg < 0) ? "peer" : "this");
        }
+       rr_conflict = nc->rr_conflict;
+       tentative = nc->tentative;
+       rcu_read_unlock();
  
        if (hg == -100) {
                /* FIXME this log message is not correct if we end up here
  
        if (hg < 0 && /* by intention we do not use mydisk here. */
            mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
-               switch (mdev->net_conf->rr_conflict) {
+               switch (rr_conflict) {
                case ASB_CALL_HELPER:
                        drbd_khelper(mdev, "pri-lost");
                        /* fall through */
                }
        }
  
-       if (mdev->net_conf->dry_run || drbd_test_flag(mdev, CONN_DRY_RUN)) {
+       if (tentative || test_bit(CONN_DRY_RUN, &mdev->tconn->flags)) {
                if (hg == 0)
                        dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n");
                else
        return rv;
  }
  
- /* returns 1 if invalid */
- static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
+ static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
  {
        /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
-       if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
-           (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
-               return 0;
+       if (peer == ASB_DISCARD_REMOTE)
+               return ASB_DISCARD_LOCAL;
  
        /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
-       if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
-           self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
-               return 1;
+       if (peer == ASB_DISCARD_LOCAL)
+               return ASB_DISCARD_REMOTE;
  
        /* everything else is valid if they are equal on both sides. */
-       if (peer == self)
-               return 0;
-       /* everything es is invalid. */
-       return 1;
+       return peer;
  }
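+
+ /*
+  * Example: a peer configured with "discard-remote" matches our
+  * "discard-local", because both describe the same policy seen from
+  * opposite ends; receive_protocol() compares the converted peer value
+  * against our own setting.
+  */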
  
- static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+ static int receive_protocol(struct drbd_tconn *tconn, struct packet_info *pi)
  {
-       struct p_protocol *p = &mdev->data.rbuf.protocol;
-       int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
-       int p_want_lose, p_two_primaries, cf;
-       char p_integrity_alg[SHARED_SECRET_MAX] = "";
+       struct p_protocol *p = pi->data;
+       enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
+       int p_proto, p_discard_my_data, p_two_primaries, cf;
+       struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
+       char integrity_alg[SHARED_SECRET_MAX] = "";
+       struct crypto_hash *peer_integrity_tfm = NULL;
+       void *int_dig_in = NULL, *int_dig_vv = NULL;
  
        p_proto         = be32_to_cpu(p->protocol);
        p_after_sb_0p   = be32_to_cpu(p->after_sb_0p);
        p_after_sb_2p   = be32_to_cpu(p->after_sb_2p);
        p_two_primaries = be32_to_cpu(p->two_primaries);
        cf              = be32_to_cpu(p->conn_flags);
-       p_want_lose = cf & CF_WANT_LOSE;
+       p_discard_my_data = cf & CF_DISCARD_MY_DATA;
  
-       drbd_clear_flag(mdev, CONN_DRY_RUN);
+       if (tconn->agreed_pro_version >= 87) {
+               int err;
  
-       if (cf & CF_DRY_RUN)
-               drbd_set_flag(mdev, CONN_DRY_RUN);
-       if (p_proto != mdev->net_conf->wire_protocol) {
-               dev_err(DEV, "incompatible communication protocols\n");
-               goto disconnect;
+               if (pi->size > sizeof(integrity_alg))
+                       return -EIO;
+               err = drbd_recv_all(tconn, integrity_alg, pi->size);
+               if (err)
+                       return err;
+               integrity_alg[SHARED_SECRET_MAX - 1] = 0;
        }
  
-       if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) {
-               dev_err(DEV, "incompatible after-sb-0pri settings\n");
-               goto disconnect;
-       }
+       if (pi->cmd != P_PROTOCOL_UPDATE) {
+               clear_bit(CONN_DRY_RUN, &tconn->flags);
  
-       if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) {
-               dev_err(DEV, "incompatible after-sb-1pri settings\n");
-               goto disconnect;
-       }
+               if (cf & CF_DRY_RUN)
+                       set_bit(CONN_DRY_RUN, &tconn->flags);
  
-       if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) {
-               dev_err(DEV, "incompatible after-sb-2pri settings\n");
-               goto disconnect;
-       }
+               rcu_read_lock();
+               nc = rcu_dereference(tconn->net_conf);
  
-       if (p_want_lose && mdev->net_conf->want_lose) {
-               dev_err(DEV, "both sides have the 'want_lose' flag set\n");
-               goto disconnect;
-       }
+               if (p_proto != nc->wire_protocol) {
+                       conn_err(tconn, "incompatible %s settings\n", "protocol");
+                       goto disconnect_rcu_unlock;
+               }
  
-       if (p_two_primaries != mdev->net_conf->two_primaries) {
-               dev_err(DEV, "incompatible setting of the two-primaries options\n");
-               goto disconnect;
+               if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
+                       conn_err(tconn, "incompatible %s settings\n", "after-sb-0pri");
+                       goto disconnect_rcu_unlock;
+               }
+               if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
+                       conn_err(tconn, "incompatible %s settings\n", "after-sb-1pri");
+                       goto disconnect_rcu_unlock;
+               }
+               if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
+                       conn_err(tconn, "incompatible %s settings\n", "after-sb-2pri");
+                       goto disconnect_rcu_unlock;
+               }
+               if (p_discard_my_data && nc->discard_my_data) {
+                       conn_err(tconn, "incompatible %s settings\n", "discard-my-data");
+                       goto disconnect_rcu_unlock;
+               }
+               if (p_two_primaries != nc->two_primaries) {
+                       conn_err(tconn, "incompatible %s settings\n", "allow-two-primaries");
+                       goto disconnect_rcu_unlock;
+               }
+               if (strcmp(integrity_alg, nc->integrity_alg)) {
+                       conn_err(tconn, "incompatible %s settings\n", "data-integrity-alg");
+                       goto disconnect_rcu_unlock;
+               }
+               rcu_read_unlock();
        }
  
-       if (mdev->agreed_pro_version >= 87) {
-               unsigned char *my_alg = mdev->net_conf->integrity_alg;
+       if (integrity_alg[0]) {
+               int hash_size;
+
+               /*
+                * We can only change the peer data integrity algorithm
+                * here.  Changing our own data integrity algorithm
+                * requires that we send a P_PROTOCOL_UPDATE packet at
+                * the same time; otherwise, the peer cannot tell at
+                * which packet boundary the algorithm is supposed to
+                * change.
+                */
  
-               if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
-                       return false;
+               peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
+               if (!peer_integrity_tfm) {
+                       conn_err(tconn, "peer data-integrity-alg %s not supported\n",
+                                integrity_alg);
+                       goto disconnect;
+               }
  
-               p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
-               if (strcmp(p_integrity_alg, my_alg)) {
-                       dev_err(DEV, "incompatible setting of the data-integrity-alg\n");
+               hash_size = crypto_hash_digestsize(peer_integrity_tfm);
+               int_dig_in = kmalloc(hash_size, GFP_KERNEL);
+               int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
+               if (!(int_dig_in && int_dig_vv)) {
+                       conn_err(tconn, "Allocation of buffers for data integrity checking failed\n");
                        goto disconnect;
                }
-               dev_info(DEV, "data-integrity-alg: %s\n",
-                    my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
        }
  
-       return true;
+       new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
+       if (!new_net_conf) {
+               conn_err(tconn, "Allocation of new net_conf failed\n");
+               goto disconnect;
+       }
+
+       mutex_lock(&tconn->data.mutex);
+       mutex_lock(&tconn->conf_update);
+       old_net_conf = tconn->net_conf;
+       *new_net_conf = *old_net_conf;
+       new_net_conf->wire_protocol = p_proto;
+       new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
+       new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
+       new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
+       new_net_conf->two_primaries = p_two_primaries;
+       rcu_assign_pointer(tconn->net_conf, new_net_conf);
+       mutex_unlock(&tconn->conf_update);
+       mutex_unlock(&tconn->data.mutex);
+
+       crypto_free_hash(tconn->peer_integrity_tfm);
+       kfree(tconn->int_dig_in);
+       kfree(tconn->int_dig_vv);
+       tconn->peer_integrity_tfm = peer_integrity_tfm;
+       tconn->int_dig_in = int_dig_in;
+       tconn->int_dig_vv = int_dig_vv;
+
+       if (strcmp(old_net_conf->integrity_alg, integrity_alg))
+               conn_info(tconn, "peer data-integrity-alg: %s\n",
+                         integrity_alg[0] ? integrity_alg : "(none)");
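+
+       /* readers under rcu_read_lock() may still hold old_net_conf;
+        * wait for them before freeing it below */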
+       synchronize_rcu();
+       kfree(old_net_conf);
+       return 0;
  
+ disconnect_rcu_unlock:
+       rcu_read_unlock();
  disconnect:
-       drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-       return false;
+       crypto_free_hash(peer_integrity_tfm);
+       kfree(int_dig_in);
+       kfree(int_dig_vv);
+       conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+       return -EIO;
  }
  
  /* helper function
@@@ -2792,24 -3266,64 +3266,64 @@@ struct crypto_hash *drbd_crypto_alloc_d
                        alg, name, PTR_ERR(tfm));
                return tfm;
        }
-       if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
-               crypto_free_hash(tfm);
-               dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name);
-               return ERR_PTR(-EINVAL);
-       }
        return tfm;
  }
  
- static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
+ static int ignore_remaining_packet(struct drbd_tconn *tconn, struct packet_info *pi)
+ {
+       void *buffer = tconn->data.rbuf;
+       int size = pi->size;
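+
+       /* drain the remaining payload in buffer-sized chunks */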
+       while (size) {
+               int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
+               s = drbd_recv(tconn, buffer, s);
+               if (s <= 0) {
+                       if (s < 0)
+                               return s;
+                       break;
+               }
+               size -= s;
+       }
+       if (size)
+               return -EIO;
+       return 0;
+ }
+
+ /*
+  * config_unknown_volume  -  device configuration command for unknown volume
+  *
+  * When a device is added to an existing connection, the node on which the
+  * device is added first will send configuration commands to its peer but the
+  * peer will not know about the device yet.  It will warn and ignore these
+  * commands.  Once the device is added on the second node, the second node will
+  * send the same device configuration commands, but in the other direction.
+  *
+  * (We can also end up here if drbd is misconfigured.)
+  */
+ static int config_unknown_volume(struct drbd_tconn *tconn, struct packet_info *pi)
+ {
+       conn_warn(tconn, "%s packet received for volume %u, which is not configured locally\n",
+                 cmdname(pi->cmd), pi->vnr);
+       return ignore_remaining_packet(tconn, pi);
+ }
+
+ static int receive_SyncParam(struct drbd_tconn *tconn, struct packet_info *pi)
  {
-       int ok = true;
-       struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
+       struct drbd_conf *mdev;
+       struct p_rs_param_95 *p;
        unsigned int header_size, data_size, exp_max_sz;
        struct crypto_hash *verify_tfm = NULL;
        struct crypto_hash *csums_tfm = NULL;
-       const int apv = mdev->agreed_pro_version;
-       int *rs_plan_s = NULL;
+       struct net_conf *old_net_conf, *new_net_conf = NULL;
+       struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
+       const int apv = tconn->agreed_pro_version;
+       struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
        int fifo_size = 0;
+       int err;
+
+       mdev = vnr_to_mdev(tconn, pi->vnr);
+       if (!mdev)
+               return config_unknown_volume(tconn, pi);
  
        exp_max_sz  = apv <= 87 ? sizeof(struct p_rs_param)
                    : apv == 88 ? sizeof(struct p_rs_param)
                    : apv <= 94 ? sizeof(struct p_rs_param_89)
                    : /* apv >= 95 */ sizeof(struct p_rs_param_95);
  
-       if (packet_size > exp_max_sz) {
+       if (pi->size > exp_max_sz) {
                dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
-                   packet_size, exp_max_sz);
-               return false;
+                   pi->size, exp_max_sz);
+               return -EIO;
        }
  
        if (apv <= 88) {
-               header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80);
-               data_size   = packet_size  - header_size;
+               header_size = sizeof(struct p_rs_param);
+               data_size = pi->size - header_size;
        } else if (apv <= 94) {
-               header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80);
-               data_size   = packet_size  - header_size;
+               header_size = sizeof(struct p_rs_param_89);
+               data_size = pi->size - header_size;
                D_ASSERT(data_size == 0);
        } else {
-               header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80);
-               data_size   = packet_size  - header_size;
+               header_size = sizeof(struct p_rs_param_95);
+               data_size = pi->size - header_size;
                D_ASSERT(data_size == 0);
        }
  
        /* initialize verify_alg and csums_alg */
+       p = pi->data;
        memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
  
-       if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
-               return false;
+       err = drbd_recv_all(mdev->tconn, p, header_size);
+       if (err)
+               return err;
+
+       mutex_lock(&mdev->tconn->conf_update);
+       old_net_conf = mdev->tconn->net_conf;
+       if (get_ldev(mdev)) {
+               new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+               if (!new_disk_conf) {
+                       put_ldev(mdev);
+                       mutex_unlock(&mdev->tconn->conf_update);
+                       dev_err(DEV, "Allocation of new disk_conf failed\n");
+                       return -ENOMEM;
+               }
+               old_disk_conf = mdev->ldev->disk_conf;
+               *new_disk_conf = *old_disk_conf;
  
-       mdev->sync_conf.rate      = be32_to_cpu(p->rate);
+               new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
+       }
  
        if (apv >= 88) {
                if (apv == 88) {
                                dev_err(DEV, "verify-alg of wrong size, "
                                        "peer wants %u, accepting only up to %u byte\n",
                                        data_size, SHARED_SECRET_MAX);
-                               return false;
+                               err = -EIO;
+                               goto reconnect;
                        }
  
-                       if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
-                               return false;
+                       err = drbd_recv_all(mdev->tconn, p->verify_alg, data_size);
+                       if (err)
+                               goto reconnect;
                        /* we expect NUL terminated string */
                        /* but just in case someone tries to be evil */
                        D_ASSERT(p->verify_alg[data_size-1] == 0);
                        p->csums_alg[SHARED_SECRET_MAX-1] = 0;
                }
  
-               if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) {
+               if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
                        if (mdev->state.conn == C_WF_REPORT_PARAMS) {
                                dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
-                                   mdev->sync_conf.verify_alg, p->verify_alg);
+                                   old_net_conf->verify_alg, p->verify_alg);
                                goto disconnect;
                        }
                        verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
                        }
                }
  
-               if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) {
+               if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
                        if (mdev->state.conn == C_WF_REPORT_PARAMS) {
                                dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
-                                   mdev->sync_conf.csums_alg, p->csums_alg);
+                                   old_net_conf->csums_alg, p->csums_alg);
                                goto disconnect;
                        }
                        csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
                        }
                }
  
-               if (apv > 94) {
-                       mdev->sync_conf.rate      = be32_to_cpu(p->rate);
-                       mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
-                       mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
-                       mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
-                       mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
-                       fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
-                       if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
-                               rs_plan_s   = kzalloc(sizeof(int) * fifo_size, GFP_NOIO);
-                               if (!rs_plan_s) {
+               if (apv > 94 && new_disk_conf) {
+                       new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
+                       new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
+                       new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
+                       new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
+                       fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+                       if (fifo_size != mdev->rs_plan_s->size) {
+                               new_plan = fifo_alloc(fifo_size);
+                               if (!new_plan) {
                                        dev_err(DEV, "kmalloc of fifo_buffer failed");
+                                       put_ldev(mdev);
                                        goto disconnect;
                                }
                        }
                }
  
-               spin_lock(&mdev->peer_seq_lock);
-               /* lock against drbd_nl_syncer_conf() */
-               if (verify_tfm) {
-                       strcpy(mdev->sync_conf.verify_alg, p->verify_alg);
-                       mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1;
-                       crypto_free_hash(mdev->verify_tfm);
-                       mdev->verify_tfm = verify_tfm;
-                       dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
-               }
-               if (csums_tfm) {
-                       strcpy(mdev->sync_conf.csums_alg, p->csums_alg);
-                       mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1;
-                       crypto_free_hash(mdev->csums_tfm);
-                       mdev->csums_tfm = csums_tfm;
-                       dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
-               }
-               if (fifo_size != mdev->rs_plan_s.size) {
-                       kfree(mdev->rs_plan_s.values);
-                       mdev->rs_plan_s.values = rs_plan_s;
-                       mdev->rs_plan_s.size   = fifo_size;
-                       mdev->rs_planed = 0;
+               if (verify_tfm || csums_tfm) {
+                       new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
+                       if (!new_net_conf) {
+                               dev_err(DEV, "Allocation of new net_conf failed\n");
+                               goto disconnect;
+                       }
+                       *new_net_conf = *old_net_conf;
+                       if (verify_tfm) {
+                               strcpy(new_net_conf->verify_alg, p->verify_alg);
+                               new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
+                               crypto_free_hash(mdev->tconn->verify_tfm);
+                               mdev->tconn->verify_tfm = verify_tfm;
+                               dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
+                       }
+                       if (csums_tfm) {
+                               strcpy(new_net_conf->csums_alg, p->csums_alg);
+                               new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
+                               crypto_free_hash(mdev->tconn->csums_tfm);
+                               mdev->tconn->csums_tfm = csums_tfm;
+                               dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
+                       }
+                       rcu_assign_pointer(tconn->net_conf, new_net_conf);
                }
-               spin_unlock(&mdev->peer_seq_lock);
        }
  
-       return ok;
+       if (new_disk_conf) {
+               rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
+               put_ldev(mdev);
+       }
+       if (new_plan) {
+               old_plan = mdev->rs_plan_s;
+               rcu_assign_pointer(mdev->rs_plan_s, new_plan);
+       }
+       mutex_unlock(&mdev->tconn->conf_update);
+       synchronize_rcu();
+       if (new_net_conf)
+               kfree(old_net_conf);
+       kfree(old_disk_conf);
+       kfree(old_plan);
+       return 0;
+
+ reconnect:
+       if (new_disk_conf) {
+               put_ldev(mdev);
+               kfree(new_disk_conf);
+       }
+       mutex_unlock(&mdev->tconn->conf_update);
+       return -EIO;
  disconnect:
+       kfree(new_plan);
+       if (new_disk_conf) {
+               put_ldev(mdev);
+               kfree(new_disk_conf);
+       }
+       mutex_unlock(&mdev->tconn->conf_update);
        /* just for completeness: actually not needed,
         * as this is not reached if csums_tfm was ok. */
        crypto_free_hash(csums_tfm);
        /* but free the verify_tfm again, if csums_tfm did not work out */
        crypto_free_hash(verify_tfm);
-       drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-       return false;
+       conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+       return -EIO;
  }
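
The receive_SyncParam() rewrite above follows the kernel's standard RCU publish/reclaim discipline: copy the live object, modify the copy under the update mutex, publish it with rcu_assign_pointer(), and kfree() the old object only after synchronize_rcu() has waited out all readers. A minimal sketch of that discipline in isolation; cfg, active_cfg, cfg_update and update_resync_rate are illustrative names, not DRBD symbols:

struct cfg {
	int resync_rate;
};

static struct cfg __rcu *active_cfg;	/* readers use rcu_dereference() */
static DEFINE_MUTEX(cfg_update);	/* serializes updaters */

static int update_resync_rate(int rate)
{
	struct cfg *new_cfg, *old_cfg;

	new_cfg = kzalloc(sizeof(*new_cfg), GFP_KERNEL);
	if (!new_cfg)
		return -ENOMEM;

	mutex_lock(&cfg_update);
	old_cfg = rcu_dereference_protected(active_cfg,
					    lockdep_is_held(&cfg_update));
	*new_cfg = *old_cfg;			/* copy, then modify the copy */
	new_cfg->resync_rate = rate;
	rcu_assign_pointer(active_cfg, new_cfg);	/* publish */
	mutex_unlock(&cfg_update);

	synchronize_rcu();	/* wait for all pre-existing RCU readers */
	kfree(old_cfg);		/* now nobody can still see the old copy */
	return 0;
}

Readers correspondingly bracket rcu_dereference(active_cfg) with rcu_read_lock()/rcu_read_unlock(), which is exactly what the receive_sizes() hunk below does with ldev->disk_conf.
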
  
  /* warn if the arguments differ by more than 12.5% */
@@@ -2964,59 -3530,77 +3530,77 @@@ static void warn_if_differ_considerably
                     (unsigned long long)a, (unsigned long long)b);
  }
  
- static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi)
  {
-       struct p_sizes *p = &mdev->data.rbuf.sizes;
+       struct drbd_conf *mdev;
+       struct p_sizes *p = pi->data;
        enum determine_dev_size dd = unchanged;
        sector_t p_size, p_usize, my_usize;
        int ldsc = 0; /* local disk size changed */
        enum dds_flags ddsf;
  
+       mdev = vnr_to_mdev(tconn, pi->vnr);
+       if (!mdev)
+               return config_unknown_volume(tconn, pi);
+
        p_size = be64_to_cpu(p->d_size);
        p_usize = be64_to_cpu(p->u_size);
  
-       if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
-               dev_err(DEV, "some backing storage is needed\n");
-               drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-               return false;
-       }
        /* just store the peer's disk size for now.
         * we still need to figure out whether we accept that. */
        mdev->p_size = p_size;
  
        if (get_ldev(mdev)) {
+               rcu_read_lock();
+               my_usize = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
+               rcu_read_unlock();
                warn_if_differ_considerably(mdev, "lower level device sizes",
                           p_size, drbd_get_max_capacity(mdev->ldev));
                warn_if_differ_considerably(mdev, "user requested size",
-                                           p_usize, mdev->ldev->dc.disk_size);
+                                           p_usize, my_usize);
  
                /* if this is the first connect, or an otherwise expected
                 * param exchange, choose the minimum */
                if (mdev->state.conn == C_WF_REPORT_PARAMS)
-                       p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size,
-                                            p_usize);
-               my_usize = mdev->ldev->dc.disk_size;
-               if (mdev->ldev->dc.disk_size != p_usize) {
-                       mdev->ldev->dc.disk_size = p_usize;
-                       dev_info(DEV, "Peer sets u_size to %lu sectors\n",
-                            (unsigned long)mdev->ldev->dc.disk_size);
-               }
+                       p_usize = min_not_zero(my_usize, p_usize);
  
                /* Never shrink a device with usable data during connect.
                   But allow online shrinking if we are connected. */
-               if (drbd_new_dev_size(mdev, mdev->ldev, 0) <
-                  drbd_get_capacity(mdev->this_bdev) &&
-                  mdev->state.disk >= D_OUTDATED &&
-                  mdev->state.conn < C_CONNECTED) {
+               if (drbd_new_dev_size(mdev, mdev->ldev, p_usize, 0) <
+                   drbd_get_capacity(mdev->this_bdev) &&
+                   mdev->state.disk >= D_OUTDATED &&
+                   mdev->state.conn < C_CONNECTED) {
                        dev_err(DEV, "The peer's disk size is too small!\n");
-                       drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-                       mdev->ldev->dc.disk_size = my_usize;
+                       conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
                        put_ldev(mdev);
-                       return false;
+                       return -EIO;
+               }
+               if (my_usize != p_usize) {
+                       struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
+                       new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+                       if (!new_disk_conf) {
+                               dev_err(DEV, "Allocation of new disk_conf failed\n");
+                               put_ldev(mdev);
+                               return -ENOMEM;
+                       }
+                       mutex_lock(&mdev->tconn->conf_update);
+                       old_disk_conf = mdev->ldev->disk_conf;
+                       *new_disk_conf = *old_disk_conf;
+                       new_disk_conf->disk_size = p_usize;
+                       rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
+                       mutex_unlock(&mdev->tconn->conf_update);
+                       synchronize_rcu();
+                       kfree(old_disk_conf);
+                       dev_info(DEV, "Peer sets u_size to %lu sectors\n",
+                                (unsigned long)my_usize);
                }
                put_ldev(mdev);
        }
  
                dd = drbd_determine_dev_size(mdev, ddsf);
                put_ldev(mdev);
                if (dd == dev_size_error)
-                       return false;
+                       return -EIO;
                drbd_md_sync(mdev);
        } else {
                /* I am diskless, need to accept the peer's size. */
                         * needs to know my new size... */
                        drbd_send_sizes(mdev, 0, ddsf);
                }
-               if (drbd_test_and_clear_flag(mdev, RESIZE_PENDING) ||
+               if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
                    (dd == grew && mdev->state.conn == C_CONNECTED)) {
                        if (mdev->state.pdsk >= D_INCONSISTENT &&
                            mdev->state.disk >= D_INCONSISTENT) {
                                else
                                        resync_after_online_grow(mdev);
                        } else
-                               drbd_set_flag(mdev, RESYNC_AFTER_NEG);
+                               set_bit(RESYNC_AFTER_NEG, &mdev->flags);
                }
        }
  
-       return true;
+       return 0;
  }
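
The u_size negotiation above leans on min_not_zero() from <linux/kernel.h>, where zero means "no explicit request". Its behaviour, restated as a sketch (min_not_zero_sketch is an invented name, not a kernel symbol):

static inline sector_t min_not_zero_sketch(sector_t x, sector_t y)
{
	if (x == 0)
		return y;	/* min_not_zero(0, 5) == 5 */
	if (y == 0)
		return x;	/* min_not_zero(3, 0) == 3 */
	return min(x, y);	/* min_not_zero(3, 5) == 3 */
}

So a side with no configured disk_size can never force the agreed size down to zero.
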
  
- static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+ static int receive_uuids(struct drbd_tconn *tconn, struct packet_info *pi)
  {
-       struct p_uuids *p = &mdev->data.rbuf.uuids;
+       struct drbd_conf *mdev;
+       struct p_uuids *p = pi->data;
        u64 *p_uuid;
        int i, updated_uuids = 0;
  
+       mdev = vnr_to_mdev(tconn, pi->vnr);
+       if (!mdev)
+               return config_unknown_volume(tconn, pi);
+
        p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
  
        for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
            (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
                dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
                    (unsigned long long)mdev->ed_uuid);
-               drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-               return false;
+               conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+               return -EIO;
        }
  
        if (get_ldev(mdev)) {
                int skip_initial_sync =
                        mdev->state.conn == C_CONNECTED &&
-                       mdev->agreed_pro_version >= 90 &&
+                       mdev->tconn->agreed_pro_version >= 90 &&
                        mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
                        (p_uuid[UI_FLAGS] & 8);
                if (skip_initial_sync) {
           ongoing cluster wide state change is finished. That is important if
           we are primary and are detaching from our disk. We need to see the
           new disk state... */
-       wait_event(mdev->misc_wait, !drbd_test_flag(mdev, CLUSTER_ST_CHANGE));
+       mutex_lock(mdev->state_mutex);
+       mutex_unlock(mdev->state_mutex);
        if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
                updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
  
        if (updated_uuids)
                drbd_print_uuids(mdev, "receiver updated UUIDs to");
  
-       return true;
+       return 0;
  }
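
The UUID check above compares with "& ~((u64)1)": DRBD keeps a role marker in bit 0 of the current UUID, so two UUIDs that differ only in that bit still name the same data generation. The comparison in isolation (uuids_match is an invented helper, not a DRBD function):

static bool uuids_match(u64 a, u64 b)
{
	const u64 mask = ~(u64)1;	/* ignore the role bit */

	return (a & mask) == (b & mask);
}
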
  
  /**
@@@ -3140,6 -3730,7 +3730,7 @@@ static union drbd_state convert_state(u
        union drbd_state ms;
  
        static enum drbd_conns c_tab[] = {
+               [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
                [C_CONNECTED] = C_CONNECTED,
  
                [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
        ms.disk = ps.pdsk;
        ms.peer_isp = (ps.aftr_isp | ps.user_isp);
  
-       return ms;
+       return ms;
+ }
+
+ static int receive_req_state(struct drbd_tconn *tconn, struct packet_info *pi)
+ {
+       struct drbd_conf *mdev;
+       struct p_req_state *p = pi->data;
+       union drbd_state mask, val;
+       enum drbd_state_rv rv;
+
+       mdev = vnr_to_mdev(tconn, pi->vnr);
+       if (!mdev)
+               return -EIO;
+
+       mask.i = be32_to_cpu(p->mask);
+       val.i = be32_to_cpu(p->val);
+
+       if (test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags) &&
+           mutex_is_locked(mdev->state_mutex)) {
+               drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
+               return 0;
+       }
+
+       mask = convert_state(mask);
+       val = convert_state(val);
+
+       rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
+       drbd_send_sr_reply(mdev, rv);
+
+       drbd_md_sync(mdev);
+
+       return 0;
+ }
  
- static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+ static int receive_req_conn_state(struct drbd_tconn *tconn, struct packet_info *pi)
  {
-       struct p_req_state *p = &mdev->data.rbuf.req_state;
+       struct p_req_state *p = pi->data;
        union drbd_state mask, val;
        enum drbd_state_rv rv;
  
        mask.i = be32_to_cpu(p->mask);
        val.i = be32_to_cpu(p->val);
  
-       if (drbd_test_flag(mdev, DISCARD_CONCURRENT) &&
-           drbd_test_flag(mdev, CLUSTER_ST_CHANGE)) {
-               drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
-               return true;
+       if (test_bit(RESOLVE_CONFLICTS, &tconn->flags) &&
+           mutex_is_locked(&tconn->cstate_mutex)) {
+               conn_send_sr_reply(tconn, SS_CONCURRENT_ST_CHG);
+               return 0;
        }
  
        mask = convert_state(mask);
        val = convert_state(val);
  
-       rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
-       drbd_send_sr_reply(mdev, rv);
-       drbd_md_sync(mdev);
+       rv = conn_request_state(tconn, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
+       conn_send_sr_reply(tconn, rv);
  
-       return true;
+       return 0;
  }
  
- static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+ static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi)
  {
-       struct p_state *p = &mdev->data.rbuf.state;
+       struct drbd_conf *mdev;
+       struct p_state *p = pi->data;
        union drbd_state os, ns, peer_state;
        enum drbd_disk_state real_peer_disk;
        enum chg_state_flags cs_flags;
        int rv;
  
+       mdev = vnr_to_mdev(tconn, pi->vnr);
+       if (!mdev)
+               return config_unknown_volume(tconn, pi);
+
        peer_state.i = be32_to_cpu(p->state);
  
        real_peer_disk = peer_state.disk;
                dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
        }
  
-       spin_lock_irq(&mdev->req_lock);
+       spin_lock_irq(&mdev->tconn->req_lock);
   retry:
-       os = ns = mdev->state;
-       spin_unlock_irq(&mdev->req_lock);
+       os = ns = drbd_read_state(mdev);
+       spin_unlock_irq(&mdev->tconn->req_lock);
  
        /* If some other part of the code (asender thread, timeout)
         * already decided to close the connection again,
         * we must not "re-establish" it here. */
        if (os.conn <= C_TEAR_DOWN)
-               return false;
+               return -ECONNRESET;
  
        /* If this is the "end of sync" confirmation, usually the peer disk
         * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
                         peer_state.conn == C_CONNECTED) {
                        if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
                                drbd_resync_finished(mdev);
-                       return true;
+                       return 0;
                }
        }
  
        /* explicit verify finished notification, stop sector reached. */
        if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
            peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
-               ov_oos_print(mdev);
+               ov_out_of_sync_print(mdev);
                drbd_resync_finished(mdev);
-               return true;
+               return 0;
        }
  
        /* peer says his disk is inconsistent, while we think it is uptodate,
                        os.disk == D_NEGOTIATING));
                /* if we have both been inconsistent, and the peer has been
                 * forced to be UpToDate with --overwrite-data */
-               cr |= drbd_test_flag(mdev, CONSIDER_RESYNC);
+               cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
                /* if we had been plain connected, and the admin requested to
                 * start a sync by "invalidate" or "invalidate-remote" */
                cr |= (os.conn == C_CONNECTED &&
                                peer_state.disk = D_DISKLESS;
                                real_peer_disk = D_DISKLESS;
                        } else {
-                               if (drbd_test_and_clear_flag(mdev, CONN_DRY_RUN))
-                                       return false;
+                               if (test_and_clear_bit(CONN_DRY_RUN, &mdev->tconn->flags))
+                                       return -EIO;
                                D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
-                               drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-                               return false;
+                               conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+                               return -EIO;
                        }
                }
        }
  
-       spin_lock_irq(&mdev->req_lock);
-       if (mdev->state.i != os.i)
+       spin_lock_irq(&mdev->tconn->req_lock);
+       if (os.i != drbd_read_state(mdev).i)
                goto retry;
-       drbd_clear_flag(mdev, CONSIDER_RESYNC);
+       clear_bit(CONSIDER_RESYNC, &mdev->flags);
        ns.peer = peer_state.role;
        ns.pdsk = real_peer_disk;
        ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
        if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
                ns.disk = mdev->new_state_tmp.disk;
        cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
-       if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
-           drbd_test_flag(mdev, NEW_CUR_UUID)) {
-               /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
+       if (ns.pdsk == D_CONSISTENT && drbd_suspended(mdev) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
+           test_bit(NEW_CUR_UUID, &mdev->flags)) {
+               /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
           for temporary network outages! */
-               spin_unlock_irq(&mdev->req_lock);
+               spin_unlock_irq(&mdev->tconn->req_lock);
                dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
-               tl_clear(mdev);
+               tl_clear(mdev->tconn);
                drbd_uuid_new_current(mdev);
-               drbd_clear_flag(mdev, NEW_CUR_UUID);
-               drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
-               return false;
+               clear_bit(NEW_CUR_UUID, &mdev->flags);
+               conn_request_state(mdev->tconn, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
+               return -EIO;
        }
        rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
-       ns = mdev->state;
-       spin_unlock_irq(&mdev->req_lock);
+       ns = drbd_read_state(mdev);
+       spin_unlock_irq(&mdev->tconn->req_lock);
  
        if (rv < SS_SUCCESS) {
-               drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-               return false;
+               conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+               return -EIO;
        }
  
        if (os.conn > C_WF_REPORT_PARAMS) {
                }
        }
  
-       mdev->net_conf->want_lose = 0;
+       clear_bit(DISCARD_MY_DATA, &mdev->flags);
  
        drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
  
-       return true;
+       return 0;
  }
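
The retry label in receive_state() above is a small optimistic-concurrency loop: snapshot the state, drop the lock for the (potentially sleeping) evaluation, then re-take the lock and compare; if the state moved underneath, jump back, with the lock conveniently already held at the label. The bare shape of the idiom, with invented names (struct st, evaluate):

struct st {
	spinlock_t lock;
	union drbd_state state;
};

static void update_state(struct st *s)
{
	union drbd_state os, ns;

	spin_lock_irq(&s->lock);
retry:
	os = s->state;			/* snapshot under the lock */
	spin_unlock_irq(&s->lock);

	ns = evaluate(os);		/* hypothetical; may sleep or allocate */

	spin_lock_irq(&s->lock);
	if (s->state.i != os.i)		/* moved while we were unlocked? */
		goto retry;		/* lock is held again at the label */
	s->state = ns;
	spin_unlock_irq(&s->lock);
}
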
  
- static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+ static int receive_sync_uuid(struct drbd_tconn *tconn, struct packet_info *pi)
  {
-       struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid;
+       struct drbd_conf *mdev;
+       struct p_rs_uuid *p = pi->data;
+
+       mdev = vnr_to_mdev(tconn, pi->vnr);
+       if (!mdev)
+               return -EIO;
  
        wait_event(mdev->misc_wait,
                   mdev->state.conn == C_WF_SYNC_UUID ||
        } else
                dev_err(DEV, "Ignoring SyncUUID packet!\n");
  
-       return true;
+       return 0;
  }
  
  /**
   * code upon failure.
   */
  static int
- receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
-                    unsigned long *buffer, struct bm_xfer_ctx *c)
+ receive_bitmap_plain(struct drbd_conf *mdev, unsigned int size,
+                    unsigned long *p, struct bm_xfer_ctx *c)
  {
-       unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
-       unsigned want = num_words * sizeof(long);
+       unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
+                                drbd_header_size(mdev->tconn);
+       unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
+                                      c->bm_words - c->word_offset);
+       unsigned int want = num_words * sizeof(*p);
        int err;
  
-       if (want != data_size) {
-               dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
+       if (want != size) {
+               dev_err(DEV, "%s:want (%u) != size (%u)\n", __func__, want, size);
                return -EIO;
        }
        if (want == 0)
                return 0;
-       err = drbd_recv(mdev, buffer, want);
-       if (err != want) {
-               if (err >= 0)
-                       err = -EIO;
+       err = drbd_recv_all(mdev->tconn, p, want);
+       if (err)
                return err;
-       }
  
-       drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
+       drbd_bm_merge_lel(mdev, c->word_offset, num_words, p);
  
        c->word_offset += num_words;
        c->bit_offset = c->word_offset * BITS_PER_LONG;
        return 1;
  }
  
+ static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
+ {
+       return (enum drbd_bitmap_code)(p->encoding & 0x0f);
+ }
+
+ static int dcbp_get_start(struct p_compressed_bm *p)
+ {
+       return (p->encoding & 0x80) != 0;
+ }
+
+ static int dcbp_get_pad_bits(struct p_compressed_bm *p)
+ {
+       return (p->encoding >> 4) & 0x7;
+ }
+
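
The three dcbp_*() helpers unpack the single encoding byte of a p_compressed_bm. Reading the masks back, the layout they assume is:

/*
 * p_compressed_bm encoding byte, per the masks above:
 *
 *   bits 3..0  encoding code (e.g. RLE_VLI_Bits)  - dcbp_get_code()
 *   bits 6..4  pad bits at the end of the stream  - dcbp_get_pad_bits()
 *   bit  7     value of the first run             - dcbp_get_start()
 *
 * Worked example: encoding == 0x95 (1001 0101b)
 *   -> code 5, pad_bits 1, and the first run is a run of set bits.
 */
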
  /**
   * recv_bm_rle_bits
   *
  static int
  recv_bm_rle_bits(struct drbd_conf *mdev,
                struct p_compressed_bm *p,
-               struct bm_xfer_ctx *c)
+                struct bm_xfer_ctx *c,
+                unsigned int len)
  {
        struct bitstream bs;
        u64 look_ahead;
        u64 tmp;
        unsigned long s = c->bit_offset;
        unsigned long e;
-       int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head));
-       int toggle = DCBP_get_start(p);
+       int toggle = dcbp_get_start(p);
        int have;
        int bits;
  
-       bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p));
+       bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
  
        bits = bitstream_get_bits(&bs, &look_ahead, 64);
        if (bits < 0)
  static int
  decode_bitmap_c(struct drbd_conf *mdev,
                struct p_compressed_bm *p,
-               struct bm_xfer_ctx *c)
+               struct bm_xfer_ctx *c,
+               unsigned int len)
  {
-       if (DCBP_get_code(p) == RLE_VLI_Bits)
-               return recv_bm_rle_bits(mdev, p, c);
+       if (dcbp_get_code(p) == RLE_VLI_Bits)
+               return recv_bm_rle_bits(mdev, p, c, len - sizeof(*p));
  
        /* other variants had been implemented for evaluation,
         * but have been dropped as this one turned out to be "best"
         * during all our tests. */
  
        dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
-       drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+       conn_request_state(mdev->tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
        return -EIO;
  }
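
recv_bm_rle_bits() above decodes runs whose lengths are VLI-encoded in the bitstream; only the run lengths travel on the wire, while the bit value starts at dcbp_get_start() and flips after every run. Ignoring the VLI layer, the toggling itself looks like this (a hedged sketch with invented names, not the DRBD decoder):

static void apply_runs(unsigned long *bitmap, const unsigned long *runs,
		       int nruns, int toggle)
{
	unsigned long bit = 0;
	int i;

	for (i = 0; i < nruns; i++) {
		if (toggle) {		/* a run of 1s: set each bit */
			unsigned long e;

			for (e = bit; e < bit + runs[i]; e++)
				__set_bit(e, bitmap);
		}
		bit += runs[i];		/* runs of 0s are simply skipped */
		toggle = !toggle;
	}
}
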
  
@@@ -3515,11 -4161,13 +4161,13 @@@ void INFO_bm_xfer_stats(struct drbd_con
                const char *direction, struct bm_xfer_ctx *c)
  {
        /* what would it take to transfer it "plaintext" */
-       unsigned plain = sizeof(struct p_header80) *
-               ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
-               + c->bm_words * sizeof(long);
-       unsigned total = c->bytes[0] + c->bytes[1];
-       unsigned r;
+       unsigned int header_size = drbd_header_size(mdev->tconn);
+       unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
+       unsigned int plain =
+               header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
+               c->bm_words * sizeof(unsigned long);
+       unsigned int total = c->bytes[0] + c->bytes[1];
+       unsigned int r;
  
        /* total cannot be zero, but just in case: */
        if (total == 0)
     in order to be agnostic to the 32 vs 64 bits issue.
  
     returns 0 on failure, 1 if we successfully received it. */
- static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+ static int receive_bitmap(struct drbd_tconn *tconn, struct packet_info *pi)
  {
+       struct drbd_conf *mdev;
        struct bm_xfer_ctx c;
-       void *buffer;
        int err;
-       int ok = false;
-       struct p_header80 *h = &mdev->data.rbuf.header.h80;
+
+       mdev = vnr_to_mdev(tconn, pi->vnr);
+       if (!mdev)
+               return -EIO;
  
        drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED);
        /* you are supposed to send additional out-of-sync information
         * if you actually set bits during this phase */
  
-       /* maybe we should use some per thread scratch page,
-        * and allocate that during initial device creation? */
-       buffer   = (unsigned long *) __get_free_page(GFP_NOIO);
-       if (!buffer) {
-               dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
-               goto out;
-       }
        c = (struct bm_xfer_ctx) {
                .bm_bits = drbd_bm_bits(mdev),
                .bm_words = drbd_bm_words(mdev),
        };
  
        for(;;) {
-               if (cmd == P_BITMAP) {
-                       err = receive_bitmap_plain(mdev, data_size, buffer, &c);
-               } else if (cmd == P_COMPRESSED_BITMAP) {
+               if (pi->cmd == P_BITMAP)
+                       err = receive_bitmap_plain(mdev, pi->size, pi->data, &c);
+               else if (pi->cmd == P_COMPRESSED_BITMAP) {
                        /* MAYBE: sanity check that we speak proto >= 90,
                         * and the feature is enabled! */
-                       struct p_compressed_bm *p;
+                       struct p_compressed_bm *p = pi->data;
  
-                       if (data_size > BM_PACKET_PAYLOAD_BYTES) {
+                       if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(tconn)) {
                                dev_err(DEV, "ReportCBitmap packet too large\n");
+                               err = -EIO;
                                goto out;
                        }
-                       /* use the page buff */
-                       p = buffer;
-                       memcpy(p, h, sizeof(*h));
-                       if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
-                               goto out;
-                       if (data_size <= (sizeof(*p) - sizeof(p->head))) {
-                               dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size);
+                       if (pi->size <= sizeof(*p)) {
+                               dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", pi->size);
+                               err = -EIO;
                                goto out;
                        }
-                       err = decode_bitmap_c(mdev, p, &c);
+                       err = drbd_recv_all(mdev->tconn, p, pi->size);
+                       if (err)
+                               goto out;
+                       err = decode_bitmap_c(mdev, p, &c, pi->size);
                } else {
-                       dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd);
+                       dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd);
+                       err = -EIO;
                        goto out;
                }
  
-               c.packets[cmd == P_BITMAP]++;
-               c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
+               c.packets[pi->cmd == P_BITMAP]++;
+               c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(tconn) + pi->size;
  
                if (err <= 0) {
                        if (err < 0)
                                goto out;
                        break;
                }
-               if (!drbd_recv_header(mdev, &cmd, &data_size))
+               err = drbd_recv_header(mdev->tconn, pi);
+               if (err)
                        goto out;
        }
  
        if (mdev->state.conn == C_WF_BITMAP_T) {
                enum drbd_state_rv rv;
  
-               ok = !drbd_send_bitmap(mdev);
-               if (!ok)
+               err = drbd_send_bitmap(mdev);
+               if (err)
                        goto out;
                /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
                rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
                dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
                    drbd_conn_str(mdev->state.conn));
        }
+       err = 0;
  
-       ok = true;
   out:
        drbd_bm_unlock(mdev);
-       if (ok && mdev->state.conn == C_WF_BITMAP_S)
+       if (!err && mdev->state.conn == C_WF_BITMAP_S)
                drbd_start_resync(mdev, C_SYNC_SOURCE);
-       free_page((unsigned long) buffer);
-       return ok;
+       return err;
  }
  
- static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+ static int receive_skip(struct drbd_tconn *tconn, struct packet_info *pi)
  {
-       /* TODO zero copy sink :) */
-       static char sink[128];
-       int size, want, r;
-       dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
-                cmd, data_size);
+       conn_warn(tconn, "skipping unknown optional packet type %d, l: %d!\n",
+                pi->cmd, pi->size);
  
-       size = data_size;
-       while (size > 0) {
-               want = min_t(int, size, sizeof(sink));
-               r = drbd_recv(mdev, sink, want);
-               ERR_IF(r <= 0) break;
-               size -= r;
-       }
-       return size == 0;
+       return ignore_remaining_packet(tconn, pi);
  }
  
- static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+ static int receive_UnplugRemote(struct drbd_tconn *tconn, struct packet_info *pi)
  {
        /* Make sure we've acked all the TCP data associated
         * with the data requests being unplugged */
-       drbd_tcp_quickack(mdev->data.socket);
+       drbd_tcp_quickack(tconn->data.socket);
  
-       return true;
+       return 0;
  }
  
- static int receive_out_of_sync(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+ static int receive_out_of_sync(struct drbd_tconn *tconn, struct packet_info *pi)
  {
-       struct p_block_desc *p = &mdev->data.rbuf.block_desc;
+       struct drbd_conf *mdev;
+       struct p_block_desc *p = pi->data;
+
+       mdev = vnr_to_mdev(tconn, pi->vnr);
+       if (!mdev)
+               return -EIO;
  
        switch (mdev->state.conn) {
        case C_WF_SYNC_UUID:
  
        drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
  
-       return true;
+       return 0;
  }
  
- typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
  struct data_cmd {
        int expect_payload;
        size_t pkt_size;
-       drbd_cmd_handler_f function;
+       int (*fn)(struct drbd_tconn *, struct packet_info *);
  };
  
  static struct data_cmd drbd_cmd_handler[] = {
        [P_DATA_REPLY]      = { 1, sizeof(struct p_data), receive_DataReply },
        [P_RS_DATA_REPLY]   = { 1, sizeof(struct p_data), receive_RSDataReply } ,
        [P_BARRIER]         = { 0, sizeof(struct p_barrier), receive_Barrier } ,
-       [P_BITMAP]          = { 1, sizeof(struct p_header80), receive_bitmap } ,
-       [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
-       [P_UNPLUG_REMOTE]   = { 0, sizeof(struct p_header80), receive_UnplugRemote },
+       [P_BITMAP]          = { 1, 0, receive_bitmap } ,
+       [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } ,
+       [P_UNPLUG_REMOTE]   = { 0, 0, receive_UnplugRemote },
        [P_DATA_REQUEST]    = { 0, sizeof(struct p_block_req), receive_DataRequest },
        [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
-       [P_SYNC_PARAM]      = { 1, sizeof(struct p_header80), receive_SyncParam },
-       [P_SYNC_PARAM89]    = { 1, sizeof(struct p_header80), receive_SyncParam },
+       [P_SYNC_PARAM]      = { 1, 0, receive_SyncParam },
+       [P_SYNC_PARAM89]    = { 1, 0, receive_SyncParam },
        [P_PROTOCOL]        = { 1, sizeof(struct p_protocol), receive_protocol },
        [P_UUIDS]           = { 0, sizeof(struct p_uuids), receive_uuids },
        [P_SIZES]           = { 0, sizeof(struct p_sizes), receive_sizes },
        [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
        [P_DELAY_PROBE]     = { 0, sizeof(struct p_delay_probe93), receive_skip },
        [P_OUT_OF_SYNC]     = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
-       /* anything missing from this table is in
-        * the asender_tbl, see get_asender_cmd */
-       [P_MAX_CMD]         = { 0, 0, NULL },
+       [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
+       [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
  };
  
- /* All handler functions that expect a sub-header get that sub-heder in
-    mdev->data.rbuf.header.head.payload.
-    Usually in mdev->data.rbuf.header.head the callback can find the usual
-    p_header, but they may not rely on that. Since there is also p_header95 !
-  */
- static void drbdd(struct drbd_conf *mdev)
+ static void drbdd(struct drbd_tconn *tconn)
  {
-       union p_header *header = &mdev->data.rbuf.header;
-       unsigned int packet_size;
-       enum drbd_packets cmd;
+       struct packet_info pi;
        size_t shs; /* sub header size */
-       int rv;
+       int err;
+
+       while (get_t_state(&tconn->receiver) == RUNNING) {
+               struct data_cmd *cmd;
  
-       while (get_t_state(&mdev->receiver) == Running) {
-               drbd_thread_current_set_cpu(mdev);
-               if (!drbd_recv_header(mdev, &cmd, &packet_size))
+               drbd_thread_current_set_cpu(&tconn->receiver);
+               if (drbd_recv_header(tconn, &pi))
                        goto err_out;
  
-               if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) {
-                       dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size);
+               cmd = &drbd_cmd_handler[pi.cmd];
+               if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
+                       conn_err(tconn, "Unexpected data packet %s (0x%04x)",
+                                cmdname(pi.cmd), pi.cmd);
                        goto err_out;
                }
  
-               shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
-               if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) {
-                       dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size);
+               shs = cmd->pkt_size;
+               if (pi.size > shs && !cmd->expect_payload) {
+                       conn_err(tconn, "No payload expected %s l:%d\n",
+                                cmdname(pi.cmd), pi.size);
                        goto err_out;
                }
  
                if (shs) {
-                       rv = drbd_recv(mdev, &header->h80.payload, shs);
-                       if (unlikely(rv != shs)) {
-                               if (!signal_pending(current))
-                                       dev_warn(DEV, "short read while reading sub header: rv=%d\n", rv);
+                       err = drbd_recv_all_warn(tconn, pi.data, shs);
+                       if (err)
                                goto err_out;
-                       }
+                       pi.size -= shs;
                }
  
-               rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
-               if (unlikely(!rv)) {
-                       dev_err(DEV, "error receiving %s, l: %d!\n",
-                           cmdname(cmd), packet_size);
+               err = cmd->fn(tconn, &pi);
+               if (err) {
+                       conn_err(tconn, "error receiving %s, e: %d l: %d!\n",
+                                cmdname(pi.cmd), err, pi.size);
                        goto err_out;
                }
        }
+       return;
  
-       if (0) {
-       err_out:
-               drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
-       }
-       /* If we leave here, we probably want to update at least the
-        * "Connected" indicator on stable storage. Do so explicitly here. */
-       drbd_md_sync(mdev);
+     err_out:
+       conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
  }
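
drbdd() above is a table-driven dispatcher: the packet type indexes drbd_cmd_handler[], the fixed header part (pkt_size) is read in before the callback runs, and a payload on a packet whose slot says expect_payload == 0 is treated as a protocol error. The same shape in miniature, with invented toy_* types standing in for the DRBD ones:

struct toy_pkt { unsigned int cmd, size; void *data; };

struct toy_cmd {
	int expect_payload;
	size_t pkt_size;
	int (*fn)(struct toy_pkt *);
};

static int toy_ping(struct toy_pkt *p) { return 0; }

static const struct toy_cmd toy_handler[] = {
	[0] = { 0, 0, toy_ping },	/* fixed part only, no payload */
};

static int toy_dispatch(struct toy_pkt *p)
{
	const struct toy_cmd *cmd;

	if (p->cmd >= ARRAY_SIZE(toy_handler) || !toy_handler[p->cmd].fn)
		return -EINVAL;		/* unknown packet type */
	cmd = &toy_handler[p->cmd];
	if (p->size > cmd->pkt_size && !cmd->expect_payload)
		return -EIO;		/* unexpected payload */
	return cmd->fn(p);
}
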
  
- void drbd_flush_workqueue(struct drbd_conf *mdev)
+ void conn_flush_workqueue(struct drbd_tconn *tconn)
  {
        struct drbd_wq_barrier barr;
  
        barr.w.cb = w_prev_work_done;
+       barr.w.tconn = tconn;
        init_completion(&barr.done);
-       drbd_queue_work(&mdev->data.work, &barr.w);
+       drbd_queue_work(&tconn->sender_work, &barr.w);
        wait_for_completion(&barr.done);
  }
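
conn_flush_workqueue() above drains DRBD's private work queue by queuing a barrier item whose callback (w_prev_work_done) merely completes a completion; once it has run, everything queued before it has run too. The same idiom expressed against the generic kernel workqueue API, as a sketch that assumes wq processes items in order, the way DRBD's single sender thread does:

struct flush_barrier {
	struct work_struct w;
	struct completion done;
};

static void barrier_fn(struct work_struct *w)
{
	struct flush_barrier *b = container_of(w, struct flush_barrier, w);

	complete(&b->done);
}

static void flush_one_queue(struct workqueue_struct *wq)
{
	struct flush_barrier b;

	INIT_WORK_ONSTACK(&b.w, barrier_fn);
	init_completion(&b.done);
	queue_work(wq, &b.w);
	wait_for_completion(&b.done);	/* everything queued earlier ran */
}
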
  
- void drbd_free_tl_hash(struct drbd_conf *mdev)
- {
-       struct hlist_head *h;
-       spin_lock_irq(&mdev->req_lock);
-       if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
-               spin_unlock_irq(&mdev->req_lock);
-               return;
-       }
-       /* paranoia code */
-       for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
-               if (h->first)
-                       dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
-                               (int)(h - mdev->ee_hash), h->first);
-       kfree(mdev->ee_hash);
-       mdev->ee_hash = NULL;
-       mdev->ee_hash_s = 0;
-       /* We may not have had the chance to wait for all locally pending
-        * application requests. The hlist_add_fake() prevents access after
-        * free on master bio completion. */
-       for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) {
-               struct drbd_request *req;
-               struct hlist_node *pos, *n;
-               hlist_for_each_entry_safe(req, pos, n, h, collision) {
-                       hlist_del_init(&req->collision);
-                       hlist_add_fake(&req->collision);
-               }
-       }
-       kfree(mdev->tl_hash);
-       mdev->tl_hash = NULL;
-       mdev->tl_hash_s = 0;
-       spin_unlock_irq(&mdev->req_lock);
- }
- static void drbd_disconnect(struct drbd_conf *mdev)
+ static void conn_disconnect(struct drbd_tconn *tconn)
  {
-       enum drbd_fencing_p fp;
-       union drbd_state os, ns;
-       int rv = SS_UNKNOWN_ERROR;
-       unsigned int i;
+       struct drbd_conf *mdev;
+       enum drbd_conns oc;
+       int vnr;
  
-       if (mdev->state.conn == C_STANDALONE)
+       if (tconn->cstate == C_STANDALONE)
                return;
  
        /* We are about to start the cleanup after connection loss.
         * Usually we should be in some network failure state already,
         * but just in case we are not, we fix it up here.
         */
-       drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+       conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD);
  
        /* asender does not clean up anything. it must not interfere, either */
-       drbd_thread_stop(&mdev->asender);
-       drbd_free_sock(mdev);
+       drbd_thread_stop(&tconn->asender);
+       drbd_free_sock(tconn);
+
+       rcu_read_lock();
+       idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+               kref_get(&mdev->kref);
+               rcu_read_unlock();
+               drbd_disconnected(mdev);
+               kref_put(&mdev->kref, &drbd_minor_destroy);
+               rcu_read_lock();
+       }
+       rcu_read_unlock();
+
+       if (!list_empty(&tconn->current_epoch->list))
+               conn_err(tconn, "ASSERTION FAILED: tconn->current_epoch->list not empty\n");
+       /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
+       atomic_set(&tconn->current_epoch->epoch_size, 0);
+       tconn->send.seen_any_write_yet = false;
+
+       conn_info(tconn, "Connection closed\n");
+
+       if (conn_highest_role(tconn) == R_PRIMARY && conn_highest_pdsk(tconn) >= D_UNKNOWN)
+               conn_try_outdate_peer_async(tconn);
+
+       spin_lock_irq(&tconn->req_lock);
+       oc = tconn->cstate;
+       if (oc >= C_UNCONNECTED)
+               _conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE);
+       spin_unlock_irq(&tconn->req_lock);
+
+       if (oc == C_DISCONNECTING)
+               conn_request_state(tconn, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
+ }
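
The idr walk in conn_disconnect() has to call drbd_disconnected(), which can sleep, so it cannot stay inside the RCU read section; each object is pinned with a kref instead, the RCU lock dropped for the blocking work, and the walk resumed afterwards. The pattern reduced to its bones; objs, obj, id, do_blocking_work() and obj_destroy() are placeholders, only the lock/ref choreography is the point:

	rcu_read_lock();
	idr_for_each_entry(&objs, obj, id) {
		kref_get(&obj->kref);	/* pin: object survives the unlock */
		rcu_read_unlock();

		do_blocking_work(obj);	/* may sleep, so no RCU lock here */

		kref_put(&obj->kref, obj_destroy);
		rcu_read_lock();	/* re-enter to continue the walk */
	}
	rcu_read_unlock();
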
+ static int drbd_disconnected(struct drbd_conf *mdev)
+ {
+       unsigned int i;
  
        /* wait for current activity to cease. */
-       spin_lock_irq(&mdev->req_lock);
+       spin_lock_irq(&mdev->tconn->req_lock);
        _drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
        _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
        _drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
-       spin_unlock_irq(&mdev->req_lock);
+       spin_unlock_irq(&mdev->tconn->req_lock);
  
        /* We do not have data structures that would allow us to
         * get the rs_pending_cnt down to 0 again.
        atomic_set(&mdev->rs_pending_cnt, 0);
        wake_up(&mdev->misc_wait);
  
-       /* make sure syncer is stopped and w_resume_next_sg queued */
        del_timer_sync(&mdev->resync_timer);
        resync_timer_fn((unsigned long)mdev);
  
         * to be "canceled" */
        drbd_flush_workqueue(mdev);
  
-       /* This also does reclaim_net_ee().  If we do this too early, we might
-        * miss some resync ee and pages.*/
-       drbd_process_done_ee(mdev);
+       drbd_finish_peer_reqs(mdev);
+       /* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
+          might have queued more work. The flush before drbd_finish_peer_reqs() is
+          necessary to reclaim net_ee in drbd_finish_peer_reqs(). */
+       drbd_flush_workqueue(mdev);
+       /* need to do it again, drbd_finish_peer_reqs() may have populated it
+        * again via drbd_try_clear_on_disk_bm(). */
+       drbd_rs_cancel_all(mdev);
  
        kfree(mdev->p_uuid);
        mdev->p_uuid = NULL;
  
-       if (!is_susp(mdev->state))
-               tl_clear(mdev);
-       dev_info(DEV, "Connection closed\n");
+       if (!drbd_suspended(mdev))
+               tl_clear(mdev->tconn);
  
        drbd_md_sync(mdev);
  
-       fp = FP_DONT_CARE;
-       if (get_ldev(mdev)) {
-               fp = mdev->ldev->dc.fencing;
-               put_ldev(mdev);
-       }
-       if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN)
-               drbd_try_outdate_peer_async(mdev);
-       spin_lock_irq(&mdev->req_lock);
-       os = mdev->state;
-       if (os.conn >= C_UNCONNECTED) {
-               /* Do not restart in case we are C_DISCONNECTING */
-               ns = os;
-               ns.conn = C_UNCONNECTED;
-               rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
-       }
-       spin_unlock_irq(&mdev->req_lock);
-       if (os.conn == C_DISCONNECTING) {
-               wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
-               crypto_free_hash(mdev->cram_hmac_tfm);
-               mdev->cram_hmac_tfm = NULL;
-               kfree(mdev->net_conf);
-               mdev->net_conf = NULL;
-               drbd_request_state(mdev, NS(conn, C_STANDALONE));
-       }
        /* serialize with bitmap writeout triggered by the state change,
         * if any. */
-       wait_event(mdev->misc_wait, !drbd_test_flag(mdev, BITMAP_IO));
+       wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
  
        /* tcp_close and release of sendpage pages can be deferred.  I don't
         * want to use SO_LINGER, because apparently it can be deferred for
         * Actually we don't care for exactly when the network stack does its
         * put_page(), but release our reference on these pages right here.
         */
-       i = drbd_release_ee(mdev, &mdev->net_ee);
+       i = drbd_free_peer_reqs(mdev, &mdev->net_ee);
        if (i)
                dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
        i = atomic_read(&mdev->pp_in_use_by_net);
        D_ASSERT(list_empty(&mdev->sync_ee));
        D_ASSERT(list_empty(&mdev->done_ee));
  
-       /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
-       atomic_set(&mdev->current_epoch->epoch_size, 0);
-       D_ASSERT(list_empty(&mdev->current_epoch->list));
+       return 0;
  }
  
  /*
   *
   * for now, they are expected to be zero, but ignored.
   */
- static int drbd_send_handshake(struct drbd_conf *mdev)
+ static int drbd_send_features(struct drbd_tconn *tconn)
  {
-       /* ASSERT current == mdev->receiver ... */
-       struct p_handshake *p = &mdev->data.sbuf.handshake;
-       int ok;
-       if (mutex_lock_interruptible(&mdev->data.mutex)) {
-               dev_err(DEV, "interrupted during initial handshake\n");
-               return 0; /* interrupted. not ok. */
-       }
-       if (mdev->data.socket == NULL) {
-               mutex_unlock(&mdev->data.mutex);
-               return 0;
-       }
+       struct drbd_socket *sock;
+       struct p_connection_features *p;
  
+       sock = &tconn->data;
+       p = conn_prepare_command(tconn, sock);
+       if (!p)
+               return -EIO;
        memset(p, 0, sizeof(*p));
        p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
        p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
-       ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
-                            (struct p_header80 *)p, sizeof(*p), 0 );
-       mutex_unlock(&mdev->data.mutex);
-       return ok;
+       return conn_send_command(tconn, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
  }
  
  /*
   *  -1 peer talks different language,
   *     no point in trying again, please go standalone.
   */
- static int drbd_do_handshake(struct drbd_conf *mdev)
+ static int drbd_do_features(struct drbd_tconn *tconn)
  {
-       /* ASSERT current == mdev->receiver ... */
-       struct p_handshake *p = &mdev->data.rbuf.handshake;
-       const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80);
-       unsigned int length;
-       enum drbd_packets cmd;
-       int rv;
+       /* ASSERT current == tconn->receiver ... */
+       struct p_connection_features *p;
+       const int expect = sizeof(struct p_connection_features);
+       struct packet_info pi;
+       int err;
  
-       rv = drbd_send_handshake(mdev);
-       if (!rv)
+       err = drbd_send_features(tconn);
+       if (err)
                return 0;
  
-       rv = drbd_recv_header(mdev, &cmd, &length);
-       if (!rv)
+       err = drbd_recv_header(tconn, &pi);
+       if (err)
                return 0;
  
-       if (cmd != P_HAND_SHAKE) {
-               dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
-                    cmdname(cmd), cmd);
+       if (pi.cmd != P_CONNECTION_FEATURES) {
+               conn_err(tconn, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
+                        cmdname(pi.cmd), pi.cmd);
                return -1;
        }
  
-       if (length != expect) {
-               dev_err(DEV, "expected HandShake length: %u, received: %u\n",
-                    expect, length);
+       if (pi.size != expect) {
+               conn_err(tconn, "expected ConnectionFeatures length: %u, received: %u\n",
+                    expect, pi.size);
                return -1;
        }
  
-       rv = drbd_recv(mdev, &p->head.payload, expect);
-       if (rv != expect) {
-               if (!signal_pending(current))
-                       dev_warn(DEV, "short read receiving handshake packet: l=%u\n", rv);
+       p = pi.data;
+       err = drbd_recv_all_warn(tconn, p, expect);
+       if (err)
                return 0;
-       }
  
        p->protocol_min = be32_to_cpu(p->protocol_min);
        p->protocol_max = be32_to_cpu(p->protocol_max);
            PRO_VERSION_MIN > p->protocol_max)
                goto incompat;
  
-       mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
+       tconn->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
  
-       dev_info(DEV, "Handshake successful: "
-            "Agreed network protocol version %d\n", mdev->agreed_pro_version);
+       conn_info(tconn, "Handshake successful: "
+            "Agreed network protocol version %d\n", tconn->agreed_pro_version);
  
        return 1;
  
   incompat:
-       dev_err(DEV, "incompatible DRBD dialects: "
+       conn_err(tconn, "incompatible DRBD dialects: "
            "I support %d-%d, peer supports %d-%d\n",
            PRO_VERSION_MIN, PRO_VERSION_MAX,
            p->protocol_min, p->protocol_max);
  }
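
The feature handshake above reduces to interval arithmetic: the ranges [PRO_VERSION_MIN, PRO_VERSION_MAX] and [protocol_min, protocol_max] must overlap, and the agreed version is the largest one both sides speak. Factored out as a sketch (agree_version is an invented helper):

/* returns the agreed protocol version, or -1 if the ranges are disjoint */
static int agree_version(int my_min, int my_max, int peer_min, int peer_max)
{
	if (my_max < peer_min || peer_max < my_min)
		return -1;		/* incompatible dialects */
	return min(my_max, peer_max);
}
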
  
  #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
- static int drbd_do_auth(struct drbd_conf *mdev)
+ static int drbd_do_auth(struct drbd_tconn *tconn)
  {
        dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
        dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
        -1 - auth failed, don't try again.
  */
  
- static int drbd_do_auth(struct drbd_conf *mdev)
+ static int drbd_do_auth(struct drbd_tconn *tconn)
  {
+       struct drbd_socket *sock;
        char my_challenge[CHALLENGE_LEN];  /* 64 Bytes... */
        struct scatterlist sg;
        char *response = NULL;
        char *right_response = NULL;
        char *peers_ch = NULL;
-       unsigned int key_len = strlen(mdev->net_conf->shared_secret);
+       unsigned int key_len;
+       char secret[SHARED_SECRET_MAX]; /* 64 byte */
        unsigned int resp_size;
        struct hash_desc desc;
-       enum drbd_packets cmd;
-       unsigned int length;
-       int rv;
+       struct packet_info pi;
+       struct net_conf *nc;
+       int err, rv;
+
+       /* FIXME: Put the challenge/response into the preallocated socket buffer.  */
  
-       desc.tfm = mdev->cram_hmac_tfm;
+       rcu_read_lock();
+       nc = rcu_dereference(tconn->net_conf);
+       key_len = strlen(nc->shared_secret);
+       memcpy(secret, nc->shared_secret, key_len);
+       rcu_read_unlock();
+
+       desc.tfm = tconn->cram_hmac_tfm;
        desc.flags = 0;
  
-       rv = crypto_hash_setkey(mdev->cram_hmac_tfm,
-                               (u8 *)mdev->net_conf->shared_secret, key_len);
+       rv = crypto_hash_setkey(tconn->cram_hmac_tfm, (u8 *)secret, key_len);
        if (rv) {
-               dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv);
+               conn_err(tconn, "crypto_hash_setkey() failed with %d\n", rv);
                rv = -1;
                goto fail;
        }
  
        get_random_bytes(my_challenge, CHALLENGE_LEN);
  
-       rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN);
+       sock = &tconn->data;
+       if (!conn_prepare_command(tconn, sock)) {
+               rv = 0;
+               goto fail;
+       }
+       rv = !conn_send_command(tconn, sock, P_AUTH_CHALLENGE, 0,
+                               my_challenge, CHALLENGE_LEN);
        if (!rv)
                goto fail;
  
-       rv = drbd_recv_header(mdev, &cmd, &length);
-       if (!rv)
+       err = drbd_recv_header(tconn, &pi);
+       if (err) {
+               rv = 0;
                goto fail;
+       }
  
-       if (cmd != P_AUTH_CHALLENGE) {
-               dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
-                   cmdname(cmd), cmd);
+       if (pi.cmd != P_AUTH_CHALLENGE) {
+               conn_err(tconn, "expected AuthChallenge packet, received: %s (0x%04x)\n",
+                        cmdname(pi.cmd), pi.cmd);
                rv = 0;
                goto fail;
        }
  
-       if (length > CHALLENGE_LEN * 2) {
-               dev_err(DEV, "expected AuthChallenge payload too big.\n");
+       if (pi.size > CHALLENGE_LEN * 2) {
+               conn_err(tconn, "expected AuthChallenge payload too big.\n");
                rv = -1;
                goto fail;
        }
  
-       peers_ch = kmalloc(length, GFP_NOIO);
+       peers_ch = kmalloc(pi.size, GFP_NOIO);
        if (peers_ch == NULL) {
-               dev_err(DEV, "kmalloc of peers_ch failed\n");
+               conn_err(tconn, "kmalloc of peers_ch failed\n");
                rv = -1;
                goto fail;
        }
  
-       rv = drbd_recv(mdev, peers_ch, length);
-       if (rv != length) {
-               if (!signal_pending(current))
-                       dev_warn(DEV, "short read AuthChallenge: l=%u\n", rv);
+       err = drbd_recv_all_warn(tconn, peers_ch, pi.size);
+       if (err) {
                rv = 0;
                goto fail;
        }
  
-       resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm);
+       resp_size = crypto_hash_digestsize(tconn->cram_hmac_tfm);
        response = kmalloc(resp_size, GFP_NOIO);
        if (response == NULL) {
-               dev_err(DEV, "kmalloc of response failed\n");
+               conn_err(tconn, "kmalloc of response failed\n");
                rv = -1;
                goto fail;
        }
  
        sg_init_table(&sg, 1);
-       sg_set_buf(&sg, peers_ch, length);
+       sg_set_buf(&sg, peers_ch, pi.size);
  
        rv = crypto_hash_digest(&desc, &sg, sg.length, response);
        if (rv) {
-               dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
+               conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv);
                rv = -1;
                goto fail;
        }
  
-       rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size);
-       if (!rv)
+       if (!conn_prepare_command(tconn, sock)) {
+               rv = 0;
                goto fail;
-       rv = drbd_recv_header(mdev, &cmd, &length);
+       }
+       rv = !conn_send_command(tconn, sock, P_AUTH_RESPONSE, 0,
+                               response, resp_size);
        if (!rv)
                goto fail;
  
-       if (cmd != P_AUTH_RESPONSE) {
-               dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
-                       cmdname(cmd), cmd);
+       err = drbd_recv_header(tconn, &pi);
+       if (err) {
                rv = 0;
                goto fail;
        }
  
-       if (length != resp_size) {
-               dev_err(DEV, "expected AuthResponse payload of wrong size\n");
+       if (pi.cmd != P_AUTH_RESPONSE) {
+               conn_err(tconn, "expected AuthResponse packet, received: %s (0x%04x)\n",
+                        cmdname(pi.cmd), pi.cmd);
                rv = 0;
                goto fail;
        }
  
-       rv = drbd_recv(mdev, response , resp_size);
+       if (pi.size != resp_size) {
+               conn_err(tconn, "expected AuthResponse payload of wrong size\n");
+               rv = 0;
+               goto fail;
+       }
  
-       if (rv != resp_size) {
-               if (!signal_pending(current))
-                       dev_warn(DEV, "short read receiving AuthResponse: l=%u\n", rv);
+       err = drbd_recv_all_warn(tconn, response, resp_size);
+       if (err) {
                rv = 0;
                goto fail;
        }
  
        right_response = kmalloc(resp_size, GFP_NOIO);
        if (right_response == NULL) {
-               dev_err(DEV, "kmalloc of right_response failed\n");
+               conn_err(tconn, "kmalloc of right_response failed\n");
                rv = -1;
                goto fail;
        }
  
        rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
        if (rv) {
-               dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
+               conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv);
                rv = -1;
                goto fail;
        }
        rv = !memcmp(response, right_response, resp_size);
  
        if (rv)
-               dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n",
-                    resp_size, mdev->net_conf->cram_hmac_alg);
+               conn_info(tconn, "Peer authenticated using %d bytes HMAC\n",
+                    resp_size);
        else
                rv = -1;
  
  
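The exchange above is a symmetric challenge/response handshake: each side sends a random challenge, and each proves knowledge of the shared secret by returning an HMAC over the peer's challenge, which the peer recomputes and compares. A minimal userspace sketch of the same shape, using OpenSSL's one-shot HMAC() in place of the kernel crypto API (the hash choice and all names below are illustrative, not DRBD's):

#include <openssl/evp.h>
#include <openssl/hmac.h>
#include <openssl/rand.h>
#include <stdio.h>
#include <string.h>

#define CHAL_LEN 64

/* response = HMAC(secret, peer's challenge) */
static unsigned int respond(const char *secret,
                            const unsigned char *challenge,
                            unsigned char *out)
{
        unsigned int out_len = 0;

        HMAC(EVP_sha256(), secret, (int)strlen(secret),
             challenge, CHAL_LEN, out, &out_len);
        return out_len;
}

int main(void)
{
        const char *secret = "shared-secret";
        unsigned char challenge[CHAL_LEN];
        unsigned char response[EVP_MAX_MD_SIZE];
        unsigned char right_response[EVP_MAX_MD_SIZE];
        unsigned int n;

        RAND_bytes(challenge, CHAL_LEN);             /* get_random_bytes() above */
        n = respond(secret, challenge, response);          /* peer's answer */
        respond(secret, challenge, right_response);        /* local recompute */
        printf("peer %s\n", memcmp(response, right_response, n) == 0 ?
               "authenticated" : "rejected");
        return 0;
}
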
  int drbdd_init(struct drbd_thread *thi)
  {
-       struct drbd_conf *mdev = thi->mdev;
-       unsigned int minor = mdev_to_minor(mdev);
+       struct drbd_tconn *tconn = thi->tconn;
        int h;
  
-       sprintf(current->comm, "drbd%d_receiver", minor);
-       dev_info(DEV, "receiver (re)started\n");
+       conn_info(tconn, "receiver (re)started\n");
  
        do {
-               h = drbd_connect(mdev);
+               h = conn_connect(tconn);
                if (h == 0) {
-                       drbd_disconnect(mdev);
+                       conn_disconnect(tconn);
                        schedule_timeout_interruptible(HZ);
                }
                if (h == -1) {
-                       dev_warn(DEV, "Discarding network configuration.\n");
-                       drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+                       conn_warn(tconn, "Discarding network configuration.\n");
+                       conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
                }
        } while (h == 0);
  
-       if (h > 0) {
-               if (get_net_conf(mdev)) {
-                       drbdd(mdev);
-                       put_net_conf(mdev);
-               }
-       }
+       if (h > 0)
+               drbdd(tconn);
  
-       drbd_disconnect(mdev);
+       conn_disconnect(tconn);
  
-       dev_info(DEV, "receiver terminated\n");
+       conn_info(tconn, "receiver terminated\n");
        return 0;
  }
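
/* Return-code contract in the loop above: conn_connect() == 0 is a transient
 * failure (disconnect, sleep a second, retry); == -1 gives up on this network
 * configuration (force C_DISCONNECTING); > 0 means the handshake succeeded
 * and drbdd() runs until the connection is lost. */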
  
  /* ********* acknowledge sender ******** */
  
- static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h)
+ static int got_conn_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi)
  {
-       struct p_req_state_reply *p = (struct p_req_state_reply *)h;
+       struct p_req_state_reply *p = pi->data;
+       int retcode = be32_to_cpu(p->retcode);
+
+       if (retcode >= SS_SUCCESS) {
+               set_bit(CONN_WD_ST_CHG_OKAY, &tconn->flags);
+       } else {
+               set_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags);
+               conn_err(tconn, "Requested state change failed by peer: %s (%d)\n",
+                        drbd_set_st_err_str(retcode), retcode);
+       }
+       wake_up(&tconn->ping_wait);
+       return 0;
+ }
  
+ static int got_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi)
+ {
+       struct drbd_conf *mdev;
+       struct p_req_state_reply *p = pi->data;
        int retcode = be32_to_cpu(p->retcode);
  
+       mdev = vnr_to_mdev(tconn, pi->vnr);
+       if (!mdev)
+               return -EIO;
+
+       if (test_bit(CONN_WD_ST_CHG_REQ, &tconn->flags)) {
+               D_ASSERT(tconn->agreed_pro_version < 100);
+               return got_conn_RqSReply(tconn, pi);
+       }
        if (retcode >= SS_SUCCESS) {
-               drbd_set_flag(mdev, CL_ST_CHG_SUCCESS);
+               set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
        } else {
-               drbd_set_flag(mdev, CL_ST_CHG_FAIL);
+               set_bit(CL_ST_CHG_FAIL, &mdev->flags);
                dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
-                   drbd_set_st_err_str(retcode), retcode);
+                       drbd_set_st_err_str(retcode), retcode);
        }
        wake_up(&mdev->state_wait);
  
-       return true;
+       return 0;
  }
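
/* Compatibility note on the test above: peers that do not yet speak
 * protocol 100 answer a connection-wide state change request with a plain
 * per-volume P_STATE_CHG_REPLY, so while CONN_WD_ST_CHG_REQ is set the
 * reply is rerouted to got_conn_RqSReply(). */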
  
- static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h)
+ static int got_Ping(struct drbd_tconn *tconn, struct packet_info *pi)
  {
-       return drbd_send_ping_ack(mdev);
+       return drbd_send_ping_ack(tconn);
  
  }
  
- static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h)
+ static int got_PingAck(struct drbd_tconn *tconn, struct packet_info *pi)
  {
        /* restore idle timeout */
-       mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
-       if (!drbd_test_and_set_flag(mdev, GOT_PING_ACK))
-               wake_up(&mdev->misc_wait);
+       tconn->meta.socket->sk->sk_rcvtimeo = tconn->net_conf->ping_int*HZ;
+       if (!test_and_set_bit(GOT_PING_ACK, &tconn->flags))
+               wake_up(&tconn->ping_wait);
  
-       return true;
+       return 0;
  }
  
- static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
+ static int got_IsInSync(struct drbd_tconn *tconn, struct packet_info *pi)
  {
-       struct p_block_ack *p = (struct p_block_ack *)h;
+       struct drbd_conf *mdev;
+       struct p_block_ack *p = pi->data;
        sector_t sector = be64_to_cpu(p->sector);
        int blksize = be32_to_cpu(p->blksize);
  
-       D_ASSERT(mdev->agreed_pro_version >= 89);
+       mdev = vnr_to_mdev(tconn, pi->vnr);
+       if (!mdev)
+               return -EIO;
+
+       D_ASSERT(mdev->tconn->agreed_pro_version >= 89);
  
        update_peer_seq(mdev, be32_to_cpu(p->seq_num));
  
        dec_rs_pending(mdev);
        atomic_add(blksize >> 9, &mdev->rs_sect_in);
  
-       return true;
- }
- /* when we receive the ACK for a write request,
-  * verify that we actually know about it */
- static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
-       u64 id, sector_t sector)
- {
-       struct hlist_head *slot = tl_hash_slot(mdev, sector);
-       struct hlist_node *n;
-       struct drbd_request *req;
-       hlist_for_each_entry(req, n, slot, collision) {
-               if ((unsigned long)req == (unsigned long)id) {
-                       if (req->sector != sector) {
-                               dev_err(DEV, "_ack_id_to_req: found req %p but it has "
-                                   "wrong sector (%llus versus %llus)\n", req,
-                                   (unsigned long long)req->sector,
-                                   (unsigned long long)sector);
-                               break;
-                       }
-                       return req;
-               }
-       }
-       return NULL;
+       return 0;
  }
  
- typedef struct drbd_request *(req_validator_fn)
-       (struct drbd_conf *mdev, u64 id, sector_t sector);
- static int validate_req_change_req_state(struct drbd_conf *mdev,
-       u64 id, sector_t sector, req_validator_fn validator,
-       const char *func, enum drbd_req_event what)
+ static int
+ validate_req_change_req_state(struct drbd_conf *mdev, u64 id, sector_t sector,
+                             struct rb_root *root, const char *func,
+                             enum drbd_req_event what, bool missing_ok)
  {
        struct drbd_request *req;
        struct bio_and_error m;
  
-       spin_lock_irq(&mdev->req_lock);
-       req = validator(mdev, id, sector);
+       spin_lock_irq(&mdev->tconn->req_lock);
+       req = find_request(mdev, root, id, sector, missing_ok, func);
        if (unlikely(!req)) {
-               spin_unlock_irq(&mdev->req_lock);
-               dev_err(DEV, "%s: failed to find req %p, sector %llus\n", func,
-                       (void *)(unsigned long)id, (unsigned long long)sector);
-               return false;
+               spin_unlock_irq(&mdev->tconn->req_lock);
+               return -EIO;
        }
        __req_mod(req, what, &m);
-       spin_unlock_irq(&mdev->req_lock);
+       spin_unlock_irq(&mdev->tconn->req_lock);
  
        if (m.bio)
                complete_master_bio(mdev, &m);
-       return true;
+       return 0;
  }
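
/* The missing_ok argument decides whether a failed lookup is an error:
 * got_NegAck() below passes true, because under protocols A and B the
 * request may legitimately be gone already, and falls back to marking
 * the blocks out of sync instead. */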
  
- static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
+ static int got_BlockAck(struct drbd_tconn *tconn, struct packet_info *pi)
  {
-       struct p_block_ack *p = (struct p_block_ack *)h;
+       struct drbd_conf *mdev;
+       struct p_block_ack *p = pi->data;
        sector_t sector = be64_to_cpu(p->sector);
        int blksize = be32_to_cpu(p->blksize);
        enum drbd_req_event what;
  
+       mdev = vnr_to_mdev(tconn, pi->vnr);
+       if (!mdev)
+               return -EIO;
+
        update_peer_seq(mdev, be32_to_cpu(p->seq_num));
  
-       if (is_syncer_block_id(p->block_id)) {
+       if (p->block_id == ID_SYNCER) {
                drbd_set_in_sync(mdev, sector, blksize);
                dec_rs_pending(mdev);
-               return true;
+               return 0;
        }
-       switch (be16_to_cpu(h->command)) {
+       switch (pi->cmd) {
        case P_RS_WRITE_ACK:
-               D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
-               what = write_acked_by_peer_and_sis;
+               what = WRITE_ACKED_BY_PEER_AND_SIS;
                break;
        case P_WRITE_ACK:
-               D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
-               what = write_acked_by_peer;
+               what = WRITE_ACKED_BY_PEER;
                break;
        case P_RECV_ACK:
-               D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B);
-               what = recv_acked_by_peer;
+               what = RECV_ACKED_BY_PEER;
+               break;
+       case P_SUPERSEDED:
+               what = CONFLICT_RESOLVED;
                break;
-       case P_DISCARD_ACK:
-               D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
-               what = conflict_discarded_by_peer;
+       case P_RETRY_WRITE:
+               what = POSTPONE_WRITE;
                break;
        default:
-               D_ASSERT(0);
-               return false;
+               BUG();
        }
  
        return validate_req_change_req_state(mdev, p->block_id, sector,
-               _ack_id_to_req, __func__ , what);
+                                            &mdev->write_requests, __func__,
+                                            what, false);
  }
  
- static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h)
+ static int got_NegAck(struct drbd_tconn *tconn, struct packet_info *pi)
  {
-       struct p_block_ack *p = (struct p_block_ack *)h;
+       struct drbd_conf *mdev;
+       struct p_block_ack *p = pi->data;
        sector_t sector = be64_to_cpu(p->sector);
        int size = be32_to_cpu(p->blksize);
-       struct drbd_request *req;
-       struct bio_and_error m;
+       int err;
+
+       mdev = vnr_to_mdev(tconn, pi->vnr);
+       if (!mdev)
+               return -EIO;
  
        update_peer_seq(mdev, be32_to_cpu(p->seq_num));
  
-       if (is_syncer_block_id(p->block_id)) {
+       if (p->block_id == ID_SYNCER) {
                dec_rs_pending(mdev);
                drbd_rs_failed_io(mdev, sector, size);
-               return true;
+               return 0;
        }
  
-       spin_lock_irq(&mdev->req_lock);
-       req = _ack_id_to_req(mdev, p->block_id, sector);
-       if (!req) {
-               spin_unlock_irq(&mdev->req_lock);
-               if (mdev->net_conf->wire_protocol == DRBD_PROT_A ||
-                   mdev->net_conf->wire_protocol == DRBD_PROT_B) {
-                       /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
-                          The master bio might already be completed, therefore the
-                          request is no longer in the collision hash.
-                          => Do not try to validate block_id as request. */
-                       /* In Protocol B we might already have got a P_RECV_ACK
-                          but then get a P_NEG_ACK after wards. */
-                       drbd_set_out_of_sync(mdev, sector, size);
-                       return true;
-               } else {
-                       dev_err(DEV, "%s: failed to find req %p, sector %llus\n", __func__,
-                               (void *)(unsigned long)p->block_id, (unsigned long long)sector);
-                       return false;
-               }
+       err = validate_req_change_req_state(mdev, p->block_id, sector,
+                                           &mdev->write_requests, __func__,
+                                           NEG_ACKED, true);
+       if (err) {
+               /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
+                  The master bio might already be completed, therefore the
+                  request is no longer in the collision hash. */
+               /* In Protocol B we might already have got a P_RECV_ACK
+                  but then get a P_NEG_ACK afterwards. */
+               drbd_set_out_of_sync(mdev, sector, size);
        }
-       __req_mod(req, neg_acked, &m);
-       spin_unlock_irq(&mdev->req_lock);
-       if (m.bio)
-               complete_master_bio(mdev, &m);
-       return true;
+       return 0;
  }
  
- static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h)
+ static int got_NegDReply(struct drbd_tconn *tconn, struct packet_info *pi)
  {
-       struct p_block_ack *p = (struct p_block_ack *)h;
+       struct drbd_conf *mdev;
+       struct p_block_ack *p = pi->data;
        sector_t sector = be64_to_cpu(p->sector);
  
+       mdev = vnr_to_mdev(tconn, pi->vnr);
+       if (!mdev)
+               return -EIO;
+
        update_peer_seq(mdev, be32_to_cpu(p->seq_num));
-       dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n",
+       dev_err(DEV, "Got NegDReply; Sector %llus, len %u.\n",
            (unsigned long long)sector, be32_to_cpu(p->blksize));
  
        return validate_req_change_req_state(mdev, p->block_id, sector,
-               _ar_id_to_req, __func__ , neg_acked);
+                                            &mdev->read_requests, __func__,
+                                            NEG_ACKED, false);
  }
  
- static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h)
+ static int got_NegRSDReply(struct drbd_tconn *tconn, struct packet_info *pi)
  {
+       struct drbd_conf *mdev;
        sector_t sector;
        int size;
-       struct p_block_ack *p = (struct p_block_ack *)h;
+       struct p_block_ack *p = pi->data;
+
+       mdev = vnr_to_mdev(tconn, pi->vnr);
+       if (!mdev)
+               return -EIO;
  
        sector = be64_to_cpu(p->sector);
        size = be32_to_cpu(p->blksize);
  
        if (get_ldev_if_state(mdev, D_FAILED)) {
                drbd_rs_complete_io(mdev, sector);
-               switch (be16_to_cpu(h->command)) {
+               switch (pi->cmd) {
                case P_NEG_RS_DREPLY:
                        drbd_rs_failed_io(mdev, sector, size);
                        /* fall through */
                case P_RS_CANCEL:
                        break;
                default:
-                       D_ASSERT(0);
-                       put_ldev(mdev);
-                       return false;
+                       BUG();
                }
                put_ldev(mdev);
        }
  
-       return true;
+       return 0;
  }
  
- static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
+ static int got_BarrierAck(struct drbd_tconn *tconn, struct packet_info *pi)
  {
-       struct p_barrier_ack *p = (struct p_barrier_ack *)h;
-       tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
-       if (mdev->state.conn == C_AHEAD &&
-           atomic_read(&mdev->ap_in_flight) == 0 &&
-           !drbd_test_and_set_flag(mdev, AHEAD_TO_SYNC_SOURCE)) {
-               mdev->start_resync_timer.expires = jiffies + HZ;
-               add_timer(&mdev->start_resync_timer);
+       struct p_barrier_ack *p = pi->data;
+       struct drbd_conf *mdev;
+       int vnr;
+
+       tl_release(tconn, p->barrier, be32_to_cpu(p->set_size));
+
+       rcu_read_lock();
+       idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+               if (mdev->state.conn == C_AHEAD &&
+                   atomic_read(&mdev->ap_in_flight) == 0 &&
+                   !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) {
+                       mdev->start_resync_timer.expires = jiffies + HZ;
+                       add_timer(&mdev->start_resync_timer);
+               }
        }
+       rcu_read_unlock();
  
-       return true;
+       return 0;
  }
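
/* The transfer log is kept per connection now, so a single P_BARRIER_ACK
 * covers all volumes; hence the idr_for_each_entry() above, which arms the
 * resync timer on every volume that went Ahead due to congestion and has
 * no application I/O left in flight. */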
  
- static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
+ static int got_OVResult(struct drbd_tconn *tconn, struct packet_info *pi)
  {
-       struct p_block_ack *p = (struct p_block_ack *)h;
+       struct drbd_conf *mdev;
+       struct p_block_ack *p = pi->data;
        struct drbd_work *w;
        sector_t sector;
        int size;
  
+       mdev = vnr_to_mdev(tconn, pi->vnr);
+       if (!mdev)
+               return -EIO;
+
        sector = be64_to_cpu(p->sector);
        size = be32_to_cpu(p->blksize);
  
        update_peer_seq(mdev, be32_to_cpu(p->seq_num));
  
        if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
-               drbd_ov_oos_found(mdev, sector, size);
+               drbd_ov_out_of_sync_found(mdev, sector, size);
        else
-               ov_oos_print(mdev);
+               ov_out_of_sync_print(mdev);
  
        if (!get_ldev(mdev))
-               return true;
+               return 0;
  
        drbd_rs_complete_io(mdev, sector);
        dec_rs_pending(mdev);
                w = kmalloc(sizeof(*w), GFP_NOIO);
                if (w) {
                        w->cb = w_ov_finished;
-                       drbd_queue_work_front(&mdev->data.work, w);
+                       w->mdev = mdev;
+                       drbd_queue_work(&mdev->tconn->sender_work, w);
                } else {
                        dev_err(DEV, "kmalloc(w) failed.");
-                       ov_oos_print(mdev);
+                       ov_out_of_sync_print(mdev);
                        drbd_resync_finished(mdev);
                }
        }
        put_ldev(mdev);
-       return true;
+       return 0;
+ }
+
+ static int got_skip(struct drbd_tconn *tconn, struct packet_info *pi)
+ {
+       return 0;
  }
  
- static int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
+ static int tconn_finish_peer_reqs(struct drbd_tconn *tconn)
  {
-       return true;
+       struct drbd_conf *mdev;
+       int vnr, not_empty = 0;
+
+       do {
+               clear_bit(SIGNAL_ASENDER, &tconn->flags);
+               flush_signals(current);
+               rcu_read_lock();
+               idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+                       kref_get(&mdev->kref);
+                       rcu_read_unlock();
+                       if (drbd_finish_peer_reqs(mdev)) {
+                               kref_put(&mdev->kref, &drbd_minor_destroy);
+                               return 1;
+                       }
+                       kref_put(&mdev->kref, &drbd_minor_destroy);
+                       rcu_read_lock();
+               }
+               set_bit(SIGNAL_ASENDER, &tconn->flags);
+               spin_lock_irq(&tconn->req_lock);
+               idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+                       not_empty = !list_empty(&mdev->done_ee);
+                       if (not_empty)
+                               break;
+               }
+               spin_unlock_irq(&tconn->req_lock);
+               rcu_read_unlock();
+       } while (not_empty);
+       return 0;
  }
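
The loop above pins each volume with kref_get() so it can drop the RCU read side and sleep in drbd_finish_peer_reqs(), then re-enters the read side before the iterator advances. A minimal userspace model of that pin, unlock, work, relock pattern, with a rwlock standing in for RCU and a C11 atomic for the kref (all names invented for the sketch):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct volume { atomic_int ref; int id; };

static pthread_rwlock_t vol_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct volume volumes[2] = { { 1, 0 }, { 1, 1 } };

static void vol_put(struct volume *v)
{
        if (atomic_fetch_sub(&v->ref, 1) == 1)
                printf("last reference: destroy volume %d\n", v->id);
}

static void finish_peer_reqs(struct volume *v)  /* may sleep */
{
        printf("processing volume %d\n", v->id);
}

int main(void)
{
        pthread_rwlock_rdlock(&vol_lock);               /* rcu_read_lock() */
        for (int i = 0; i < 2; i++) {
                struct volume *v = &volumes[i];

                atomic_fetch_add(&v->ref, 1);           /* kref_get() */
                pthread_rwlock_unlock(&vol_lock);       /* rcu_read_unlock() */
                finish_peer_reqs(v);                    /* blocking work */
                vol_put(v);                             /* kref_put() */
                pthread_rwlock_rdlock(&vol_lock);       /* rcu_read_lock() */
        }
        pthread_rwlock_unlock(&vol_lock);
        return 0;
}
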
  
  struct asender_cmd {
        size_t pkt_size;
-       int (*process)(struct drbd_conf *mdev, struct p_header80 *h);
+       int (*fn)(struct drbd_tconn *tconn, struct packet_info *);
  };
  
- static struct asender_cmd *get_asender_cmd(int cmd)
- {
-       static struct asender_cmd asender_tbl[] = {
-               /* anything missing from this table is in
-                * the drbd_cmd_handler (drbd_default_handler) table,
-                * see the beginning of drbdd() */
-       [P_PING]            = { sizeof(struct p_header80), got_Ping },
-       [P_PING_ACK]        = { sizeof(struct p_header80), got_PingAck },
+ static struct asender_cmd asender_tbl[] = {
+       [P_PING]            = { 0, got_Ping },
+       [P_PING_ACK]        = { 0, got_PingAck },
        [P_RECV_ACK]        = { sizeof(struct p_block_ack), got_BlockAck },
        [P_WRITE_ACK]       = { sizeof(struct p_block_ack), got_BlockAck },
        [P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
-       [P_DISCARD_ACK]     = { sizeof(struct p_block_ack), got_BlockAck },
+       [P_SUPERSEDED]      = { sizeof(struct p_block_ack), got_BlockAck },
        [P_NEG_ACK]         = { sizeof(struct p_block_ack), got_NegAck },
        [P_NEG_DREPLY]      = { sizeof(struct p_block_ack), got_NegDReply },
-       [P_NEG_RS_DREPLY]   = { sizeof(struct p_block_ack), got_NegRSDReply},
+       [P_NEG_RS_DREPLY]   = { sizeof(struct p_block_ack), got_NegRSDReply },
        [P_OV_RESULT]       = { sizeof(struct p_block_ack), got_OVResult },
        [P_BARRIER_ACK]     = { sizeof(struct p_barrier_ack), got_BarrierAck },
        [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
        [P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
        [P_DELAY_PROBE]     = { sizeof(struct p_delay_probe93), got_skip },
-       [P_RS_CANCEL]       = { sizeof(struct p_block_ack), got_NegRSDReply},
-       [P_MAX_CMD]         = { 0, NULL },
-       };
-       if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
-               return NULL;
-       return &asender_tbl[cmd];
- }
+       [P_RS_CANCEL]       = { sizeof(struct p_block_ack), got_NegRSDReply },
+       [P_CONN_ST_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_conn_RqSReply },
+       [P_RETRY_WRITE]     = { sizeof(struct p_block_ack), got_BlockAck },
+ };
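
drbd_asender() below dispatches by indexing this table with the packet code; designated initializers zero-fill every unlisted slot, so the "pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn" test rejects anything that does not belong on the meta socket. A standalone sketch of the idiom (command values and handlers invented):

#include <stddef.h>
#include <stdio.h>

enum toy_pkt { T_PING, T_PING_ACK, T_DATA };    /* T_DATA: not a meta packet */

struct toy_cmd {
        size_t pkt_size;
        int (*fn)(unsigned cmd);
};

static int toy_got_ping(unsigned cmd)     { printf("got 0x%x: ping\n", cmd); return 0; }
static int toy_got_ping_ack(unsigned cmd) { printf("got 0x%x: ping ack\n", cmd); return 0; }

/* The table only spans the commands it names; anything past the end is
 * caught by the bounds check, and a gap inside the range would be
 * zero-filled, leaving fn NULL. */
static const struct toy_cmd tbl[] = {
        [T_PING]     = { 0, toy_got_ping },
        [T_PING_ACK] = { 0, toy_got_ping_ack },
};

static int dispatch(unsigned cmd)
{
        if (cmd >= sizeof(tbl) / sizeof(tbl[0]) || !tbl[cmd].fn)
                return -1;      /* unexpected packet: caller disconnects */
        return tbl[cmd].fn(cmd);
}

int main(void)
{
        dispatch(T_PING);
        return dispatch(T_DATA) == -1 ? 0 : 1;  /* T_DATA must be rejected */
}
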
  
  int drbd_asender(struct drbd_thread *thi)
  {
-       struct drbd_conf *mdev = thi->mdev;
-       struct p_header80 *h = &mdev->meta.rbuf.header.h80;
+       struct drbd_tconn *tconn = thi->tconn;
        struct asender_cmd *cmd = NULL;
-       int rv, len;
-       void *buf    = h;
+       struct packet_info pi;
+       int rv;
+       void *buf    = tconn->meta.rbuf;
        int received = 0;
-       int expect   = sizeof(struct p_header80);
-       int empty;
-       int ping_timeout_active = 0;
-       sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
+       unsigned int header_size = drbd_header_size(tconn);
+       int expect   = header_size;
+       bool ping_timeout_active = false;
+       struct net_conf *nc;
+       int ping_timeo, tcp_cork, ping_int;
  
        current->policy = SCHED_RR;  /* Make this a realtime task! */
        current->rt_priority = 2;    /* more important than all other tasks */
  
-       while (get_t_state(thi) == Running) {
-               drbd_thread_current_set_cpu(mdev);
-               if (drbd_test_and_clear_flag(mdev, SEND_PING)) {
-                       ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
-                       mdev->meta.socket->sk->sk_rcvtimeo =
-                               mdev->net_conf->ping_timeo*HZ/10;
-                       ping_timeout_active = 1;
-               }
+       while (get_t_state(thi) == RUNNING) {
+               drbd_thread_current_set_cpu(thi);
  
-               /* conditionally cork;
-                * it may hurt latency if we cork without much to send */
-               if (!mdev->net_conf->no_cork &&
-                       3 < atomic_read(&mdev->unacked_cnt))
-                       drbd_tcp_cork(mdev->meta.socket);
-               while (1) {
-                       drbd_clear_flag(mdev, SIGNAL_ASENDER);
-                       flush_signals(current);
-                       if (!drbd_process_done_ee(mdev))
+               rcu_read_lock();
+               nc = rcu_dereference(tconn->net_conf);
+               ping_timeo = nc->ping_timeo;
+               tcp_cork = nc->tcp_cork;
+               ping_int = nc->ping_int;
+               rcu_read_unlock();
+               if (test_and_clear_bit(SEND_PING, &tconn->flags)) {
+                       if (drbd_send_ping(tconn)) {
+                               conn_err(tconn, "drbd_send_ping has failed\n");
                                goto reconnect;
-                       /* to avoid race with newly queued ACKs */
-                       drbd_set_flag(mdev, SIGNAL_ASENDER);
-                       spin_lock_irq(&mdev->req_lock);
-                       empty = list_empty(&mdev->done_ee);
-                       spin_unlock_irq(&mdev->req_lock);
-                       /* new ack may have been queued right here,
-                        * but then there is also a signal pending,
-                        * and we start over... */
-                       if (empty)
-                               break;
+                       }
+                       tconn->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10;
+                       ping_timeout_active = true;
+               }
+               /* TODO: conditionally cork; it may hurt latency if we cork without
+                  much to send */
+               if (tcp_cork)
+                       drbd_tcp_cork(tconn->meta.socket);
+               if (tconn_finish_peer_reqs(tconn)) {
+                       conn_err(tconn, "tconn_finish_peer_reqs() failed\n");
+                       goto reconnect;
                }
                /* but unconditionally uncork unless disabled */
-               if (!mdev->net_conf->no_cork)
-                       drbd_tcp_uncork(mdev->meta.socket);
+               if (tcp_cork)
+                       drbd_tcp_uncork(tconn->meta.socket);
  
                /* short circuit, recv_msg would return EINTR anyways. */
                if (signal_pending(current))
                        continue;
  
-               rv = drbd_recv_short(mdev, mdev->meta.socket,
-                                    buf, expect-received, 0);
-               drbd_clear_flag(mdev, SIGNAL_ASENDER);
+               rv = drbd_recv_short(tconn->meta.socket, buf, expect-received, 0);
+               clear_bit(SIGNAL_ASENDER, &tconn->flags);
  
                flush_signals(current);
  
                        received += rv;
                        buf      += rv;
                } else if (rv == 0) {
-                       if (drbd_test_flag(mdev, DISCONNECT_SENT)) {
-                               long t; /* time_left */
-                               t = wait_event_timeout(mdev->state_wait, mdev->state.conn < C_CONNECTED,
-                                                      mdev->net_conf->ping_timeo * HZ/10);
+                       if (test_bit(DISCONNECT_SENT, &tconn->flags)) {
+                               long t;
+                               rcu_read_lock();
+                               t = rcu_dereference(tconn->net_conf)->ping_timeo * HZ/10;
+                               rcu_read_unlock();
+                               t = wait_event_timeout(tconn->ping_wait,
+                                                      tconn->cstate < C_WF_REPORT_PARAMS,
+                                                      t);
                                if (t)
                                        break;
                        }
-                       dev_err(DEV, "meta connection shut down by peer.\n");
+                       conn_err(tconn, "meta connection shut down by peer.\n");
                        goto reconnect;
                } else if (rv == -EAGAIN) {
                        /* If the data socket received something meanwhile,
                         * that is good enough: peer is still alive. */
-                       if (time_after(mdev->last_received,
-                               jiffies - mdev->meta.socket->sk->sk_rcvtimeo))
+                       if (time_after(tconn->last_received,
+                               jiffies - tconn->meta.socket->sk->sk_rcvtimeo))
                                continue;
                        if (ping_timeout_active) {
-                               dev_err(DEV, "PingAck did not arrive in time.\n");
+                               conn_err(tconn, "PingAck did not arrive in time.\n");
                                goto reconnect;
                        }
-                       drbd_set_flag(mdev, SEND_PING);
+                       set_bit(SEND_PING, &tconn->flags);
                        continue;
                } else if (rv == -EINTR) {
                        continue;
                } else {
-                       dev_err(DEV, "sock_recvmsg returned %d\n", rv);
+                       conn_err(tconn, "sock_recvmsg returned %d\n", rv);
                        goto reconnect;
                }
  
                if (received == expect && cmd == NULL) {
-                       if (unlikely(h->magic != BE_DRBD_MAGIC)) {
-                               dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n",
-                                   be32_to_cpu(h->magic),
-                                   be16_to_cpu(h->command),
-                                   be16_to_cpu(h->length));
+                       if (decode_header(tconn, tconn->meta.rbuf, &pi))
                                goto reconnect;
-                       }
-                       cmd = get_asender_cmd(be16_to_cpu(h->command));
-                       len = be16_to_cpu(h->length);
-                       if (unlikely(cmd == NULL)) {
-                               dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n",
-                                   be32_to_cpu(h->magic),
-                                   be16_to_cpu(h->command),
-                                   be16_to_cpu(h->length));
+                       cmd = &asender_tbl[pi.cmd];
+                       if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) {
+                               conn_err(tconn, "Unexpected meta packet %s (0x%04x)\n",
+                                        cmdname(pi.cmd), pi.cmd);
                                goto disconnect;
                        }
-                       expect = cmd->pkt_size;
-                       ERR_IF(len != expect-sizeof(struct p_header80))
+                       expect = header_size + cmd->pkt_size;
+                       if (pi.size != expect - header_size) {
+                               conn_err(tconn, "Wrong packet size on meta (c: %d, l: %d)\n",
+                                       pi.cmd, pi.size);
                                goto reconnect;
+                       }
                }
                if (received == expect) {
-                       mdev->last_received = jiffies;
-                       D_ASSERT(cmd != NULL);
-                       if (!cmd->process(mdev, h))
+                       bool err;
+
+                       err = cmd->fn(tconn, &pi);
+                       if (err) {
+                               conn_err(tconn, "%pf failed\n", cmd->fn);
                                goto reconnect;
+                       }
  
-                       /* the idle_timeout (ping-int)
-                        * has been restored in got_PingAck() */
-                       if (cmd == get_asender_cmd(P_PING_ACK))
-                               ping_timeout_active = 0;
+                       tconn->last_received = jiffies;
  
-                       buf      = h;
+                       if (cmd == &asender_tbl[P_PING_ACK]) {
+                               /* restore idle timeout */
+                               tconn->meta.socket->sk->sk_rcvtimeo = ping_int * HZ;
+                               ping_timeout_active = false;
+                       }
+                       buf      = tconn->meta.rbuf;
                        received = 0;
-                       expect   = sizeof(struct p_header80);
+                       expect   = header_size;
                        cmd      = NULL;
                }
        }
  
        if (0) {
  reconnect:
-               drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
-               drbd_md_sync(mdev);
+               conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+               conn_md_sync(tconn);
        }
        if (0) {
  disconnect:
-               drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-               drbd_md_sync(mdev);
+               conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
        }
-       drbd_clear_flag(mdev, SIGNAL_ASENDER);
+       clear_bit(SIGNAL_ASENDER, &tconn->flags);
  
-       D_ASSERT(mdev->state.conn < C_CONNECTED);
-       dev_info(DEV, "asender terminated\n");
+       conn_info(tconn, "asender terminated\n");
  
        return 0;
  }
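
/* Receive state machine of drbd_asender() above: read until a full header
 * (header_size bytes) has arrived, decode it, grow expect by the command's
 * fixed payload size from asender_tbl, read the remainder, dispatch through
 * cmd->fn(), then reset buf/received/expect for the next packet. */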
index 135ea76ed502f720a2e742ad54dd84927a8cfa00,b905a0453bf958df6ed539d63433d7427971c4cd..f58a4a4b4dfb3d1042113bbcbf34ee7c8280b7e8
@@@ -31,6 -31,8 +31,8 @@@
  #include "drbd_req.h"
  
  
+ static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size);
+
  /* Update disk stats at start of I/O request */
  static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
  {
@@@ -40,6 -42,8 +42,8 @@@
        part_round_stats(cpu, &mdev->vdisk->part0);
        part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
        part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
+       (void) cpu; /* The macro invocations above want the cpu argument; this
+                      silences the "set but not used" compiler warning for it. */
        part_inc_in_flight(&mdev->vdisk->part0, rw);
        part_stat_unlock();
  }
@@@ -57,9 -61,51 +61,51 @@@ static void _drbd_end_io_acct(struct dr
        part_stat_unlock();
  }
  
- static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
+ static struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
+                                              struct bio *bio_src)
+ {
+       struct drbd_request *req;
+       req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
+       if (!req)
+               return NULL;
+       drbd_req_make_private_bio(req, bio_src);
+       req->rq_state    = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
+       req->w.mdev      = mdev;
+       req->master_bio  = bio_src;
+       req->epoch       = 0;
+       drbd_clear_interval(&req->i);
+       req->i.sector     = bio_src->bi_sector;
+       req->i.size      = bio_src->bi_size;
+       req->i.local = true;
+       req->i.waiting = false;
+       INIT_LIST_HEAD(&req->tl_requests);
+       INIT_LIST_HEAD(&req->w.list);
+       /* one reference to be put by __drbd_make_request */
+       atomic_set(&req->completion_ref, 1);
+       /* one kref as long as completion_ref > 0 */
+       kref_init(&req->kref);
+       return req;
+ }
+
+ void drbd_req_destroy(struct kref *kref)
  {
-       const unsigned long s = req->rq_state;
+       struct drbd_request *req = container_of(kref, struct drbd_request, kref);
+       struct drbd_conf *mdev = req->w.mdev;
+       const unsigned s = req->rq_state;
+
+       if ((req->master_bio && !(s & RQ_POSTPONED)) ||
+               atomic_read(&req->completion_ref) ||
+               (s & RQ_LOCAL_PENDING) ||
+               ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) {
+               dev_err(DEV, "drbd_req_destroy: Logic BUG rq_state = 0x%x, completion_ref = %d\n",
+                               s, atomic_read(&req->completion_ref));
+               return;
+       }
  
        /* remove it from the transfer log.
         * well, only if it had been there in the first
         * place. If it was not (local only or conflicting
         * and never sent), it should still be "empty" as
         * initialized in drbd_req_new(), so we can list_del() it
         * here unconditionally */
-       list_del(&req->tl_requests);
+       list_del_init(&req->tl_requests);
  
        /* if it was a write, we may have to set the corresponding
         * bit(s) out-of-sync first. If it had a local part, we need to
         * release the reference to the activity log. */
-       if (rw == WRITE) {
+       if (s & RQ_WRITE) {
                /* Set out-of-sync unless both OK flags are set
                 * (local only or remote failed).
                 * Other places where we set out-of-sync:
                 * READ with local io-error */
-               if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
-                       drbd_set_out_of_sync(mdev, req->sector, req->size);
  
-               if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
-                       drbd_set_in_sync(mdev, req->sector, req->size);
+               /* There is a special case:
+                * we may notice late that IO was suspended,
+                * and postpone, or schedule for retry, a write,
+                * before it even was submitted or sent.
+                * In that case we do not want to touch the bitmap at all.
+                */
+               if ((s & (RQ_POSTPONED|RQ_LOCAL_MASK|RQ_NET_MASK)) != RQ_POSTPONED) {
+                       if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
+                               drbd_set_out_of_sync(mdev, req->i.sector, req->i.size);
+                       if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
+                               drbd_set_in_sync(mdev, req->i.sector, req->i.size);
+               }
  
                /* one might be tempted to move the drbd_al_complete_io
-                * to the local io completion callback drbd_endio_pri.
+                * to the local io completion callback drbd_request_endio.
                 * but, if this was a mirror write, we may only
                 * drbd_al_complete_io after this is RQ_NET_DONE,
                 * otherwise the extent could be dropped from the al
                 * but after the extent has been dropped from the al,
                 * we would forget to resync the corresponding extent.
                 */
-               if (s & RQ_LOCAL_MASK) {
+               if (s & RQ_IN_ACT_LOG) {
                        if (get_ldev_if_state(mdev, D_FAILED)) {
-                               if (s & RQ_IN_ACT_LOG)
-                                       drbd_al_complete_io(mdev, req->sector);
+                               drbd_al_complete_io(mdev, &req->i);
                                put_ldev(mdev);
                        } else if (__ratelimit(&drbd_ratelimit_state)) {
-                               dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), "
-                                    "but my Disk seems to have failed :(\n",
-                                    (unsigned long long) req->sector);
+                               dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu, %u), "
+                                        "but my Disk seems to have failed :(\n",
+                                        (unsigned long long) req->i.sector, req->i.size);
                        }
                }
        }
  
-       drbd_req_free(req);
+       mempool_free(req, drbd_request_mempool);
  }
  
- static void queue_barrier(struct drbd_conf *mdev)
- {
-       struct drbd_tl_epoch *b;
-       /* We are within the req_lock. Once we queued the barrier for sending,
-        * we set the CREATE_BARRIER bit. It is cleared as soon as a new
-        * barrier/epoch object is added. This is the only place this bit is
-        * set. It indicates that the barrier for this epoch is already queued,
-        * and no new epoch has been created yet. */
-       if (drbd_test_flag(mdev, CREATE_BARRIER))
-               return;
-       b = mdev->newest_tle;
-       b->w.cb = w_send_barrier;
-       /* inc_ap_pending done here, so we won't
-        * get imbalanced on connection loss.
-        * dec_ap_pending will be done in got_BarrierAck
-        * or (on connection loss) in tl_clear.  */
-       inc_ap_pending(mdev);
-       drbd_queue_work(&mdev->data.work, &b->w);
-       drbd_set_flag(mdev, CREATE_BARRIER);
+ static void wake_all_senders(struct drbd_tconn *tconn) {
+       wake_up(&tconn->sender_work.q_wait);
  }
  
- static void _about_to_complete_local_write(struct drbd_conf *mdev,
-       struct drbd_request *req)
+ /* must hold resource->req_lock */
+ static void start_new_tl_epoch(struct drbd_tconn *tconn)
  {
-       const unsigned long s = req->rq_state;
-       struct drbd_request *i;
-       struct drbd_epoch_entry *e;
-       struct hlist_node *n;
-       struct hlist_head *slot;
-       /* Before we can signal completion to the upper layers,
-        * we may need to close the current epoch.
-        * We can skip this, if this request has not even been sent, because we
-        * did not have a fully established connection yet/anymore, during
-        * bitmap exchange, or while we are C_AHEAD due to congestion policy.
-        */
-       if (mdev->state.conn >= C_CONNECTED &&
-           (s & RQ_NET_SENT) != 0 &&
-           req->epoch == mdev->newest_tle->br_number)
-               queue_barrier(mdev);
-       /* we need to do the conflict detection stuff,
-        * if we have the ee_hash (two_primaries) and
-        * this has been on the network */
-       if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) {
-               const sector_t sector = req->sector;
-               const int size = req->size;
-               /* ASSERT:
-                * there must be no conflicting requests, since
-                * they must have been failed on the spot */
- #define OVERLAPS overlaps(sector, size, i->sector, i->size)
-               slot = tl_hash_slot(mdev, sector);
-               hlist_for_each_entry(i, n, slot, collision) {
-                       if (OVERLAPS) {
-                               dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; "
-                                     "other: %p %llus +%u\n",
-                                     req, (unsigned long long)sector, size,
-                                     i, (unsigned long long)i->sector, i->size);
-                       }
-               }
+       /* no point closing an epoch if it is empty anyway. */
+       if (tconn->current_tle_writes == 0)
+               return;
  
-               /* maybe "wake" those conflicting epoch entries
-                * that wait for this request to finish.
-                *
-                * currently, there can be only _one_ such ee
-                * (well, or some more, which would be pending
-                * P_DISCARD_ACK not yet sent by the asender...),
-                * since we block the receiver thread upon the
-                * first conflict detection, which will wait on
-                * misc_wait.  maybe we want to assert that?
-                *
-                * anyways, if we found one,
-                * we just have to do a wake_up.  */
- #undef OVERLAPS
- #define OVERLAPS overlaps(sector, size, e->sector, e->size)
-               slot = ee_hash_slot(mdev, req->sector);
-               hlist_for_each_entry(e, n, slot, collision) {
-                       if (OVERLAPS) {
-                               wake_up(&mdev->misc_wait);
-                               break;
-                       }
-               }
-       }
- #undef OVERLAPS
+       tconn->current_tle_writes = 0;
+       atomic_inc(&tconn->current_tle_nr);
+       wake_all_senders(tconn);
  }
  
  void complete_master_bio(struct drbd_conf *mdev,
        dec_ap_bio(mdev);
  }
  
+ static void drbd_remove_request_interval(struct rb_root *root,
+                                        struct drbd_request *req)
+ {
+       struct drbd_conf *mdev = req->w.mdev;
+       struct drbd_interval *i = &req->i;
+
+       drbd_remove_interval(root, i);
+
+       /* Wake up any processes waiting for this request to complete.  */
+       if (i->waiting)
+               wake_up(&mdev->misc_wait);
+ }
+
  /* Helper for __req_mod().
   * Set m->bio to the master bio, if it is fit to be completed,
   * or leave it alone (it is initialized to NULL in __req_mod),
   * if it has already been completed, or cannot be completed yet.
   * If m->bio is set, the error status to be returned is placed in m->error.
   */
- void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
+ static
+ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
  {
-       const unsigned long s = req->rq_state;
-       struct drbd_conf *mdev = req->mdev;
-       int rw = req->rq_state & RQ_WRITE ? WRITE : READ;
+       const unsigned s = req->rq_state;
+       struct drbd_conf *mdev = req->w.mdev;
+       int rw;
+       int error, ok;
  
        /* we must not complete the master bio, while it is
         *      still being processed by _drbd_send_zc_bio (drbd_send_dblock)
         *      the receiver,
         *      the bio_endio completion callbacks.
         */
-       if (s & RQ_NET_QUEUED)
-               return;
-       if (s & RQ_NET_PENDING)
+       if ((s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) ||
+           (s & RQ_NET_QUEUED) || (s & RQ_NET_PENDING) ||
+           (s & RQ_COMPLETION_SUSP)) {
+               dev_err(DEV, "drbd_req_complete: Logic BUG rq_state = 0x%x\n", s);
                return;
-       if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED))
+       }
+       if (!req->master_bio) {
+               dev_err(DEV, "drbd_req_complete: Logic BUG, master_bio == NULL!\n");
                return;
+       }
  
-       if (req->master_bio) {
-               /* this is data_received (remote read)
-                * or protocol C P_WRITE_ACK
-                * or protocol B P_RECV_ACK
-                * or protocol A "handed_over_to_network" (SendAck)
-                * or canceled or failed,
-                * or killed from the transfer log due to connection loss.
-                */
+       rw = bio_rw(req->master_bio);
  
-               /*
-                * figure out whether to report success or failure.
-                *
-                * report success when at least one of the operations succeeded.
-                * or, to put the other way,
-                * only report failure, when both operations failed.
-                *
-                * what to do about the failures is handled elsewhere.
-                * what we need to do here is just: complete the master_bio.
-                *
-                * local completion error, if any, has been stored as ERR_PTR
-                * in private_bio within drbd_endio_pri.
-                */
-               int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
-               int error = PTR_ERR(req->private_bio);
+       /*
+        * figure out whether to report success or failure.
+        *
+        * report success when at least one of the operations succeeded.
+        * or, to put it the other way,
+        * only report failure, when both operations failed.
+        *
+        * what to do about the failures is handled elsewhere.
+        * what we need to do here is just: complete the master_bio.
+        *
+        * local completion error, if any, has been stored as ERR_PTR
+        * in private_bio within drbd_request_endio.
+        */
+       ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
+       error = PTR_ERR(req->private_bio);
  
-               /* remove the request from the conflict detection
-                * respective block_id verification hash */
-               if (!hlist_unhashed(&req->collision))
-                       hlist_del(&req->collision);
-               else
-                       D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
+       /* remove the request from the conflict detection
+        * respective block_id verification hash */
+       if (!drbd_interval_empty(&req->i)) {
+               struct rb_root *root;
  
-               /* for writes we need to do some extra housekeeping */
                if (rw == WRITE)
-                       _about_to_complete_local_write(mdev, req);
+                       root = &mdev->write_requests;
+               else
+                       root = &mdev->read_requests;
+               drbd_remove_request_interval(root, req);
+       } else if (!(s & RQ_POSTPONED))
+               D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
  
-               /* Update disk stats */
-               _drbd_end_io_acct(mdev, req);
+       /* Before we can signal completion to the upper layers,
+        * we may need to close the current transfer log epoch.
+        * We are within the request lock, so we can simply compare
+        * the request epoch number with the current transfer log
+        * epoch number.  If they match, increase the current_tle_nr,
+        * and reset the transfer log epoch write_cnt.
+        */
+       if (rw == WRITE &&
+           req->epoch == atomic_read(&mdev->tconn->current_tle_nr))
+               start_new_tl_epoch(mdev->tconn);
+       /* Update disk stats */
+       _drbd_end_io_acct(mdev, req);
+       /* If READ failed,
+        * have it be pushed back to the retry work queue,
+        * so it will re-enter __drbd_make_request(),
+        * and be re-assigned to a suitable local or remote path,
+        * or failed if we do not have access to good data anymore.
+        *
+        * Unless it was failed early by __drbd_make_request(),
+        * because no path was available, in which case
+        * it was not even added to the transfer_log.
+        *
+        * READA may fail, and will not be retried.
+        *
+        * WRITE should have used all available paths already.
+        */
+       if (!ok && rw == READ && !list_empty(&req->tl_requests))
+               req->rq_state |= RQ_POSTPONED;
  
+       if (!(req->rq_state & RQ_POSTPONED)) {
                m->error = ok ? 0 : (error ?: -EIO);
                m->bio = req->master_bio;
                req->master_bio = NULL;
        }
+ }
  
-       if (s & RQ_LOCAL_PENDING)
-               return;
+ static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put)
+ {
+       struct drbd_conf *mdev = req->w.mdev;
+       D_ASSERT(m || (req->rq_state & RQ_POSTPONED));
+       if (!atomic_sub_and_test(put, &req->completion_ref))
+               return 0;
  
-       if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
-               /* this is disconnected (local only) operation,
-                * or protocol C P_WRITE_ACK,
-                * or protocol A or B P_BARRIER_ACK,
-                * or killed from the transfer log due to connection loss. */
-               _req_is_done(mdev, req, rw);
+       drbd_req_complete(req, m);
+       if (req->rq_state & RQ_POSTPONED) {
+               /* don't destroy the req object just yet,
+                * but queue it for retry */
+               drbd_restart_request(req);
+               return 0;
        }
-       /* else: network part and not DONE yet. that is
-        * protocol A or B, barrier ack still pending... */
+       return 1;
  }
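
Two counters govern a request's lifetime here: completion_ref counts reasons the master bio cannot be completed yet, kref counts reasons the struct must stay allocated, and "completion_ref > 0" itself holds one kref that the completion path surrenders. A compressed userspace model of that hand-off, using C11 atomics in place of atomic_t/kref (the toy_* names are invented):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_req {
        atomic_int completion_ref;      /* reasons not to complete the bio yet */
        atomic_int kref;                /* reasons to keep the struct alive */
};

static struct toy_req *toy_req_new(void)
{
        struct toy_req *req = malloc(sizeof(*req));

        atomic_init(&req->completion_ref, 1);   /* put by the submit path */
        atomic_init(&req->kref, 1);             /* held while completion_ref > 0 */
        return req;
}

static void toy_kref_sub(struct toy_req *req, int n)
{
        if (atomic_fetch_sub(&req->kref, n) == n) {
                printf("destroy request\n");    /* drbd_req_destroy() */
                free(req);
        }
}

/* Mirrors drbd_req_put_completion_ref(): the last put completes the
 * master bio and gives back the kref that completion_ref was holding. */
static void toy_put_completion_ref(struct toy_req *req, int put)
{
        if (atomic_fetch_sub(&req->completion_ref, put) != put)
                return;
        printf("complete master bio\n");        /* drbd_req_complete() */
        toy_kref_sub(req, 1);
}

int main(void)
{
        struct toy_req *req = toy_req_new();

        atomic_fetch_add(&req->completion_ref, 1);      /* e.g. RQ_NET_PENDING */
        toy_put_completion_ref(req, 1);                 /* ack arrived */
        toy_put_completion_ref(req, 1);                 /* submit path's put */
        return 0;
}
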
  
- static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_error *m)
+ /* I'd like this to be the only place that manipulates
+  * req->completion_ref and req->kref. */
+ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
+               int clear, int set)
  {
-       struct drbd_conf *mdev = req->mdev;
+       struct drbd_conf *mdev = req->w.mdev;
+       unsigned s = req->rq_state;
+       int c_put = 0;
+       int k_put = 0;
  
-       if (!is_susp(mdev->state))
-               _req_may_be_done(req, m);
- }
+       if (drbd_suspended(mdev) && !((s | clear) & RQ_COMPLETION_SUSP))
+               set |= RQ_COMPLETION_SUSP;
  
- /*
-  * checks whether there was an overlapping request
-  * or ee already registered.
-  *
-  * if so, return 1, in which case this request is completed on the spot,
-  * without ever being submitted or send.
-  *
-  * return 0 if it is ok to submit this request.
-  *
-  * NOTE:
-  * paranoia: assume something above us is broken, and issues different write
-  * requests for the same block simultaneously...
-  *
-  * To ensure these won't be reordered differently on both nodes, resulting in
-  * diverging data sets, we discard the later one(s). Not that this is supposed
-  * to happen, but this is the rationale why we also have to check for
-  * conflicting requests with local origin, and why we have to do so regardless
-  * of whether we allowed multiple primaries.
-  *
-  * BTW, in case we only have one primary, the ee_hash is empty anyways, and the
-  * second hlist_for_each_entry becomes a noop. This is even simpler than to
-  * grab a reference on the net_conf, and check for the two_primaries flag...
-  */
- static int _req_conflicts(struct drbd_request *req)
- {
-       struct drbd_conf *mdev = req->mdev;
-       const sector_t sector = req->sector;
-       const int size = req->size;
-       struct drbd_request *i;
-       struct drbd_epoch_entry *e;
-       struct hlist_node *n;
-       struct hlist_head *slot;
+       /* apply */
  
-       D_ASSERT(hlist_unhashed(&req->collision));
+       req->rq_state &= ~clear;
+       req->rq_state |= set;
  
-       if (!get_net_conf(mdev))
-               return 0;
+       /* no change? */
+       if (req->rq_state == s)
+               return;
  
-       /* BUG_ON */
-       ERR_IF (mdev->tl_hash_s == 0)
-               goto out_no_conflict;
-       BUG_ON(mdev->tl_hash == NULL);
- #define OVERLAPS overlaps(i->sector, i->size, sector, size)
-       slot = tl_hash_slot(mdev, sector);
-       hlist_for_each_entry(i, n, slot, collision) {
-               if (OVERLAPS) {
-                       dev_alert(DEV, "%s[%u] Concurrent local write detected! "
-                             "[DISCARD L] new: %llus +%u; "
-                             "pending: %llus +%u\n",
-                             current->comm, current->pid,
-                             (unsigned long long)sector, size,
-                             (unsigned long long)i->sector, i->size);
-                       goto out_conflict;
-               }
+       /* intent: get references */
+       if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING))
+               atomic_inc(&req->completion_ref);
+       if (!(s & RQ_NET_PENDING) && (set & RQ_NET_PENDING)) {
+               inc_ap_pending(mdev);
+               atomic_inc(&req->completion_ref);
        }
  
-       if (mdev->ee_hash_s) {
-               /* now, check for overlapping requests with remote origin */
-               BUG_ON(mdev->ee_hash == NULL);
- #undef OVERLAPS
- #define OVERLAPS overlaps(e->sector, e->size, sector, size)
-               slot = ee_hash_slot(mdev, sector);
-               hlist_for_each_entry(e, n, slot, collision) {
-                       if (OVERLAPS) {
-                               dev_alert(DEV, "%s[%u] Concurrent remote write detected!"
-                                     " [DISCARD L] new: %llus +%u; "
-                                     "pending: %llus +%u\n",
-                                     current->comm, current->pid,
-                                     (unsigned long long)sector, size,
-                                     (unsigned long long)e->sector, e->size);
-                               goto out_conflict;
-                       }
-               }
+       if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED))
+               atomic_inc(&req->completion_ref);
+       if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK))
+               kref_get(&req->kref); /* wait for the DONE */
+       if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT))
+               atomic_add(req->i.size >> 9, &mdev->ap_in_flight);
+       if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP))
+               atomic_inc(&req->completion_ref);
+       /* progress: put references */
+       if ((s & RQ_COMPLETION_SUSP) && (clear & RQ_COMPLETION_SUSP))
+               ++c_put;
+       if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) {
+               D_ASSERT(req->rq_state & RQ_LOCAL_PENDING);
+               /* local completion may still come in later,
+                * we need to keep the req object around. */
+               kref_get(&req->kref);
+               ++c_put;
        }
- #undef OVERLAPS
  
- out_no_conflict:
-       /* this is like it should be, and what we expected.
-        * our users do behave after all... */
-       put_net_conf(mdev);
-       return 0;
+       if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) {
+               if (req->rq_state & RQ_LOCAL_ABORTED)
+                       ++k_put;
+               else
+                       ++c_put;
+       }
  
- out_conflict:
-       put_net_conf(mdev);
-       return 1;
+       if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) {
+               dec_ap_pending(mdev);
+               ++c_put;
+       }
+       if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED))
+               ++c_put;
+       if ((s & RQ_EXP_BARR_ACK) && !(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
+               if (req->rq_state & RQ_NET_SENT)
+                       atomic_sub(req->i.size >> 9, &mdev->ap_in_flight);
+               ++k_put;
+       }
+       /* potentially complete and destroy */
+       if (k_put || c_put) {
+               /* Completion does its own kref_put.  If we are going to
+                * kref_sub below, we need req to be still around then. */
+               int at_least = k_put + !!c_put;
+               int refcount = atomic_read(&req->kref.refcount);
+               if (refcount < at_least)
+                       dev_err(DEV,
+                               "mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",
+                               s, req->rq_state, refcount, at_least);
+       }
+       /* If we made progress, retry conflicting peer requests, if any. */
+       if (req->i.waiting)
+               wake_up(&mdev->misc_wait);
+       if (c_put)
+               k_put += drbd_req_put_completion_ref(req, m, c_put);
+       if (k_put)
+               kref_sub(&req->kref, k_put, drbd_req_destroy);
  }
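
To make the reference choreography above easier to follow, here is an illustrative reduction (names are not from the patch) of what mod_rq_state() derives from the old state s and the clear/set masks:

	/* illustrative sketch only -- not part of the patch */
	static void sketch_transition(unsigned s, unsigned clear, unsigned set)
	{
		unsigned n = (s & ~clear) | set;   /* the new rq_state */
		unsigned gained = n & ~s;          /* bits that were just set */
		unsigned lost   = s & ~n;          /* bits that were just cleared */

		/* "gained" bits take their references up front (atomic_inc,
		 * kref_get, inc_ap_pending); "lost" bits only count into
		 * c_put/k_put and are released in one batch at the end, so the
		 * request cannot be destroyed while the transition is still
		 * being applied. */
		(void)gained; (void)lost;
	}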
  
  static void drbd_report_io_error(struct drbd_conf *mdev, struct drbd_request *req)
  {
          char b[BDEVNAME_SIZE];
  
-       if (__ratelimit(&drbd_ratelimit_state))
+       if (!__ratelimit(&drbd_ratelimit_state))
                return;
  
        dev_warn(DEV, "local %s IO error sector %llu+%u on %s\n",
                        (req->rq_state & RQ_WRITE) ? "WRITE" : "READ",
-                       (unsigned long long)req->sector,
-                       req->size >> 9,
+                       (unsigned long long)req->i.sector,
+                       req->i.size >> 9,
                        bdevname(mdev->ldev->backing_bdev, b));
  }
  
  int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                struct bio_and_error *m)
  {
-       struct drbd_conf *mdev = req->mdev;
-       int rv = 0;
-       m->bio = NULL;
+       struct drbd_conf *mdev = req->w.mdev;
+       struct net_conf *nc;
+       int p, rv = 0;
+       if (m)
+               m->bio = NULL;
  
        switch (what) {
        default:
  
        /* does not happen...
         * initialization done in drbd_req_new
-       case created:
+       case CREATED:
                break;
                */
  
-       case to_be_send: /* via network */
-               /* reached via drbd_make_request_common
+       case TO_BE_SENT: /* via network */
+               /* reached via __drbd_make_request
                 * and from w_read_retry_remote */
                D_ASSERT(!(req->rq_state & RQ_NET_MASK));
-               req->rq_state |= RQ_NET_PENDING;
-               inc_ap_pending(mdev);
+               rcu_read_lock();
+               nc = rcu_dereference(mdev->tconn->net_conf);
+               p = nc->wire_protocol;
+               rcu_read_unlock();
+               req->rq_state |=
+                       p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK :
+                       p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0;
+               mod_rq_state(req, m, 0, RQ_NET_PENDING);
                break;
  
-       case to_be_submitted: /* locally */
-               /* reached via drbd_make_request_common */
+       case TO_BE_SUBMITTED: /* locally */
+               /* reached via __drbd_make_request */
                D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK));
-               req->rq_state |= RQ_LOCAL_PENDING;
+               mod_rq_state(req, m, 0, RQ_LOCAL_PENDING);
                break;
  
-       case completed_ok:
+       case COMPLETED_OK:
                if (req->rq_state & RQ_WRITE)
-                       mdev->writ_cnt += req->size>>9;
+                       mdev->writ_cnt += req->i.size >> 9;
                else
-                       mdev->read_cnt += req->size>>9;
+                       mdev->read_cnt += req->i.size >> 9;
  
-               req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
-               req->rq_state &= ~RQ_LOCAL_PENDING;
-               _req_may_be_done_not_susp(req, m);
+               mod_rq_state(req, m, RQ_LOCAL_PENDING,
+                               RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
                break;
  
-       case abort_disk_io:
-               req->rq_state |= RQ_LOCAL_ABORTED;
-               if (req->rq_state & RQ_WRITE)
-                       _req_may_be_done_not_susp(req, m);
-               else
-                       goto goto_queue_for_net_read;
+       case ABORT_DISK_IO:
+               mod_rq_state(req, m, 0, RQ_LOCAL_ABORTED);
                break;
  
-       case write_completed_with_error:
-               req->rq_state |= RQ_LOCAL_COMPLETED;
-               req->rq_state &= ~RQ_LOCAL_PENDING;
+       case WRITE_COMPLETED_WITH_ERROR:
                drbd_report_io_error(mdev, req);
                __drbd_chk_io_error(mdev, DRBD_WRITE_ERROR);
-               _req_may_be_done_not_susp(req, m);
+               mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
                break;
  
-       case read_ahead_completed_with_error:
-               /* it is legal to fail READA */
-               req->rq_state |= RQ_LOCAL_COMPLETED;
-               req->rq_state &= ~RQ_LOCAL_PENDING;
-               _req_may_be_done_not_susp(req, m);
-               break;
-       case read_completed_with_error:
-               drbd_set_out_of_sync(mdev, req->sector, req->size);
-               req->rq_state |= RQ_LOCAL_COMPLETED;
-               req->rq_state &= ~RQ_LOCAL_PENDING;
-               if (req->rq_state & RQ_LOCAL_ABORTED) {
-                       _req_may_be_done(req, m);
-                       break;
-               }
+       case READ_COMPLETED_WITH_ERROR:
+               drbd_set_out_of_sync(mdev, req->i.sector, req->i.size);
                drbd_report_io_error(mdev, req);
                __drbd_chk_io_error(mdev, DRBD_READ_ERROR);
+               /* fall through. */
+       case READ_AHEAD_COMPLETED_WITH_ERROR:
+               /* it is legal to fail READA, no __drbd_chk_io_error in that case. */
+               mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
+               break;
  
-       goto_queue_for_net_read:
-               D_ASSERT(!(req->rq_state & RQ_NET_MASK));
-               /* no point in retrying if there is no good remote data,
-                * or we have no connection. */
-               if (mdev->state.pdsk != D_UP_TO_DATE) {
-                       _req_may_be_done_not_susp(req, m);
-                       break;
-               }
-               /* _req_mod(req,to_be_send); oops, recursion... */
-               req->rq_state |= RQ_NET_PENDING;
-               inc_ap_pending(mdev);
-               /* fall through: _req_mod(req,queue_for_net_read); */
-       case queue_for_net_read:
+       case QUEUE_FOR_NET_READ:
                /* READ or READA, and
                 * no local disk,
                 * or target area marked as invalid,
                 * or just got an io-error. */
-               /* from drbd_make_request_common
+               /* from __drbd_make_request
                 * or from bio_endio during read io-error recovery */
  
-               /* so we can verify the handle in the answer packet
-                * corresponding hlist_del is in _req_may_be_done() */
-               hlist_add_head(&req->collision, ar_hash_slot(mdev, req->sector));
+               /* So we can verify the handle in the answer packet.
+                * Corresponding drbd_remove_request_interval is in
+                * drbd_req_complete() */
+               D_ASSERT(drbd_interval_empty(&req->i));
+               drbd_insert_interval(&mdev->read_requests, &req->i);
  
-               drbd_set_flag(mdev, UNPLUG_REMOTE);
+               set_bit(UNPLUG_REMOTE, &mdev->flags);
  
                D_ASSERT(req->rq_state & RQ_NET_PENDING);
-               req->rq_state |= RQ_NET_QUEUED;
-               req->w.cb = (req->rq_state & RQ_LOCAL_MASK)
-                       ? w_read_retry_remote
-                       : w_send_read_req;
-               drbd_queue_work(&mdev->data.work, &req->w);
+               D_ASSERT((req->rq_state & RQ_LOCAL_MASK) == 0);
+               mod_rq_state(req, m, 0, RQ_NET_QUEUED);
+               req->w.cb = w_send_read_req;
+               drbd_queue_work(&mdev->tconn->sender_work, &req->w);
                break;
  
-       case queue_for_net_write:
+       case QUEUE_FOR_NET_WRITE:
                /* assert something? */
-               /* from drbd_make_request_common only */
+               /* from __drbd_make_request only */
  
-               hlist_add_head(&req->collision, tl_hash_slot(mdev, req->sector));
-               /* corresponding hlist_del is in _req_may_be_done() */
+               /* Corresponding drbd_remove_request_interval is in
+                * drbd_req_complete() */
+               D_ASSERT(drbd_interval_empty(&req->i));
+               drbd_insert_interval(&mdev->write_requests, &req->i);
  
                /* NOTE
                 * In case the req ended up on the transfer log before being
                 *
                 * _req_add_to_epoch(req); this has to be after the
                 * _maybe_start_new_epoch(req); which happened in
-                * drbd_make_request_common, because we now may set the bit
+                * __drbd_make_request, because we now may set the bit
                 * again ourselves to close the current epoch.
                 *
                 * Add req to the (now) current epoch (barrier). */
                /* otherwise we may lose an unplug, which may cause some remote
                 * io-scheduler timeout to expire, increasing maximum latency,
                 * hurting performance. */
-               drbd_set_flag(mdev, UNPLUG_REMOTE);
-               /* see drbd_make_request_common,
-                * just after it grabs the req_lock */
-               D_ASSERT(drbd_test_flag(mdev, CREATE_BARRIER) == 0);
-               req->epoch = mdev->newest_tle->br_number;
-               /* increment size of current epoch */
-               mdev->newest_tle->n_writes++;
+               set_bit(UNPLUG_REMOTE, &mdev->flags);
  
                /* queue work item to send data */
                D_ASSERT(req->rq_state & RQ_NET_PENDING);
-               req->rq_state |= RQ_NET_QUEUED;
+               mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK);
                req->w.cb =  w_send_dblock;
-               drbd_queue_work(&mdev->data.work, &req->w);
+               drbd_queue_work(&mdev->tconn->sender_work, &req->w);
  
                /* close the epoch, in case it outgrew the limit */
-               if (mdev->newest_tle->n_writes >= mdev->net_conf->max_epoch_size)
-                       queue_barrier(mdev);
+               rcu_read_lock();
+               nc = rcu_dereference(mdev->tconn->net_conf);
+               p = nc->max_epoch_size;
+               rcu_read_unlock();
+               if (mdev->tconn->current_tle_writes >= p)
+                       start_new_tl_epoch(mdev->tconn);
  
                break;
  
-       case queue_for_send_oos:
-               req->rq_state |= RQ_NET_QUEUED;
-               req->w.cb =  w_send_oos;
-               drbd_queue_work(&mdev->data.work, &req->w);
+       case QUEUE_FOR_SEND_OOS:
+               mod_rq_state(req, m, 0, RQ_NET_QUEUED);
+               req->w.cb =  w_send_out_of_sync;
+               drbd_queue_work(&mdev->tconn->sender_work, &req->w);
                break;
  
-       case read_retry_remote_canceled:
-       case send_canceled:
-       case send_failed:
+       case READ_RETRY_REMOTE_CANCELED:
+       case SEND_CANCELED:
+       case SEND_FAILED:
                /* real cleanup will be done from tl_clear.  just update flags
                 * so it is no longer marked as on the worker queue */
-               req->rq_state &= ~RQ_NET_QUEUED;
-               /* if we did it right, tl_clear should be scheduled only after
-                * this, so this should not be necessary! */
-               _req_may_be_done_not_susp(req, m);
+               mod_rq_state(req, m, RQ_NET_QUEUED, 0);
                break;
  
-       case handed_over_to_network:
+       case HANDED_OVER_TO_NETWORK:
                /* assert something? */
-               if (bio_data_dir(req->master_bio) == WRITE)
-                       atomic_add(req->size>>9, &mdev->ap_in_flight);
                if (bio_data_dir(req->master_bio) == WRITE &&
-                   mdev->net_conf->wire_protocol == DRBD_PROT_A) {
+                   !(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) {
                        /* this is what is dangerous about protocol A:
                         * pretend it was successfully written on the peer. */
-                       if (req->rq_state & RQ_NET_PENDING) {
-                               dec_ap_pending(mdev);
-                               req->rq_state &= ~RQ_NET_PENDING;
-                               req->rq_state |= RQ_NET_OK;
-                       } /* else: neg-ack was faster... */
+                       if (req->rq_state & RQ_NET_PENDING)
+                               mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
+                       /* else: neg-ack was faster... */
                        /* it is still not yet RQ_NET_DONE until the
                         * corresponding epoch barrier got acked as well,
                         * so we know what to dirty on connection loss */
                }
-               req->rq_state &= ~RQ_NET_QUEUED;
-               req->rq_state |= RQ_NET_SENT;
-               _req_may_be_done_not_susp(req, m);
+               mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
                break;
  
-       case oos_handed_to_network:
+       case OOS_HANDED_TO_NETWORK:
                /* Was not set PENDING, no longer QUEUED, so is now DONE
                 * as far as this connection is concerned. */
-               req->rq_state &= ~RQ_NET_QUEUED;
-               req->rq_state |= RQ_NET_DONE;
-               _req_may_be_done_not_susp(req, m);
+               mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_DONE);
                break;
  
-       case connection_lost_while_pending:
+       case CONNECTION_LOST_WHILE_PENDING:
                /* transfer log cleanup after connection loss */
-               /* assert something? */
-               if (req->rq_state & RQ_NET_PENDING)
-                       dec_ap_pending(mdev);
-               req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
-               req->rq_state |= RQ_NET_DONE;
-               if (req->rq_state & RQ_NET_SENT && req->rq_state & RQ_WRITE)
-                       atomic_sub(req->size>>9, &mdev->ap_in_flight);
-               /* if it is still queued, we may not complete it here.
-                * it will be canceled soon. */
-               if (!(req->rq_state & RQ_NET_QUEUED))
-                       _req_may_be_done(req, m); /* Allowed while state.susp */
+               mod_rq_state(req, m,
+                               RQ_NET_OK|RQ_NET_PENDING|RQ_COMPLETION_SUSP,
+                               RQ_NET_DONE);
                break;
  
-       case conflict_discarded_by_peer:
-               /* for discarded conflicting writes of multiple primaries,
+       case CONFLICT_RESOLVED:
+               /* for superseded conflicting writes of multiple primaries,
                 * there is no need to keep anything in the tl, potential
-                * node crashes are covered by the activity log. */
-               if (what == conflict_discarded_by_peer)
-                       dev_alert(DEV, "Got DiscardAck packet %llus +%u!"
-                             " DRBD is not a random data generator!\n",
-                             (unsigned long long)req->sector, req->size);
-               req->rq_state |= RQ_NET_DONE;
-               /* fall through */
-       case write_acked_by_peer_and_sis:
-       case write_acked_by_peer:
-               if (what == write_acked_by_peer_and_sis)
-                       req->rq_state |= RQ_NET_SIS;
+                * node crashes are covered by the activity log.
+                *
+                * If this request had been marked as RQ_POSTPONED before,
+                * it will actually not be completed, but "restarted",
+                * resubmitted from the retry worker context. */
+               D_ASSERT(req->rq_state & RQ_NET_PENDING);
+               D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK);
+               mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_DONE|RQ_NET_OK);
+               break;
+       case WRITE_ACKED_BY_PEER_AND_SIS:
+               req->rq_state |= RQ_NET_SIS;
+       case WRITE_ACKED_BY_PEER:
+               D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK);
                /* protocol C; successfully written on peer.
                 * Nothing more to do here.
                 * We want to keep the tl in place for all protocols, to cater
                 * for volatile write-back caches on lower level devices. */
  
-       case recv_acked_by_peer:
+               goto ack_common;
+       case RECV_ACKED_BY_PEER:
+               D_ASSERT(req->rq_state & RQ_EXP_RECEIVE_ACK);
                /* protocol B; pretends to be successfully written on peer.
-                * see also notes above in handed_over_to_network about
+                * see also notes above in HANDED_OVER_TO_NETWORK about
                 * protocol != C */
-               req->rq_state |= RQ_NET_OK;
+       ack_common:
                D_ASSERT(req->rq_state & RQ_NET_PENDING);
-               dec_ap_pending(mdev);
-               atomic_sub(req->size>>9, &mdev->ap_in_flight);
-               req->rq_state &= ~RQ_NET_PENDING;
-               _req_may_be_done_not_susp(req, m);
+               mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
                break;
  
-       case neg_acked:
-               /* assert something? */
-               if (req->rq_state & RQ_NET_PENDING) {
-                       dec_ap_pending(mdev);
-                       atomic_sub(req->size>>9, &mdev->ap_in_flight);
-               }
-               req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
+       case POSTPONE_WRITE:
+               D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK);
+               /* If this node has already detected the write conflict, the
+                * worker will be waiting on misc_wait.  Wake it up once this
+                * request has completed locally.
+                */
+               D_ASSERT(req->rq_state & RQ_NET_PENDING);
+               req->rq_state |= RQ_POSTPONED;
+               if (req->i.waiting)
+                       wake_up(&mdev->misc_wait);
+               /* Do not clear RQ_NET_PENDING. This request will make further
+                * progress via restart_conflicting_writes() or
+                * fail_postponed_requests(). Hopefully. */
+               break;
  
-               req->rq_state |= RQ_NET_DONE;
-               _req_may_be_done_not_susp(req, m);
-               /* else: done by handed_over_to_network */
+       case NEG_ACKED:
+               mod_rq_state(req, m, RQ_NET_OK|RQ_NET_PENDING, 0);
                break;
  
-       case fail_frozen_disk_io:
+       case FAIL_FROZEN_DISK_IO:
                if (!(req->rq_state & RQ_LOCAL_COMPLETED))
                        break;
-               _req_may_be_done(req, m); /* Allowed while state.susp */
+               mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0);
                break;
  
-       case restart_frozen_disk_io:
+       case RESTART_FROZEN_DISK_IO:
                if (!(req->rq_state & RQ_LOCAL_COMPLETED))
                        break;
  
-               req->rq_state &= ~RQ_LOCAL_COMPLETED;
+               mod_rq_state(req, m,
+                               RQ_COMPLETION_SUSP|RQ_LOCAL_COMPLETED,
+                               RQ_LOCAL_PENDING);
  
                rv = MR_READ;
                if (bio_data_dir(req->master_bio) == WRITE)
                        rv = MR_WRITE;
  
-               get_ldev(mdev);
+               get_ldev(mdev); /* always succeeds in this call path */
                req->w.cb = w_restart_disk_io;
-               drbd_queue_work(&mdev->data.work, &req->w);
+               drbd_queue_work(&mdev->tconn->sender_work, &req->w);
                break;
  
-       case resend:
+       case RESEND:
                /* Simply complete (local only) READs. */
                if (!(req->rq_state & RQ_WRITE) && !req->w.cb) {
-                       _req_may_be_done(req, m);
+                       mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0);
                        break;
                }
  
                /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
-                  before the connection loss (B&C only); only P_BARRIER_ACK was missing.
-                  Trowing them out of the TL here by pretending we got a BARRIER_ACK
-                  We ensure that the peer was not rebooted */
+                  before the connection loss (B&C only); only P_BARRIER_ACK
+                  (or the local completion?) was missing when we suspended.
+                  Throwing them out of the TL here by pretending we got a BARRIER_ACK.
+                  During connection handshake, we ensure that the peer was not rebooted. */
                if (!(req->rq_state & RQ_NET_OK)) {
+                       /* FIXME could this possibly be a req->w.cb == w_send_out_of_sync?
+                        * in that case we must not set RQ_NET_PENDING. */
+                       mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING);
                        if (req->w.cb) {
-                               drbd_queue_work(&mdev->data.work, &req->w);
+                               drbd_queue_work(&mdev->tconn->sender_work, &req->w);
                                rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
-                       }
+                       } /* else: FIXME can this happen? */
                        break;
                }
-               /* else, fall through to barrier_acked */
+               /* else, fall through to BARRIER_ACKED */
  
-       case barrier_acked:
+       case BARRIER_ACKED:
+               /* barrier ack for READ requests does not make sense */
                if (!(req->rq_state & RQ_WRITE))
                        break;
  
                if (req->rq_state & RQ_NET_PENDING) {
-                       /* barrier came in before all requests have been acked.
+                       /* barrier came in before all requests were acked.
                         * this is bad, because if the connection is lost now,
                         * we won't be able to clean them up... */
-                       dev_err(DEV, "FIXME (barrier_acked but pending)\n");
-                       list_move(&req->tl_requests, &mdev->out_of_sequence_requests);
+                       dev_err(DEV, "FIXME (BARRIER_ACKED but pending)\n");
                }
-               if ((req->rq_state & RQ_NET_MASK) != 0) {
-                       req->rq_state |= RQ_NET_DONE;
-                       if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
-                               atomic_sub(req->size>>9, &mdev->ap_in_flight);
-               }
-               _req_may_be_done(req, m); /* Allowed while state.susp */
+               /* Allowed to complete requests, even while suspended.
+                * As this is called for all requests within a matching epoch,
+                * we need to filter, and only set RQ_NET_DONE for those that
+                * have actually been on the wire. */
+               mod_rq_state(req, m, RQ_COMPLETION_SUSP,
+                               (req->rq_state & RQ_NET_MASK) ? RQ_NET_DONE : 0);
                break;
  
-       case data_received:
+       case DATA_RECEIVED:
                D_ASSERT(req->rq_state & RQ_NET_PENDING);
-               dec_ap_pending(mdev);
-               req->rq_state &= ~RQ_NET_PENDING;
-               req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
-               _req_may_be_done_not_susp(req, m);
+               mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE);
                break;
        };
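
For reference, the locking contract of __req_mod() is unchanged: callers hold tconn->req_lock, collect a possible master bio in a struct bio_and_error, and complete it only after unlocking. A typical call site, mirroring the tail of __drbd_make_request() further down:

	struct bio_and_error m;

	spin_lock_irq(&mdev->tconn->req_lock);
	__req_mod(req, COMPLETED_OK, &m);
	spin_unlock_irq(&mdev->tconn->req_lock);

	if (m.bio)
		complete_master_bio(mdev, &m);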
  
   *   since size may be bigger than BM_BLOCK_SIZE,
   *   we may need to check several bits.
   */
- static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size)
+ static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size)
  {
        unsigned long sbnr, ebnr;
        sector_t esector, nr_sectors;
  
        if (mdev->state.disk == D_UP_TO_DATE)
-               return 1;
-       if (mdev->state.disk >= D_OUTDATED)
-               return 0;
-       if (mdev->state.disk <  D_INCONSISTENT)
-               return 0;
-       /* state.disk == D_INCONSISTENT   We will have a look at the BitMap */
-       nr_sectors = drbd_get_capacity(mdev->this_bdev);
+               return true;
+       if (mdev->state.disk != D_INCONSISTENT)
+               return false;
        esector = sector + (size >> 9) - 1;
+       nr_sectors = drbd_get_capacity(mdev->this_bdev);
        D_ASSERT(sector  < nr_sectors);
        D_ASSERT(esector < nr_sectors);
  
        sbnr = BM_SECT_TO_BIT(sector);
        ebnr = BM_SECT_TO_BIT(esector);
  
-       return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
+       return drbd_bm_count_bits(mdev, sbnr, ebnr) == 0;
+ }
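
Assuming the stock 4 KiB bitmap granularity (BM_BLOCK_SHIFT == 12), BM_SECT_TO_BIT() is a shift by three, so the range check above boils down to a small worked example:

	/* a 4 KiB read at sector 262144; 512-byte sectors, 8 sectors per bit */
	sector_t sector = 262144;
	sector_t esector = sector + (4096 >> 9) - 1;	/* 262151 */
	unsigned long sbnr = sector >> 3;		/* 32768 */
	unsigned long ebnr = esector >> 3;		/* 32768: same bit, one lookup */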
+ 
+ static bool remote_due_to_read_balancing(struct drbd_conf *mdev, sector_t sector,
+               enum drbd_read_balancing rbm)
+ {
+       struct backing_dev_info *bdi;
+       int stripe_shift;
+       switch (rbm) {
+       case RB_CONGESTED_REMOTE:
+               bdi = &mdev->ldev->backing_bdev->bd_disk->queue->backing_dev_info;
+               return bdi_read_congested(bdi);
+       case RB_LEAST_PENDING:
+               return atomic_read(&mdev->local_cnt) >
+                       atomic_read(&mdev->ap_pending_cnt) + atomic_read(&mdev->rs_pending_cnt);
+       case RB_32K_STRIPING:  /* stripe_shift = 15 */
+       case RB_64K_STRIPING:
+       case RB_128K_STRIPING:
+       case RB_256K_STRIPING:
+       case RB_512K_STRIPING:
+       case RB_1M_STRIPING:   /* stripe_shift = 20 */
+               stripe_shift = (rbm - RB_32K_STRIPING + 15);
+               return (sector >> (stripe_shift - 9)) & 1;
+       case RB_ROUND_ROBIN:
+               return test_and_change_bit(READ_BALANCE_RR, &mdev->flags);
+       case RB_PREFER_REMOTE:
+               return true;
+       case RB_PREFER_LOCAL:
+       default:
+               return false;
+       }
+ }
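
The striping cases map onto shifts of 15 (32 KiB) through 20 (1 MiB). For RB_64K_STRIPING, stripe_shift is 16, so the decision reduces to:

	/* illustrative: which side serves a read under 64 KiB striping */
	static bool sketch_remote_64k(sector_t sector)
	{
		/* flips every 128 sectors == 64 KiB of device offset:
		 * sectors 0..127 local, 128..255 remote, 256..383 local, ... */
		return (sector >> (16 - 9)) & 1;
	}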
+ 
+ /*
+  * complete_conflicting_writes  -  wait for any conflicting write requests
+  *
+  * The write_requests tree contains all active write requests which we
+  * currently know about.  Wait for any requests to complete which conflict with
+  * the new one.
+  *
+  * Only way out: remove the conflicting intervals from the tree.
+  */
+ static void complete_conflicting_writes(struct drbd_request *req)
+ {
+       DEFINE_WAIT(wait);
+       struct drbd_conf *mdev = req->w.mdev;
+       struct drbd_interval *i;
+       sector_t sector = req->i.sector;
+       int size = req->i.size;
+       i = drbd_find_overlap(&mdev->write_requests, sector, size);
+       if (!i)
+               return;
+       for (;;) {
+               prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
+               i = drbd_find_overlap(&mdev->write_requests, sector, size);
+               if (!i)
+                       break;
+               /* Indicate to wake up mdev->misc_wait on progress.  */
+               i->waiting = true;
+               spin_unlock_irq(&mdev->tconn->req_lock);
+               schedule();
+               spin_lock_irq(&mdev->tconn->req_lock);
+       }
+       finish_wait(&mdev->misc_wait, &wait);
  }
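
The matching wake-up side lives in mod_rq_state() above: whenever a request makes progress, any waiter that flagged itself via i->waiting is kicked. Schematically:

	/*
	 * waiter (complete_conflicting_writes)    waker (mod_rq_state)
	 * ------------------------------------    --------------------
	 * i->waiting = true;
	 * spin_unlock_irq(&req_lock);
	 * schedule();                          <-- if (req->i.waiting)
	 *                                              wake_up(&mdev->misc_wait);
	 * spin_lock_irq(&req_lock);
	 */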
  
+ /* called within req_lock and rcu_read_lock() */
  static void maybe_pull_ahead(struct drbd_conf *mdev)
  {
-       int congested = 0;
+       struct drbd_tconn *tconn = mdev->tconn;
+       struct net_conf *nc;
+       bool congested = false;
+       enum drbd_on_congestion on_congestion;
+       nc = rcu_dereference(tconn->net_conf);
+       on_congestion = nc ? nc->on_congestion : OC_BLOCK;
+       if (on_congestion == OC_BLOCK ||
+           tconn->agreed_pro_version < 96)
+               return;
  
        /* If I don't even have good local storage, we can not reasonably try
         * to pull ahead of the peer. We also need the local reference to make
         * sure mdev->act_log is there.
-        * Note: caller has to make sure that net_conf is there.
         */
        if (!get_ldev_if_state(mdev, D_UP_TO_DATE))
                return;
  
-       if (mdev->net_conf->cong_fill &&
-           atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) {
+       if (nc->cong_fill &&
+           atomic_read(&mdev->ap_in_flight) >= nc->cong_fill) {
                dev_info(DEV, "Congestion-fill threshold reached\n");
-               congested = 1;
+               congested = true;
        }
  
-       if (mdev->act_log->used >= mdev->net_conf->cong_extents) {
+       if (mdev->act_log->used >= nc->cong_extents) {
                dev_info(DEV, "Congestion-extents threshold reached\n");
-               congested = 1;
+               congested = true;
        }
  
        if (congested) {
-               queue_barrier(mdev); /* last barrier, after mirrored writes */
+               /* start a new epoch for non-mirrored writes */
+               start_new_tl_epoch(mdev->tconn);
  
-               if (mdev->net_conf->on_congestion == OC_PULL_AHEAD)
+               if (on_congestion == OC_PULL_AHEAD)
                        _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
-               else  /*mdev->net_conf->on_congestion == OC_DISCONNECT */
+               else  /*nc->on_congestion == OC_DISCONNECT */
                        _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
        }
        put_ldev(mdev);
  }
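
Note the units: ap_in_flight is maintained in 512-byte sectors (mod_rq_state() adds req->i.size >> 9 when RQ_NET_SENT is set), so nc->cong_fill is compared as a sector count. A quick sanity check:

	/* 1 MiB of not-yet-acked write data in flight amounts to */
	unsigned int sectors = (1024 * 1024) >> 9;	/* == 2048 sectors */
	/* congestion triggers once atomic_read(&mdev->ap_in_flight) >= nc->cong_fill */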
  
- static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
+ /* If this returns false, and req->private_bio is still set,
+  * this should be submitted locally.
+  *
+  * If it returns false, but req->private_bio is not set,
+  * we do not have access to good data :(
+  *
+  * Otherwise, this destroys req->private_bio, if any,
+  * and returns true.
+  */
+ static bool do_remote_read(struct drbd_request *req)
+ {
+       struct drbd_conf *mdev = req->w.mdev;
+       enum drbd_read_balancing rbm;
+       if (req->private_bio) {
+               if (!drbd_may_do_local_read(mdev,
+                                       req->i.sector, req->i.size)) {
+                       bio_put(req->private_bio);
+                       req->private_bio = NULL;
+                       put_ldev(mdev);
+               }
+       }
+       if (mdev->state.pdsk != D_UP_TO_DATE)
+               return false;
+       if (req->private_bio == NULL)
+               return true;
+       /* TODO: improve read balancing decisions, take into account drbd
+        * protocol, pending requests etc. */
+       rcu_read_lock();
+       rbm = rcu_dereference(mdev->ldev->disk_conf)->read_balancing;
+       rcu_read_unlock();
+       if (rbm == RB_PREFER_LOCAL && req->private_bio)
+               return false; /* submit locally */
+       if (remote_due_to_read_balancing(mdev, req->i.sector, rbm)) {
+               if (req->private_bio) {
+                       bio_put(req->private_bio);
+                       req->private_bio = NULL;
+                       put_ldev(mdev);
+               }
+               return true;
+       }
+       return false;
+ }
+ 
+ /* returns number of connections (== 1, for drbd 8.4)
+  * expected to actually write this data,
+  * which does NOT include those that we are L_AHEAD for. */
+ static int drbd_process_write_request(struct drbd_request *req)
+ {
+       struct drbd_conf *mdev = req->w.mdev;
+       int remote, send_oos;
+       rcu_read_lock();
+       remote = drbd_should_do_remote(mdev->state);
+       if (remote) {
+               maybe_pull_ahead(mdev);
+               remote = drbd_should_do_remote(mdev->state);
+       }
+       send_oos = drbd_should_send_out_of_sync(mdev->state);
+       rcu_read_unlock();
+       /* Need to replicate writes.  Unless it is an empty flush,
+        * which is better mapped to a DRBD P_BARRIER packet,
+        * also for drbd wire protocol compatibility reasons.
+        * If this was a flush, just start a new epoch.
+        * Unless the current epoch was empty anyways, or we are not currently
+        * replicating, in which case there is no point. */
+       if (unlikely(req->i.size == 0)) {
+               /* The only size==0 bios we expect are empty flushes. */
+               D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH);
+               if (remote)
+                       start_new_tl_epoch(mdev->tconn);
+               return 0;
+       }
+       if (!remote && !send_oos)
+               return 0;
+       D_ASSERT(!(remote && send_oos));
+       if (remote) {
+               _req_mod(req, TO_BE_SENT);
+               _req_mod(req, QUEUE_FOR_NET_WRITE);
+       } else if (drbd_set_out_of_sync(mdev, req->i.sector, req->i.size))
+               _req_mod(req, QUEUE_FOR_SEND_OOS);
+       return remote;
+ }
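
Taken together with __req_mod() above, a replicated protocol C write now runs through a fixed event sequence; condensed:

	_req_mod(req, TO_BE_SENT);		/* RQ_NET_PENDING | RQ_EXP_WRITE_ACK */
	_req_mod(req, QUEUE_FOR_NET_WRITE);	/* RQ_NET_QUEUED | RQ_EXP_BARR_ACK */
	/* sender thread:   HANDED_OVER_TO_NETWORK -> RQ_NET_SENT              */
	/* peer write ack:  WRITE_ACKED_BY_PEER    -> RQ_NET_OK, PENDING clear */
	/* barrier ack:     BARRIER_ACKED          -> RQ_NET_DONE, refs drop   */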
+ 
+ static void
+ drbd_submit_req_private_bio(struct drbd_request *req)
+ {
+       struct drbd_conf *mdev = req->w.mdev;
+       struct bio *bio = req->private_bio;
+       const int rw = bio_rw(bio);
+       bio->bi_bdev = mdev->ldev->backing_bdev;
+       /* State may have changed since we grabbed our reference on the
+        * ->ldev member. Double check, and short-circuit to endio.
+        * In case the last activity log transaction failed to get on
+        * stable storage, and this is a WRITE, we may not even submit
+        * this bio. */
+       if (get_ldev(mdev)) {
+               if (drbd_insert_fault(mdev,
+                                     rw == WRITE ? DRBD_FAULT_DT_WR
+                                   : rw == READ  ? DRBD_FAULT_DT_RD
+                                   :               DRBD_FAULT_DT_RA))
+                       bio_endio(bio, -EIO);
+               else
+                       generic_make_request(bio);
+               put_ldev(mdev);
+       } else
+               bio_endio(bio, -EIO);
+ }
+ 
+ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
  {
        const int rw = bio_rw(bio);
-       const int size = bio->bi_size;
-       const sector_t sector = bio->bi_sector;
-       struct drbd_tl_epoch *b = NULL;
+       struct bio_and_error m = { NULL, };
        struct drbd_request *req;
-       int local, remote, send_oos = 0;
-       int err = -EIO;
-       int ret = 0;
-       union drbd_state s;
+       bool no_remote = false;
  
        /* allocate outside of all locks; */
        req = drbd_req_new(mdev, bio);
                 * if user cannot handle io errors, that's not our business. */
                dev_err(DEV, "could not kmalloc() req\n");
                bio_endio(bio, -ENOMEM);
-               return 0;
+               return;
        }
        req->start_time = start_time;
  
-       local = get_ldev(mdev);
-       if (!local) {
-               bio_put(req->private_bio); /* or we get a bio leak */
+       if (!get_ldev(mdev)) {
+               bio_put(req->private_bio);
                req->private_bio = NULL;
        }
-       if (rw == WRITE) {
-               /* Need to replicate writes.  Unless it is an empty flush,
-                * which is better mapped to a DRBD P_BARRIER packet,
-                * also for drbd wire protocol compatibility reasons. */
-               if (unlikely(size == 0)) {
-                       /* The only size==0 bios we expect are empty flushes. */
-                       D_ASSERT(bio->bi_rw & REQ_FLUSH);
-                       remote = 0;
-               } else
-                       remote = 1;
-       } else {
-               /* READ || READA */
-               if (local) {
-                       if (!drbd_may_do_local_read(mdev, sector, size)) {
-                               /* we could kick the syncer to
-                                * sync this extent asap, wait for
-                                * it, then continue locally.
-                                * Or just issue the request remotely.
-                                */
-                               local = 0;
-                               bio_put(req->private_bio);
-                               req->private_bio = NULL;
-                               put_ldev(mdev);
-                       }
-               }
-               remote = !local && mdev->state.pdsk >= D_UP_TO_DATE;
-       }
-       /* If we have a disk, but a READA request is mapped to remote,
-        * we are R_PRIMARY, D_INCONSISTENT, SyncTarget.
-        * Just fail that READA request right here.
-        *
-        * THINK: maybe fail all READA when not local?
-        *        or make this configurable...
-        *        if network is slow, READA won't do any good.
-        */
-       if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) {
-               err = -EWOULDBLOCK;
-               goto fail_and_free_req;
-       }
  
        /* For WRITES going to the local disk, grab a reference on the target
         * extent.  This waits for any resync activity in the corresponding
         * of transactional on-disk meta data updates.
         * Empty flushes don't need to go into the activity log, they can only
         * flush data for pending writes which are already in there. */
-       if (rw == WRITE && local && size
-       && !drbd_test_flag(mdev, AL_SUSPENDED)) {
+       if (rw == WRITE && req->private_bio && req->i.size
+       && !test_bit(AL_SUSPENDED, &mdev->flags)) {
                req->rq_state |= RQ_IN_ACT_LOG;
-               drbd_al_begin_io(mdev, sector);
+               drbd_al_begin_io(mdev, &req->i);
        }
  
-       s = mdev->state;
-       remote = remote && drbd_should_do_remote(s);
-       send_oos = rw == WRITE && drbd_should_send_oos(s);
-       D_ASSERT(!(remote && send_oos));
-       if (!(local || remote) && !is_susp(mdev->state)) {
-               if (__ratelimit(&drbd_ratelimit_state))
-                       dev_err(DEV, "IO ERROR: neither local nor remote data, sector %llu+%u\n",
-                                       (unsigned long long)req->sector, req->size >> 9);
-               goto fail_free_complete;
-       }
-       /* For WRITE request, we have to make sure that we have an
-        * unused_spare_tle, in case we need to start a new epoch.
-        * I try to be smart and avoid to pre-allocate always "just in case",
-        * but there is a race between testing the bit and pointer outside the
-        * spinlock, and grabbing the spinlock.
-        * if we lost that race, we retry.  */
-       if (rw == WRITE && (remote || send_oos) &&
-           mdev->unused_spare_tle == NULL &&
-           drbd_test_flag(mdev, CREATE_BARRIER)) {
- allocate_barrier:
-               b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO);
-               if (!b) {
-                       dev_err(DEV, "Failed to alloc barrier.\n");
-                       err = -ENOMEM;
-                       goto fail_free_complete;
-               }
+       spin_lock_irq(&mdev->tconn->req_lock);
+       if (rw == WRITE) {
+               /* This may temporarily give up the req_lock,
+                * but will re-acquire it before it returns here.
+                * Needs to be before the check on drbd_suspended() */
+               complete_conflicting_writes(req);
        }
  
-       /* GOOD, everything prepared, grab the spin_lock */
-       spin_lock_irq(&mdev->req_lock);
-       if (is_susp(mdev->state)) {
-               /* If we got suspended, use the retry mechanism of
-                  drbd_make_request() to restart processing of this
-                  bio. In the next call to drbd_make_request
-                  we sleep in inc_ap_bio() */
-               ret = 1;
-               spin_unlock_irq(&mdev->req_lock);
-               goto fail_free_complete;
-       }
+       /* no more giving up req_lock from now on! */
  
-       if (remote || send_oos) {
-               remote = drbd_should_do_remote(mdev->state);
-               send_oos = rw == WRITE && drbd_should_send_oos(mdev->state);
-               D_ASSERT(!(remote && send_oos));
-               if (!(remote || send_oos))
-                       dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
-               if (!(local || remote)) {
-                       dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
-                       spin_unlock_irq(&mdev->req_lock);
-                       goto fail_free_complete;
+       if (drbd_suspended(mdev)) {
+               /* push back and retry: */
+               req->rq_state |= RQ_POSTPONED;
+               if (req->private_bio) {
+                       bio_put(req->private_bio);
+                       req->private_bio = NULL;
+                       put_ldev(mdev);
                }
+               goto out;
        }
  
-       if (b && mdev->unused_spare_tle == NULL) {
-               mdev->unused_spare_tle = b;
-               b = NULL;
-       }
-       if (rw == WRITE && (remote || send_oos) &&
-           mdev->unused_spare_tle == NULL &&
-           drbd_test_flag(mdev, CREATE_BARRIER)) {
-               /* someone closed the current epoch
-                * while we were grabbing the spinlock */
-               spin_unlock_irq(&mdev->req_lock);
-               goto allocate_barrier;
-       }
        /* Update disk stats */
        _drbd_start_io_acct(mdev, req, bio);
  
-       /* _maybe_start_new_epoch(mdev);
-        * If we need to generate a write barrier packet, we have to add the
-        * new epoch (barrier) object, and queue the barrier packet for sending,
-        * and queue the req's data after it _within the same lock_, otherwise
-        * we have race conditions were the reorder domains could be mixed up.
-        *
-        * Even read requests may start a new epoch and queue the corresponding
-        * barrier packet.  To get the write ordering right, we only have to
-        * make sure that, if this is a write request and it triggered a
-        * barrier packet, this request is queued within the same spinlock. */
-       if ((remote || send_oos) && mdev->unused_spare_tle &&
-           drbd_test_and_clear_flag(mdev, CREATE_BARRIER)) {
-               _tl_add_barrier(mdev, mdev->unused_spare_tle);
-               mdev->unused_spare_tle = NULL;
-       } else {
-               D_ASSERT(!(remote && rw == WRITE &&
-                          drbd_test_flag(mdev, CREATE_BARRIER)));
+       /* We fail READ/READA early if we cannot serve it.
+        * We must do this before req is registered on any lists.
+        * Otherwise, drbd_req_complete() will queue a failed READ for retry. */
+       if (rw != WRITE) {
+               if (!do_remote_read(req) && !req->private_bio)
+                       goto nodata;
        }
  
-       /* NOTE
-        * Actually, 'local' may be wrong here already, since we may have failed
-        * to write to the meta data, and may become wrong anytime because of
-        * local io-error for some other request, which would lead to us
-        * "detaching" the local disk.
-        *
-        * 'remote' may become wrong any time because the network could fail.
-        *
-        * This is a harmless race condition, though, since it is handled
-        * correctly at the appropriate places; so it just defers the failure
-        * of the respective operation.
-        */
-       /* mark them early for readability.
-        * this just sets some state flags. */
-       if (remote)
-               _req_mod(req, to_be_send);
-       if (local)
-               _req_mod(req, to_be_submitted);
-       /* check this request on the collision detection hash tables.
-        * if we have a conflict, just complete it here.
-        * THINK do we want to check reads, too? (I don't think so...) */
-       if (rw == WRITE && _req_conflicts(req))
-               goto fail_conflicting;
+       /* which transfer log epoch does this belong to? */
+       req->epoch = atomic_read(&mdev->tconn->current_tle_nr);
  
        /* no point in adding empty flushes to the transfer log,
         * they are mapped to drbd barriers already. */
-       if (likely(size!=0))
-               list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
+       if (likely(req->i.size != 0)) {
+               if (rw == WRITE)
+                       mdev->tconn->current_tle_writes++;
  
-       /* NOTE remote first: to get the concurrent write detection right,
-        * we must register the request before start of local IO.  */
-       if (remote) {
-               /* either WRITE and C_CONNECTED,
-                * or READ, and no local disk,
-                * or READ, but not in sync.
-                */
-               _req_mod(req, (rw == WRITE)
-                               ? queue_for_net_write
-                               : queue_for_net_read);
+               list_add_tail(&req->tl_requests, &mdev->tconn->transfer_log);
        }
-       if (send_oos && drbd_set_out_of_sync(mdev, sector, size))
-               _req_mod(req, queue_for_send_oos);
-       if (remote &&
-           mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96)
-               maybe_pull_ahead(mdev);
  
-       /* If this was a flush, queue a drbd barrier/start a new epoch.
-        * Unless the current epoch was empty anyways, or we are not currently
-        * replicating, in which case there is no point. */
-       if (unlikely(bio->bi_rw & REQ_FLUSH)
-               && mdev->newest_tle->n_writes
-               && drbd_should_do_remote(mdev->state))
-               queue_barrier(mdev);
-       spin_unlock_irq(&mdev->req_lock);
-       kfree(b); /* if someone else has beaten us to it... */
-       if (local) {
-               req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
-               /* State may have changed since we grabbed our reference on the
-                * mdev->ldev member. Double check, and short-circuit to endio.
-                * In case the last activity log transaction failed to get on
-                * stable storage, and this is a WRITE, we may not even submit
-                * this bio. */
-               if (get_ldev(mdev)) {
-                       if (drbd_insert_fault(mdev,   rw == WRITE ? DRBD_FAULT_DT_WR
-                                                   : rw == READ  ? DRBD_FAULT_DT_RD
-                                                   :               DRBD_FAULT_DT_RA))
-                               bio_endio(req->private_bio, -EIO);
-                       else
-                               generic_make_request(req->private_bio);
-                       put_ldev(mdev);
+       if (rw == WRITE) {
+               if (!drbd_process_write_request(req))
+                       no_remote = true;
+       } else {
+               /* We either have a private_bio, or we can read from remote.
+                * Otherwise we had done the goto nodata above. */
+               if (req->private_bio == NULL) {
+                       _req_mod(req, TO_BE_SENT);
+                       _req_mod(req, QUEUE_FOR_NET_READ);
                } else
-                       bio_endio(req->private_bio, -EIO);
+                       no_remote = true;
        }
  
-       return 0;
- fail_conflicting:
-       /* this is a conflicting request.
-        * even though it may have been only _partially_
-        * overlapping with one of the currently pending requests,
-        * without even submitting or sending it, we will
-        * pretend that it was successfully served right now.
-        */
-       _drbd_end_io_acct(mdev, req);
-       spin_unlock_irq(&mdev->req_lock);
-       if (remote)
-               dec_ap_pending(mdev);
-       /* THINK: do we want to fail it (-EIO), or pretend success?
-        * this pretends success. */
-       err = 0;
- fail_free_complete:
-       if (req->rq_state & RQ_IN_ACT_LOG)
-               drbd_al_complete_io(mdev, sector);
- fail_and_free_req:
-       if (local) {
-               bio_put(req->private_bio);
-               req->private_bio = NULL;
-               put_ldev(mdev);
+       if (req->private_bio) {
+               /* needs to be marked within the same spinlock */
+               _req_mod(req, TO_BE_SUBMITTED);
+               /* but we need to give up the spinlock to submit */
+               spin_unlock_irq(&mdev->tconn->req_lock);
+               drbd_submit_req_private_bio(req);
+               spin_lock_irq(&mdev->tconn->req_lock);
+       } else if (no_remote) {
+ nodata:
+               if (__ratelimit(&drbd_ratelimit_state))
+                       dev_err(DEV, "IO ERROR: neither local nor remote data, sector %llu+%u\n",
+                                       (unsigned long long)req->i.sector, req->i.size >> 9);
+               /* A write may have been queued for send_oos, however.
+                * So we cannot simply free it; we must go through drbd_req_put_completion_ref(). */
        }
-       if (!ret)
-               bio_endio(bio, err);
-       drbd_req_free(req);
-       dec_ap_bio(mdev);
-       kfree(b);
-       return ret;
- }
  
- /* helper function for drbd_make_request
-  * if we can determine just by the mdev (state) that this request will fail,
-  * return 1
-  * otherwise return 0
-  */
- static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
- {
-       if (mdev->state.role != R_PRIMARY &&
-               (!allow_oos || is_write)) {
-               if (__ratelimit(&drbd_ratelimit_state)) {
-                       dev_err(DEV, "Process %s[%u] tried to %s; "
-                           "since we are not in Primary state, "
-                           "we cannot allow this\n",
-                           current->comm, current->pid,
-                           is_write ? "WRITE" : "READ");
-               }
-               return 1;
-       }
+ out:
+       if (drbd_req_put_completion_ref(req, &m, 1))
+               kref_put(&req->kref, drbd_req_destroy);
+       spin_unlock_irq(&mdev->tconn->req_lock);
  
-       return 0;
+       if (m.bio)
+               complete_master_bio(mdev, &m);
+       return;
  }
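
drbd_req_put_completion_ref() itself is outside this hunk; judging only from its call sites here and in mod_rq_state(), its contract can be sketched as follows (a hedged reconstruction, not the patch's code -- the real function also has to deal with RQ_POSTPONED restarts):

	static int sketch_put_completion_ref(struct drbd_request *req,
					     struct bio_and_error *m, int put)
	{
		if (!atomic_sub_and_test(put, &req->completion_ref))
			return 0;		/* request still busy */
		drbd_req_complete(req, m);	/* hands the master bio to *m */
		return 1;			/* caller puts the kref held for completion */
	}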
  
 -int drbd_make_request(struct request_queue *q, struct bio *bio)
 +void drbd_make_request(struct request_queue *q, struct bio *bio)
  {
-       unsigned int s_enr, e_enr;
        struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
        unsigned long start_time;
  
-       if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
-               bio_endio(bio, -EPERM);
-               return;
-       }
        start_time = jiffies;
  
        /*
         * what we "blindly" assume:
         */
-       D_ASSERT((bio->bi_size & 0x1ff) == 0);
-       /* to make some things easier, force alignment of requests within the
-        * granularity of our hash tables */
-       s_enr = bio->bi_sector >> HT_SHIFT;
-       e_enr = bio->bi_size ? (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT : s_enr;
-       if (likely(s_enr == e_enr)) {
-               do {
-                       inc_ap_bio(mdev, 1);
-               } while (drbd_make_request_common(mdev, bio, start_time));
-               return;
-       }
-       /* can this bio be split generically?
-        * Maybe add our own split-arbitrary-bios function. */
-       if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_BIO_SIZE) {
-               /* rather error out here than BUG in bio_split */
-               dev_err(DEV, "bio would need to, but cannot, be split: "
-                   "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
-                   bio->bi_vcnt, bio->bi_idx, bio->bi_size,
-                   (unsigned long long)bio->bi_sector);
-               bio_endio(bio, -EINVAL);
-       } else {
-               /* This bio crosses some boundary, so we have to split it. */
-               struct bio_pair *bp;
-               /* works for the "do not cross hash slot boundaries" case
-                * e.g. sector 262269, size 4096
-                * s_enr = 262269 >> 6 = 4097
-                * e_enr = (262269+8-1) >> 6 = 4098
-                * HT_SHIFT = 6
-                * sps = 64, mask = 63
-                * first_sectors = 64 - (262269 & 63) = 3
-                */
-               const sector_t sect = bio->bi_sector;
-               const int sps = 1 << HT_SHIFT; /* sectors per slot */
-               const int mask = sps - 1;
-               const sector_t first_sectors = sps - (sect & mask);
-               bp = bio_split(bio, first_sectors);
+       D_ASSERT(IS_ALIGNED(bio->bi_size, 512));
  
-               /* we need to get a "reference count" (ap_bio_cnt)
-                * to avoid races with the disconnect/reconnect/suspend code.
-                * In case we need to split the bio here, we need to get three references
-                * atomically, otherwise we might deadlock when trying to submit the
-                * second one! */
-               inc_ap_bio(mdev, 3);
-               D_ASSERT(e_enr == s_enr + 1);
-               while (drbd_make_request_common(mdev, &bp->bio1, start_time))
-                       inc_ap_bio(mdev, 1);
-               while (drbd_make_request_common(mdev, &bp->bio2, start_time))
-                       inc_ap_bio(mdev, 1);
-               dec_ap_bio(mdev);
-               bio_pair_release(bp);
-       }
+       inc_ap_bio(mdev);
+       __drbd_make_request(mdev, bio, start_time);
 -
 -      return 0;
  }
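
The int -> void signature change follows the block layer of this kernel generation, where make_request functions no longer return a remap hint. The hook keeps being registered from drbd_main.c, essentially:

	blk_queue_make_request(q, drbd_make_request);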
  
- /* This is called by bio_add_page().  With this function we reduce
-  * the number of BIOs that span over multiple DRBD_MAX_BIO_SIZEs
-  * units (was AL_EXTENTs).
+ /* This is called by bio_add_page().
+  *
+  * q->max_hw_sectors and other global limits are already enforced there.
   *
-  * we do the calculation within the lower 32bit of the byte offsets,
-  * since we don't care for actual offset, but only check whether it
-  * would cross "activity log extent" boundaries.
+  * We need to call down to our lower level device,
+  * in case it has special restrictions.
+  *
+  * We also may need to enforce configured max-bio-bvecs limits.
   *
   * As long as the BIO is empty we have to allow at least one bvec,
-  * regardless of size and offset.  so the resulting bio may still
-  * cross extent boundaries.  those are dealt with (bio_split) in
-  * drbd_make_request.
+  * regardless of size and offset, so no need to ask lower levels.
   */
  int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
  {
        struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
-       unsigned int bio_offset =
-               (unsigned int)bvm->bi_sector << 9; /* 32 bit */
        unsigned int bio_size = bvm->bi_size;
-       int limit, backing_limit;
-       limit = DRBD_MAX_BIO_SIZE
-             - ((bio_offset & (DRBD_MAX_BIO_SIZE-1)) + bio_size);
-       if (limit < 0)
-               limit = 0;
-       if (bio_size == 0) {
-               if (limit <= bvec->bv_len)
-                       limit = bvec->bv_len;
-       } else if (limit && get_ldev(mdev)) {
+       int limit = DRBD_MAX_BIO_SIZE;
+       int backing_limit;
+       if (bio_size && get_ldev(mdev)) {
                struct request_queue * const b =
                        mdev->ldev->backing_bdev->bd_disk->queue;
                if (b->merge_bvec_fn) {
        return limit;
  }
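
To see what the dropped alignment arithmetic did: the old drbd_merge_bvec() refused to let a bio grow across a DRBD_MAX_BIO_SIZE-aligned boundary. A worked example in comment form, assuming DRBD_MAX_BIO_SIZE is 1 MiB (the real value is defined in drbd_int.h):

    /* Old behaviour, illustrative numbers only (DRBD_MAX_BIO_SIZE assumed 1 MiB):
     * a bio starting at byte offset 0xFF000 (1 MiB - 4 KiB) with bio_size 4096 got
     *   limit = 0x100000 - ((0xFF000 & 0xFFFFF) + 4096) = 0
     * so no further bvec could be merged past the 1 MiB boundary.
     * The new code starts from limit = DRBD_MAX_BIO_SIZE and, for non-empty
     * bios, only consults the backing device's merge_bvec_fn, since request
     * handling no longer requires bios confined to aligned extents. */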
  
+ struct drbd_request *find_oldest_request(struct drbd_tconn *tconn)
+ {
+       /* Walk the transfer log,
+        * and find the oldest not yet completed request */
+       struct drbd_request *r;
+       list_for_each_entry(r, &tconn->transfer_log, tl_requests) {
+               if (atomic_read(&r->completion_ref))
+                       return r;
+       }
+       return NULL;
+ }
+
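The helper relies on the transfer log being appended to in submission order and on completion_ref dropping to zero once a request completes, so the first entry with a nonzero completion_ref is the oldest still-pending request. A hypothetical caller, which like request_timer_fn() below must hold tconn->req_lock:

    struct drbd_request *req;

    spin_lock_irq(&tconn->req_lock);
    req = find_oldest_request(tconn);
    if (req)
            printk(KERN_INFO "drbd: oldest request pending for %u ms\n",
                   jiffies_to_msecs(jiffies - req->start_time));
    spin_unlock_irq(&tconn->req_lock);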
  void request_timer_fn(unsigned long data)
  {
        struct drbd_conf *mdev = (struct drbd_conf *) data;
+       struct drbd_tconn *tconn = mdev->tconn;
        struct drbd_request *req; /* oldest request */
-       struct list_head *le;
+       struct net_conf *nc;
        unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
        unsigned long now;
  
-       if (get_net_conf(mdev)) {
-               if (mdev->state.conn >= C_WF_REPORT_PARAMS)
-                       ent = mdev->net_conf->timeout*HZ/10
-                               * mdev->net_conf->ko_count;
-               put_net_conf(mdev);
-       }
+       rcu_read_lock();
+       nc = rcu_dereference(tconn->net_conf);
+       if (nc && mdev->state.conn >= C_WF_REPORT_PARAMS)
+               ent = nc->timeout * HZ/10 * nc->ko_count;
        if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */
-               dt = mdev->ldev->dc.disk_timeout * HZ / 10;
+               dt = rcu_dereference(mdev->ldev->disk_conf)->disk_timeout * HZ / 10;
                put_ldev(mdev);
        }
+       rcu_read_unlock();
        et = min_not_zero(dt, ent);
  
       if (!et)
               return; /* Recurring timer stopped, or no timeout value */
  
        now = jiffies;
  
-       spin_lock_irq(&mdev->req_lock);
-       le = &mdev->oldest_tle->requests;
-       if (list_empty(le)) {
-               spin_unlock_irq(&mdev->req_lock);
+       spin_lock_irq(&tconn->req_lock);
+       req = find_oldest_request(tconn);
+       if (!req) {
+               spin_unlock_irq(&tconn->req_lock);
                mod_timer(&mdev->request_timer, now + et);
                return;
        }
  
-       le = le->prev;
-       req = list_entry(le, struct drbd_request, tl_requests);
        /* The request is considered timed out, if
         * - we have some effective timeout from the configuration,
         *   with above state restrictions applied,
         */
        if (ent && req->rq_state & RQ_NET_PENDING &&
                 time_after(now, req->start_time + ent) &&
-               !time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) {
+               !time_in_range(now, tconn->last_reconnect_jif, tconn->last_reconnect_jif + ent)) {
                dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
                _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
        }
-       if (dt && req->rq_state & RQ_LOCAL_PENDING &&
+       if (dt && req->rq_state & RQ_LOCAL_PENDING && req->w.mdev == mdev &&
                 time_after(now, req->start_time + dt) &&
                !time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) {
                dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n");
                __drbd_chk_io_error(mdev, DRBD_FORCE_DETACH);
        }
        nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et;
-       spin_unlock_irq(&mdev->req_lock);
+       spin_unlock_irq(&tconn->req_lock);
        mod_timer(&mdev->request_timer, nt);
  }
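
Note that the net configuration is now read under rcu_read_lock()/rcu_dereference() instead of the old get_net_conf()/put_net_conf() reference counting. To make the effective-timeout arithmetic concrete, a worked example with plausible settings (both timeout knobs are in units of 0.1 s; the numbers below are assumptions, not necessarily your configuration):

    /* Assume HZ = 1000, nc->timeout = 60 (6.0 s), nc->ko_count = 7,
     * and disk_timeout = 0 (disabled):
     *   ent = 60 * 1000/10 * 7 = 42000 jiffies = 42 s
     *   dt  = 0
     *   et  = min_not_zero(dt, ent) = 42000
     * A request still RQ_NET_PENDING 42 s after its start_time (and not
     * within ent of the last reconnect) forces the connection to C_TIMEOUT.
     * The timer is then re-armed for start_time + et, or for now + et if
     * that moment has already passed. */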
diff --combined include/linux/idr.h
index 87259a44c251472595598f0f6ed0852655171554,52a9da295296ce072a5326b70441e8eda9b8eff9..de7e190f1af4ea7d7b596a3a224e40ccbc7a6a44
  #define IDR_SIZE (1 << IDR_BITS)
  #define IDR_MASK ((1 << IDR_BITS)-1)
  
 -#define MAX_ID_SHIFT (sizeof(int)*8 - 1)
 -#define MAX_ID_BIT (1U << MAX_ID_SHIFT)
 -#define MAX_ID_MASK (MAX_ID_BIT - 1)
 +#define MAX_IDR_SHIFT (sizeof(int)*8 - 1)
 +#define MAX_IDR_BIT (1U << MAX_IDR_SHIFT)
 +#define MAX_IDR_MASK (MAX_IDR_BIT - 1)
  
  /* Leave the possibility of an incomplete final layer */
 -#define MAX_LEVEL (MAX_ID_SHIFT + IDR_BITS - 1) / IDR_BITS
 +#define MAX_IDR_LEVEL ((MAX_IDR_SHIFT + IDR_BITS - 1) / IDR_BITS)
  
  /* Number of id_layer structs to leave in free list */
 -#define IDR_FREE_MAX MAX_LEVEL + MAX_LEVEL
 +#define MAX_IDR_FREE (MAX_IDR_LEVEL * 2)
  
  struct idr_layer {
        unsigned long            bitmap; /* A zero bit means "space here" */
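
The rename in idr.h is not purely cosmetic: the old macros were unparenthesized and therefore unsafe inside larger expressions. For instance:

    /* With the old definitions
     *   #define MAX_LEVEL (MAX_ID_SHIFT + IDR_BITS - 1) / IDR_BITS
     *   #define IDR_FREE_MAX MAX_LEVEL + MAX_LEVEL
     * an expression such as
     *   int n = IDR_FREE_MAX * 2;
     * expands to MAX_LEVEL + MAX_LEVEL * 2, i.e. 3 * MAX_LEVEL, not
     * (MAX_LEVEL + MAX_LEVEL) * 2. The new MAX_IDR_LEVEL and MAX_IDR_FREE
     * are fully parenthesized, and the MAX_IDR_ prefix keeps them out of
     * the way of other subsystems' MAX_LEVEL-style names. */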
@@@ -152,4 -152,15 +152,15 @@@ void ida_simple_remove(struct ida *ida
  
  void __init idr_init_cache(void);
  
+ /**
+  * idr_for_each_entry - iterate over an idr's elements of a given type
+  * @idp:     idr handle
+  * @entry:   the type * to use as cursor
+  * @id:      id entry's key
+  */
+ #define idr_for_each_entry(idp, entry, id)                            \
+       for (id = 0, entry = (typeof(entry))idr_get_next((idp), &(id)); \
+            entry != NULL;                                             \
+            ++id, entry = (typeof(entry))idr_get_next((idp), &(id)))
  #endif /* __IDR_H__ */
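
A short usage sketch of the new iterator, mirroring how the DRBD code in this merge walks the volumes of a connection (tconn->volumes and the DEV logging macro are DRBD-side names, shown here as an assumed context; note that @id must be an int, since idr_get_next() takes an int *):

    struct drbd_conf *mdev;
    int vnr;

    idr_for_each_entry(&tconn->volumes, mdev, vnr)
            dev_info(DEV, "volume %d: disk state %s\n",
                     vnr, drbd_disk_str(mdev->state.disk));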