Merge branch 'integrity-check-patch-v2' of git://btrfs.giantdisaster.de/git/btrfs...
author     Chris Mason <chris.mason@oracle.com>
           Mon, 16 Jan 2012 20:27:58 +0000 (15:27 -0500)
committer  Chris Mason <chris.mason@oracle.com>
           Mon, 16 Jan 2012 20:27:58 +0000 (15:27 -0500)
Conflicts:
fs/btrfs/ctree.h
fs/btrfs/super.c

Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/Makefile
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent_io.c
fs/btrfs/scrub.c
fs/btrfs/super.c
fs/btrfs/volumes.c

diff --combined fs/btrfs/Makefile
index 70798407b9a2ebd48498319496efd1e5ba33789b,bc5b3556cee689f02d8e9a0fb8c7752ab77307fa..0c4fa2befae793f1a6845322d7ba71aaa5da4374
@@@ -8,6 -8,7 +8,7 @@@ btrfs-y += super.o ctree.o extent-tree.
           extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
           export.o tree-log.o free-space-cache.o zlib.o lzo.o \
           compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
 -         reada.o backref.o
 +         reada.o backref.o ulist.o
  
  btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
+ btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --combined fs/btrfs/ctree.h
index b6d1020c4571870660998d61ea4cf34cfbaaca35,39f6188688e602174de1d0e91029522baafa9d55..3c2cbf7b666355b1106fb3fae80451b66ac56086
@@@ -86,9 -86,6 +86,9 @@@ struct btrfs_ordered_sum
  /* holds checksums of all the data extents */
  #define BTRFS_CSUM_TREE_OBJECTID 7ULL
  
 +/* for storing balance parameters in the root tree */
 +#define BTRFS_BALANCE_OBJECTID -4ULL
 +
  /* orphan objectid for tracking unlinked/truncated files */
  #define BTRFS_ORPHAN_OBJECTID -5ULL
  
@@@ -695,54 -692,6 +695,54 @@@ struct btrfs_root_ref 
        __le16 name_len;
  } __attribute__ ((__packed__));
  
 +struct btrfs_disk_balance_args {
 +      /*
 +       * profiles to operate on, single is denoted by
 +       * BTRFS_AVAIL_ALLOC_BIT_SINGLE
 +       */
 +      __le64 profiles;
 +
 +      /* usage filter */
 +      __le64 usage;
 +
 +      /* devid filter */
 +      __le64 devid;
 +
 +      /* devid subset filter [pstart..pend) */
 +      __le64 pstart;
 +      __le64 pend;
 +
 +      /* btrfs virtual address space subset filter [vstart..vend) */
 +      __le64 vstart;
 +      __le64 vend;
 +
 +      /*
 +       * profile to convert to, single is denoted by
 +       * BTRFS_AVAIL_ALLOC_BIT_SINGLE
 +       */
 +      __le64 target;
 +
 +      /* BTRFS_BALANCE_ARGS_* */
 +      __le64 flags;
 +
 +      __le64 unused[8];
 +} __attribute__ ((__packed__));
 +
 +/*
 + * store balance parameters to disk so that balance can be properly
 + * resumed after crash or unmount
 + */
 +struct btrfs_balance_item {
 +      /* BTRFS_BALANCE_* */
 +      __le64 flags;
 +
 +      struct btrfs_disk_balance_args data;
 +      struct btrfs_disk_balance_args meta;
 +      struct btrfs_disk_balance_args sys;
 +
 +      __le64 unused[4];
 +} __attribute__ ((__packed__));
 +
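
A quick sanity check on the two packed layouts above: together they pin down 17 and 1 + 3*17 + 4 = 56 eight-byte words, i.e. 136 and 448 bytes on disk. A minimal standalone sketch that mirrors them in userspace and asserts the sizes at compile time (all names are local to the sketch and __le64 is modelled as plain uint64_t; this is not kernel code):

#include <stdint.h>
#include <stdio.h>

/* sketch-local mirrors of the two on-disk structures above */
struct disk_balance_args {
        uint64_t profiles;
        uint64_t usage;
        uint64_t devid;
        uint64_t pstart, pend;          /* [pstart..pend) */
        uint64_t vstart, vend;          /* [vstart..vend) */
        uint64_t target;
        uint64_t flags;
        uint64_t unused[8];
} __attribute__ ((__packed__));

struct balance_item {
        uint64_t flags;
        struct disk_balance_args data, meta, sys;
        uint64_t unused[4];
} __attribute__ ((__packed__));

/* 17 words and 1 + 3 * 17 + 4 = 56 words, 8 bytes each */
_Static_assert(sizeof(struct disk_balance_args) == 136, "args size");
_Static_assert(sizeof(struct balance_item) == 448, "item size");

int main(void)
{
        printf("args=%zu item=%zu\n", sizeof(struct disk_balance_args),
               sizeof(struct balance_item));
        return 0;
}
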
  #define BTRFS_FILE_EXTENT_INLINE 0
  #define BTRFS_FILE_EXTENT_REG 1
  #define BTRFS_FILE_EXTENT_PREALLOC 2
@@@ -802,32 -751,14 +802,32 @@@ struct btrfs_csum_item 
  } __attribute__ ((__packed__));
  
  /* different types of block groups (and chunks) */
 -#define BTRFS_BLOCK_GROUP_DATA     (1 << 0)
 -#define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
 -#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
 -#define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
 -#define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
 -#define BTRFS_BLOCK_GROUP_DUP    (1 << 5)
 -#define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
 -#define BTRFS_NR_RAID_TYPES      5
 +#define BTRFS_BLOCK_GROUP_DATA                (1ULL << 0)
 +#define BTRFS_BLOCK_GROUP_SYSTEM      (1ULL << 1)
 +#define BTRFS_BLOCK_GROUP_METADATA    (1ULL << 2)
 +#define BTRFS_BLOCK_GROUP_RAID0               (1ULL << 3)
 +#define BTRFS_BLOCK_GROUP_RAID1               (1ULL << 4)
 +#define BTRFS_BLOCK_GROUP_DUP         (1ULL << 5)
 +#define BTRFS_BLOCK_GROUP_RAID10      (1ULL << 6)
 +#define BTRFS_BLOCK_GROUP_RESERVED    BTRFS_AVAIL_ALLOC_BIT_SINGLE
 +#define BTRFS_NR_RAID_TYPES           5
 +
 +#define BTRFS_BLOCK_GROUP_TYPE_MASK   (BTRFS_BLOCK_GROUP_DATA |    \
 +                                       BTRFS_BLOCK_GROUP_SYSTEM |  \
 +                                       BTRFS_BLOCK_GROUP_METADATA)
 +
 +#define BTRFS_BLOCK_GROUP_PROFILE_MASK        (BTRFS_BLOCK_GROUP_RAID0 |   \
 +                                       BTRFS_BLOCK_GROUP_RAID1 |   \
 +                                       BTRFS_BLOCK_GROUP_DUP |     \
 +                                       BTRFS_BLOCK_GROUP_RAID10)
 +/*
 + * We need a bit for restriper to be able to tell when chunks of type
 + * SINGLE are available.  This "extended" profile format is used in
 + * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
 + * (on-disk).  The corresponding on-disk bit in chunk.type is reserved
 + * to avoid remapping between the two formats in the future.
 + */
 +#define BTRFS_AVAIL_ALLOC_BIT_SINGLE  (1ULL << 48)
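
To illustrate the extended format the comment above describes, a small standalone sketch (values copied from the defines above, names sketch-local): an on-disk chunk type with no profile bits set denotes SINGLE, and mapping it into the extended format gives it the dedicated bit so it can be tracked in avail_*_alloc_bits alongside the other profiles.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define BLOCK_GROUP_RAID0       (1ULL << 3)
#define BLOCK_GROUP_RAID1       (1ULL << 4)
#define BLOCK_GROUP_DUP         (1ULL << 5)
#define BLOCK_GROUP_RAID10      (1ULL << 6)
#define PROFILE_MASK            (BLOCK_GROUP_RAID0 | BLOCK_GROUP_RAID1 | \
                                 BLOCK_GROUP_DUP | BLOCK_GROUP_RAID10)
#define AVAIL_ALLOC_BIT_SINGLE  (1ULL << 48)

/*
 * On disk a SINGLE chunk simply has no profile bits set; in the
 * extended format it gets the dedicated bit so that "single chunks
 * exist" becomes representable.
 */
static uint64_t extended_profile(uint64_t chunk_type)
{
        uint64_t profile = chunk_type & PROFILE_MASK;

        return profile ? profile : AVAIL_ALLOC_BIT_SINGLE;
}

int main(void)
{
        printf("raid1  -> %#" PRIx64 "\n", extended_profile(BLOCK_GROUP_RAID1));
        printf("single -> %#" PRIx64 "\n", extended_profile(0));
        return 0;
}
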
  
  struct btrfs_block_group_item {
        __le64 used;
@@@ -985,7 -916,6 +985,7 @@@ struct btrfs_block_group_cache 
  struct reloc_control;
  struct btrfs_device;
  struct btrfs_fs_devices;
 +struct btrfs_balance_control;
  struct btrfs_delayed_root;
  struct btrfs_fs_info {
        u8 fsid[BTRFS_FSID_SIZE];
         * is required instead of the faster short fsync log commits
         */
        u64 last_trans_log_full_commit;
-       unsigned long mount_opt:20;
+       unsigned long mount_opt:21;
        unsigned long compress_type:4;
        u64 max_inline;
        u64 alloc_start;
        spinlock_t ref_cache_lock;
        u64 total_ref_cache_size;
  
 +      /*
 +       * these three are in extended format (availability of single
 +       * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
 +       * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
 +       */
        u64 avail_data_alloc_bits;
        u64 avail_metadata_alloc_bits;
        u64 avail_system_alloc_bits;
 -      u64 data_alloc_profile;
 -      u64 metadata_alloc_profile;
 -      u64 system_alloc_profile;
 +
 +      /* restriper state */
 +      spinlock_t balance_lock;
 +      struct mutex balance_mutex;
 +      atomic_t balance_running;
 +      atomic_t balance_pause_req;
 +      atomic_t balance_cancel_req;
 +      struct btrfs_balance_control *balance_ctl;
 +      wait_queue_head_t balance_wait_q;
  
        unsigned data_chunk_allocations;
        unsigned metadata_ratio;
        int scrub_workers_refcnt;
        struct btrfs_workers scrub_workers;
  
+ #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       u32 check_integrity_print_mask;
+ #endif
        /* filesystem state */
        u64 fs_state;
  
@@@ -1464,8 -1387,6 +1468,8 @@@ struct btrfs_ioctl_defrag_range_args 
  #define BTRFS_DEV_ITEM_KEY    216
  #define BTRFS_CHUNK_ITEM_KEY  228
  
 +#define BTRFS_BALANCE_ITEM_KEY        248
 +
  /*
   * string items are for debugging.  They just store a short string of
   * data in the FS
  #define BTRFS_MOUNT_AUTO_DEFRAG               (1 << 16)
  #define BTRFS_MOUNT_INODE_MAP_CACHE   (1 << 17)
  #define BTRFS_MOUNT_RECOVERY          (1 << 18)
 -#define BTRFS_MOUNT_CHECK_INTEGRITY   (1 << 19)
 -#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 20)
 +#define BTRFS_MOUNT_SKIP_BALANCE      (1 << 19)
++#define BTRFS_MOUNT_CHECK_INTEGRITY   (1 << 20)
++#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
  
  #define btrfs_clear_opt(o, opt)               ((o) &= ~BTRFS_MOUNT_##opt)
  #define btrfs_set_opt(o, opt)         ((o) |= BTRFS_MOUNT_##opt)
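
These helpers rely on token pasting: the caller names only the option suffix and the preprocessor glues on the BTRFS_MOUNT_ prefix. A standalone sketch of the expansion, using two of the bits defined above:

#include <stdio.h>

#define BTRFS_MOUNT_SKIP_BALANCE        (1 << 19)
#define BTRFS_MOUNT_CHECK_INTEGRITY     (1 << 20)

/* the same token-pasting helpers as above, copied into the sketch */
#define btrfs_clear_opt(o, opt)         ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt)           ((o) |= BTRFS_MOUNT_##opt)

int main(void)
{
        unsigned long mount_opt = 0;

        /* expands to: mount_opt |= BTRFS_MOUNT_SKIP_BALANCE */
        btrfs_set_opt(mount_opt, SKIP_BALANCE);
        btrfs_set_opt(mount_opt, CHECK_INTEGRITY);
        btrfs_clear_opt(mount_opt, CHECK_INTEGRITY);

        printf("mount_opt = %#lx\n", mount_opt);        /* 0x80000 */
        return 0;
}
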
@@@ -2161,86 -2083,8 +2167,86 @@@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_u
  BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
                   num_devices, 64);
  
 -/* struct btrfs_super_block */
 +/* struct btrfs_balance_item */
 +BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
 +
 +static inline void btrfs_balance_data(struct extent_buffer *eb,
 +                                    struct btrfs_balance_item *bi,
 +                                    struct btrfs_disk_balance_args *ba)
 +{
 +      read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
 +}
  
 +static inline void btrfs_set_balance_data(struct extent_buffer *eb,
 +                                        struct btrfs_balance_item *bi,
 +                                        struct btrfs_disk_balance_args *ba)
 +{
 +      write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
 +}
 +
 +static inline void btrfs_balance_meta(struct extent_buffer *eb,
 +                                    struct btrfs_balance_item *bi,
 +                                    struct btrfs_disk_balance_args *ba)
 +{
 +      read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
 +}
 +
 +static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
 +                                        struct btrfs_balance_item *bi,
 +                                        struct btrfs_disk_balance_args *ba)
 +{
 +      write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
 +}
 +
 +static inline void btrfs_balance_sys(struct extent_buffer *eb,
 +                                   struct btrfs_balance_item *bi,
 +                                   struct btrfs_disk_balance_args *ba)
 +{
 +      read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
 +}
 +
 +static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
 +                                       struct btrfs_balance_item *bi,
 +                                       struct btrfs_disk_balance_args *ba)
 +{
 +      write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
 +}
 +
 +static inline void
 +btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
 +                             struct btrfs_disk_balance_args *disk)
 +{
 +      memset(cpu, 0, sizeof(*cpu));
 +
 +      cpu->profiles = le64_to_cpu(disk->profiles);
 +      cpu->usage = le64_to_cpu(disk->usage);
 +      cpu->devid = le64_to_cpu(disk->devid);
 +      cpu->pstart = le64_to_cpu(disk->pstart);
 +      cpu->pend = le64_to_cpu(disk->pend);
 +      cpu->vstart = le64_to_cpu(disk->vstart);
 +      cpu->vend = le64_to_cpu(disk->vend);
 +      cpu->target = le64_to_cpu(disk->target);
 +      cpu->flags = le64_to_cpu(disk->flags);
 +}
 +
 +static inline void
 +btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
 +                             struct btrfs_balance_args *cpu)
 +{
 +      memset(disk, 0, sizeof(*disk));
 +
 +      disk->profiles = cpu_to_le64(cpu->profiles);
 +      disk->usage = cpu_to_le64(cpu->usage);
 +      disk->devid = cpu_to_le64(cpu->devid);
 +      disk->pstart = cpu_to_le64(cpu->pstart);
 +      disk->pend = cpu_to_le64(cpu->pend);
 +      disk->vstart = cpu_to_le64(cpu->vstart);
 +      disk->vend = cpu_to_le64(cpu->vend);
 +      disk->target = cpu_to_le64(cpu->target);
 +      disk->flags = cpu_to_le64(cpu->flags);
 +}
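
For reference, a userspace analogue of one field's round trip through the pair above; htole64()/le64toh() from glibc's <endian.h> stand in for cpu_to_le64()/le64_to_cpu() (an assumption of this sketch, not the kernel API):

#include <endian.h>
#include <stdint.h>
#include <assert.h>

int main(void)
{
        uint64_t cpu_usage = 90;
        uint64_t disk_usage = htole64(cpu_usage);       /* cpu_to_le64 */

        /* le64_to_cpu: identical bytes on LE hosts, swapped on BE */
        assert(le64toh(disk_usage) == cpu_usage);
        return 0;
}
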
 +
 +/* struct btrfs_super_block */
  BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
  BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
  BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
@@@ -2439,11 -2283,11 +2445,11 @@@ struct extent_buffer *btrfs_alloc_free_
                                        struct btrfs_root *root, u32 blocksize,
                                        u64 parent, u64 root_objectid,
                                        struct btrfs_disk_key *key, int level,
 -                                      u64 hint, u64 empty_size);
 +                                      u64 hint, u64 empty_size, int for_cow);
  void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct extent_buffer *buf,
 -                         u64 parent, int last_ref);
 +                         u64 parent, int last_ref, int for_cow);
  struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
                                            u64 bytenr, u32 blocksize,
@@@ -2463,17 -2307,17 +2469,17 @@@ int btrfs_reserve_extent(struct btrfs_t
                                  u64 search_end, struct btrfs_key *ins,
                                  u64 data);
  int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 -                struct extent_buffer *buf, int full_backref);
 +                struct extent_buffer *buf, int full_backref, int for_cow);
  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 -                struct extent_buffer *buf, int full_backref);
 +                struct extent_buffer *buf, int full_backref, int for_cow);
  int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 flags,
                                int is_data);
  int btrfs_free_extent(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
 -                    u64 bytenr, u64 num_bytes, u64 parent,
 -                    u64 root_objectid, u64 owner, u64 offset);
 +                    u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
 +                    u64 owner, u64 offset, int for_cow);
  
  int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
  int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
@@@ -2485,7 -2329,7 +2491,7 @@@ int btrfs_finish_extent_commit(struct b
  int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
 -                       u64 root_objectid, u64 owner, u64 offset);
 +                       u64 root_objectid, u64 owner, u64 offset, int for_cow);
  
  int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root);
@@@ -2644,18 -2488,10 +2650,18 @@@ static inline int btrfs_insert_empty_it
  }
  
  int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 +static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
 +{
 +      ++p->slots[0];
 +      if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
 +              return btrfs_next_leaf(root, p);
 +      return 0;
 +}
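
btrfs_next_item() above packages the common iteration step: bump the slot within the current leaf and fall back to btrfs_next_leaf() once it runs past nritems. A toy standalone model of that cursor logic (every type and name here is sketch-local; nothing is the kernel API):

#include <stdio.h>

struct toy_leaf { int nritems; int items[8]; };
struct toy_path { struct toy_leaf *leaf; int slot; };

static struct toy_leaf leaves[2] = {
        { 3, { 10, 11, 12 } },
        { 2, { 20, 21 } },
};
static int cur_leaf;

static int toy_next_leaf(struct toy_path *p)
{
        if (++cur_leaf >= 2)
                return 1;               /* past the last leaf */
        p->leaf = &leaves[cur_leaf];
        p->slot = 0;
        return 0;
}

static int toy_next_item(struct toy_path *p)
{
        ++p->slot;
        if (p->slot >= p->leaf->nritems)
                return toy_next_leaf(p);
        return 0;
}

int main(void)
{
        struct toy_path p = { &leaves[0], 0 };

        do
                printf("%d\n", p.leaf->items[p.slot]);
        while (toy_next_item(&p) == 0);         /* 10 11 12 20 21 */
        return 0;
}
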
  int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
  int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
  void btrfs_drop_snapshot(struct btrfs_root *root,
 -                       struct btrfs_block_rsv *block_rsv, int update_ref);
 +                       struct btrfs_block_rsv *block_rsv, int update_ref,
 +                       int for_reloc);
  int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct extent_buffer *node,
@@@ -2670,7 -2506,6 +2676,7 @@@ static inline int btrfs_fs_closing(stru
  }
  static inline void free_fs_info(struct btrfs_fs_info *fs_info)
  {
 +      kfree(fs_info->balance_ctl);
        kfree(fs_info->delayed_root);
        kfree(fs_info->extent_root);
        kfree(fs_info->tree_root);
        kfree(fs_info->super_for_commit);
        kfree(fs_info);
  }
 +/**
 + * profile_is_valid - tests whether a given profile is valid and reduced
 + * @flags: profile to validate
 + * @extended: if true @flags is treated as an extended profile
 + */
 +static inline int profile_is_valid(u64 flags, int extended)
 +{
 +      u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
 +
 +      flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
 +      if (extended)
 +              mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
 +
 +      if (flags & mask)
 +              return 0;
 +      /* true if zero or exactly one bit set */
 +      return (flags & (~flags + 1)) == flags;
 +}
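
The final test uses the lowest-set-bit identity: flags & (~flags + 1), equivalently flags & -flags, isolates the least significant set bit, so it equals flags exactly when at most one bit is set. A minimal standalone check:

#include <assert.h>
#include <stdint.h>

/* true iff zero or exactly one bit set: flags & -flags isolates the
 * lowest set bit, so equality with flags rules out a second bit */
static int at_most_one_bit(uint64_t flags)
{
        return (flags & (~flags + 1)) == flags;
}

int main(void)
{
        assert(at_most_one_bit(0));
        assert(at_most_one_bit(1ULL << 4));
        assert(!at_most_one_bit((1ULL << 3) | (1ULL << 4)));
        return 0;
}
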
  
  /* root-item.c */
  int btrfs_find_root_ref(struct btrfs_root *tree_root,
diff --combined fs/btrfs/disk-io.c
index 9be97716c5e068153b83a5e2d493c0d010c16710,f363c6d9c3de428977ad4a1d77aeae214ef5865a..da4457f84d78d7246ac8ce4f1640664f90d4e92f
@@@ -43,6 -43,7 +43,7 @@@
  #include "tree-log.h"
  #include "free-space-cache.h"
  #include "inode-map.h"
+ #include "check-integrity.h"
  
  static struct extent_io_ops btree_extent_io_ops;
  static void end_workqueue_fn(struct btrfs_work *work);
@@@ -1243,8 -1244,7 +1244,8 @@@ static struct btrfs_root *alloc_log_tre
        root->ref_cows = 0;
  
        leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
 -                                    BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
 +                                    BTRFS_TREE_LOG_OBJECTID, NULL,
 +                                    0, 0, 0, 0);
        if (IS_ERR(leaf)) {
                kfree(root);
                return ERR_CAST(leaf);
@@@ -2002,21 -2002,16 +2003,24 @@@ struct btrfs_root *open_ctree(struct su
        init_waitqueue_head(&fs_info->scrub_pause_wait);
        init_rwsem(&fs_info->scrub_super_lock);
        fs_info->scrub_workers_refcnt = 0;
+ #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       fs_info->check_integrity_print_mask = 0;
+ #endif
  
 +      spin_lock_init(&fs_info->balance_lock);
 +      mutex_init(&fs_info->balance_mutex);
 +      atomic_set(&fs_info->balance_running, 0);
 +      atomic_set(&fs_info->balance_pause_req, 0);
 +      atomic_set(&fs_info->balance_cancel_req, 0);
 +      fs_info->balance_ctl = NULL;
 +      init_waitqueue_head(&fs_info->balance_wait_q);
 +
        sb->s_blocksize = 4096;
        sb->s_blocksize_bits = blksize_bits(4096);
        sb->s_bdi = &fs_info->bdi;
  
        fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
 -      fs_info->btree_inode->i_nlink = 1;
 +      set_nlink(fs_info->btree_inode, 1);
        /*
         * we set the i_size on the btree inode to the max possible int.
         * the real end of the address space is determined by all of
           (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
           BTRFS_UUID_SIZE);
  
 -      mutex_lock(&fs_info->chunk_mutex);
        ret = btrfs_read_chunk_tree(chunk_root);
 -      mutex_unlock(&fs_info->chunk_mutex);
        if (ret) {
                printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
                       sb->s_id);
@@@ -2328,6 -2325,9 +2332,6 @@@ retry_root_backup
  
        fs_info->generation = generation;
        fs_info->last_trans_committed = generation;
 -      fs_info->data_alloc_profile = (u64)-1;
 -      fs_info->metadata_alloc_profile = (u64)-1;
 -      fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
  
        ret = btrfs_init_space_info(fs_info);
        if (ret) {
                btrfs_set_opt(fs_info->mount_opt, SSD);
        }
  
+ #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
+               ret = btrfsic_mount(tree_root, fs_devices,
+                                   btrfs_test_opt(tree_root,
+                                       CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
+                                   1 : 0,
+                                   fs_info->check_integrity_print_mask);
+               if (ret)
+                       printk(KERN_WARNING "btrfs: failed to initialize"
+                              " integrity check module %s\n", sb->s_id);
+       }
+ #endif
        /* do not make disk changes in broken FS */
        if (btrfs_super_log_root(disk_super) != 0 &&
            !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
                if (!err)
                        err = btrfs_orphan_cleanup(fs_info->tree_root);
                up_read(&fs_info->cleanup_work_sem);
 +
 +              if (!err)
 +                      err = btrfs_recover_balance(fs_info->tree_root);
 +
                if (err) {
                        close_ctree(tree_root);
                        return ERR_PTR(err);
@@@ -2642,7 -2651,7 +2659,7 @@@ static int write_dev_supers(struct btrf
                 * we fua the first super.  The others we allow
                 * to go down lazy.
                 */
-               ret = submit_bh(WRITE_FUA, bh);
+               ret = btrfsic_submit_bh(WRITE_FUA, bh);
                if (ret)
                        errors++;
        }
@@@ -2719,7 -2728,7 +2736,7 @@@ static int write_dev_flush(struct btrfs
        device->flush_bio = bio;
  
        bio_get(bio);
-       submit_bio(WRITE_FLUSH, bio);
+       btrfsic_submit_bio(WRITE_FLUSH, bio);
  
        return 0;
  }
@@@ -2983,9 -2992,6 +3000,9 @@@ int close_ctree(struct btrfs_root *root
        fs_info->closing = 1;
        smp_mb();
  
 +      /* pause restriper - we want to resume on mount */
 +      btrfs_pause_balance(root->fs_info);
 +
        btrfs_scrub_cancel(root);
  
        /* wait for any defraggers to finish */
        btrfs_stop_workers(&fs_info->caching_workers);
        btrfs_stop_workers(&fs_info->readahead_workers);
  
+ #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       if (btrfs_test_opt(root, CHECK_INTEGRITY))
+               btrfsic_unmount(root, fs_info->fs_devices);
+ #endif
        btrfs_close_devices(fs_info->fs_devices);
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
  
diff --combined fs/btrfs/extent_io.c
index 3622cc22ff919d4477f6b3f930ac2b86443f6ba7,246669296e0252298d3944095f32c1c1bf2bc521..9d09a4f81875817ebc45a7c5b80cbe6008061b22
@@@ -18,6 -18,7 +18,7 @@@
  #include "ctree.h"
  #include "btrfs_inode.h"
  #include "volumes.h"
+ #include "check-integrity.h"
  
  static struct kmem_cache *extent_state_cache;
  static struct kmem_cache *extent_buffer_cache;
@@@ -1895,7 -1896,7 +1896,7 @@@ int repair_io_failure(struct btrfs_mapp
        }
        bio->bi_bdev = dev->bdev;
        bio_add_page(bio, page, length, start-page_offset(page));
-       submit_bio(WRITE_SYNC, bio);
+       btrfsic_submit_bio(WRITE_SYNC, bio);
        wait_for_completion(&compl);
  
        if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
@@@ -2393,7 -2394,7 +2394,7 @@@ static int submit_one_bio(int rw, struc
                ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
                                           mirror_num, bio_flags, start);
        else
-               submit_bio(rw, bio);
+               btrfsic_submit_bio(rw, bio);
  
        if (bio_flagged(bio, BIO_EOPNOTSUPP))
                ret = -EOPNOTSUPP;
@@@ -3579,7 -3580,6 +3580,7 @@@ static struct extent_buffer *__alloc_ex
        atomic_set(&eb->blocking_writers, 0);
        atomic_set(&eb->spinning_readers, 0);
        atomic_set(&eb->spinning_writers, 0);
 +      eb->lock_nested = 0;
        init_waitqueue_head(&eb->write_lock_wq);
        init_waitqueue_head(&eb->read_lock_wq);
  
diff --combined fs/btrfs/scrub.c
index 6a6a51a809ba1c56b83471ac38a825e9af9e3689,567e148caca2689a162a17bc59f1135cd329ba5a..9770cc5bfb76c6829f96924bb82f9b3b564ca646
@@@ -25,6 -25,7 +25,7 @@@
  #include "transaction.h"
  #include "backref.h"
  #include "extent_io.h"
+ #include "check-integrity.h"
  
  /*
  * This is only the first step towards a full-featured scrub. It reads all
@@@ -309,7 -310,7 +310,7 @@@ static void scrub_print_warning(const c
        u8 ref_level;
        unsigned long ptr = 0;
        const int bufsize = 4096;
 -      u64 extent_offset;
 +      u64 extent_item_pos;
  
        path = btrfs_alloc_path();
  
        if (ret < 0)
                goto out;
  
 -      extent_offset = swarn.logical - found_key.objectid;
 +      extent_item_pos = swarn.logical - found_key.objectid;
        swarn.extent_item_size = found_key.offset;
  
        eb = path->nodes[0];
        ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
        item_size = btrfs_item_size_nr(eb, path->slots[0]);
 +      btrfs_release_path(path);
  
        if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                do {
        } else {
                swarn.path = path;
                iterate_extent_inodes(fs_info, path, found_key.objectid,
 -                                      extent_offset,
 +                                      extent_item_pos,
                                        scrub_print_warning_inode, &swarn);
        }
  
@@@ -733,7 -733,7 +734,7 @@@ static int scrub_fixup_io(int rw, struc
        bio_add_page(bio, page, PAGE_SIZE, 0);
        bio->bi_end_io = scrub_fixup_end_io;
        bio->bi_private = &complete;
-       submit_bio(rw, bio);
+       btrfsic_submit_bio(rw, bio);
  
        /* this will also unplug the queue */
        wait_for_completion(&complete);
@@@ -959,7 -959,7 +960,7 @@@ static int scrub_submit(struct scrub_de
        sdev->curr = -1;
        atomic_inc(&sdev->in_flight);
  
-       submit_bio(READ, sbio->bio);
+       btrfsic_submit_bio(READ, sbio->bio);
  
        return 0;
  }
diff --combined fs/btrfs/super.c
index 5a7227fa93804c7b78bb205d6af94b52c55e1b32,22a2015f1d7be5a973f247c9f5c3edf341e2c4e9..61717a4eb14f78e3aee3902a38659a754637767d
@@@ -164,8 -164,10 +164,10 @@@ enum 
        Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
        Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
        Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
 -      Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
 -      Opt_inode_cache, Opt_no_space_cache, Opt_recovery,
 +      Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
 +      Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
+       Opt_check_integrity, Opt_check_integrity_including_extent_data,
+       Opt_check_integrity_print_mask,
        Opt_err,
  };
  
@@@ -201,7 -203,9 +203,10 @@@ static match_table_t tokens = 
        {Opt_inode_cache, "inode_cache"},
        {Opt_no_space_cache, "nospace_cache"},
        {Opt_recovery, "recovery"},
 +      {Opt_skip_balance, "skip_balance"},
+       {Opt_check_integrity, "check_int"},
+       {Opt_check_integrity_including_extent_data, "check_int_data"},
+       {Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
        {Opt_err, NULL},
  };
  
@@@ -400,9 -404,37 +405,40 @@@ int btrfs_parse_options(struct btrfs_ro
                        printk(KERN_INFO "btrfs: enabling auto recovery");
                        btrfs_set_opt(info->mount_opt, RECOVERY);
                        break;
 +              case Opt_skip_balance:
 +                      btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
 +                      break;
+ #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+               case Opt_check_integrity_including_extent_data:
+                       printk(KERN_INFO "btrfs: enabling check integrity"
+                              " including extent data\n");
+                       btrfs_set_opt(info->mount_opt,
+                                     CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
+                       btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+                       break;
+               case Opt_check_integrity:
+                       printk(KERN_INFO "btrfs: enabling check integrity\n");
+                       btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+                       break;
+               case Opt_check_integrity_print_mask:
+                       intarg = 0;
+                       match_int(&args[0], &intarg);
+                       if (intarg) {
+                               info->check_integrity_print_mask = intarg;
+                               printk(KERN_INFO "btrfs:"
+                                      " check_integrity_print_mask 0x%x\n",
+                                      info->check_integrity_print_mask);
+                       }
+                       break;
+ #else
+               case Opt_check_integrity_including_extent_data:
+               case Opt_check_integrity:
+               case Opt_check_integrity_print_mask:
+                       printk(KERN_ERR "btrfs: support for check_integrity*"
+                              " not compiled in!\n");
+                       ret = -EINVAL;
+                       goto out;
+ #endif
                case Opt_err:
                        printk(KERN_INFO "btrfs: unrecognized mount option "
                               "'%s'\n", p);
@@@ -728,8 -760,6 +764,8 @@@ static int btrfs_show_options(struct se
                seq_puts(seq, ",autodefrag");
        if (btrfs_test_opt(root, INODE_MAP_CACHE))
                seq_puts(seq, ",inode_cache");
 +      if (btrfs_test_opt(root, SKIP_BALANCE))
 +              seq_puts(seq, ",skip_balance");
        return 0;
  }
  
@@@ -833,9 -863,13 +869,9 @@@ static char *setup_root_args(char *args
  static struct dentry *mount_subvol(const char *subvol_name, int flags,
                                   const char *device_name, char *data)
  {
 -      struct super_block *s;
        struct dentry *root;
        struct vfsmount *mnt;
 -      struct mnt_namespace *ns_private;
        char *newargs;
 -      struct path path;
 -      int error;
  
        newargs = setup_root_args(data);
        if (!newargs)
        if (IS_ERR(mnt))
                return ERR_CAST(mnt);
  
 -      ns_private = create_mnt_ns(mnt);
 -      if (IS_ERR(ns_private)) {
 -              mntput(mnt);
 -              return ERR_CAST(ns_private);
 -      }
 +      root = mount_subtree(mnt, subvol_name);
  
 -      /*
 -       * This will trigger the automount of the subvol so we can just
 -       * drop the mnt we have here and return the dentry that we
 -       * found.
 -       */
 -      error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
 -                              LOOKUP_FOLLOW, &path);
 -      put_mnt_ns(ns_private);
 -      if (error)
 -              return ERR_PTR(error);
 -
 -      if (!is_subvolume_inode(path.dentry->d_inode)) {
 -              path_put(&path);
 -              mntput(mnt);
 -              error = -EINVAL;
 +      if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
 +              struct super_block *s = root->d_sb;
 +              dput(root);
 +              root = ERR_PTR(-EINVAL);
 +              deactivate_locked_super(s);
                printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
                                subvol_name);
 -              return ERR_PTR(-EINVAL);
        }
  
 -      /* Get a ref to the sb and the dentry we found and return it */
 -      s = path.mnt->mnt_sb;
 -      atomic_inc(&s->s_active);
 -      root = dget(path.dentry);
 -      path_put(&path);
 -      down_write(&s->s_umount);
 -
        return root;
  }
  
diff --combined fs/btrfs/volumes.c
index e0b7bb92a170c3ad529fca30ed9f6539646dbbe5,821334f6e3a1cab35bd7853c2c83f275efb30840..59e878f9fdcc6e6bf0ef0e325f3a2fce94011baa
@@@ -23,7 -23,6 +23,7 @@@
  #include <linux/random.h>
  #include <linux/iocontext.h>
  #include <linux/capability.h>
 +#include <linux/kthread.h>
  #include <asm/div64.h>
  #include "compat.h"
  #include "ctree.h"
@@@ -33,6 -32,7 +33,7 @@@
  #include "print-tree.h"
  #include "volumes.h"
  #include "async-thread.h"
+ #include "check-integrity.h"
  
  static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
@@@ -247,7 -247,7 +248,7 @@@ loop_lock
                        sync_pending = 0;
                }
  
-               submit_bio(cur->bi_rw, cur);
+               btrfsic_submit_bio(cur->bi_rw, cur);
                num_run++;
                batch_run++;
                if (need_resched())
@@@ -830,6 -830,7 +831,6 @@@ out
  
  /*
   * find_free_dev_extent - find free space in the specified device
 - * @trans:    transaction handler
   * @device:   the device which we search the free space in
   * @num_bytes:        the size of the free space that we need
   * @start:    store the start of the free space.
   * But if we don't find suitable free space, it is used to store the size of
   * the max free space.
   */
 -int find_free_dev_extent(struct btrfs_trans_handle *trans,
 -                       struct btrfs_device *device, u64 num_bytes,
 +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *len)
  {
        struct btrfs_key key;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;
  
 -      ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
 +      ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
@@@ -1281,6 -1283,7 +1282,6 @@@ int btrfs_rm_device(struct btrfs_root *
        bool clear_super = false;
  
        mutex_lock(&uuid_mutex);
 -      mutex_lock(&root->fs_info->volume_mutex);
  
        all_avail = root->fs_info->avail_data_alloc_bits |
                root->fs_info->avail_system_alloc_bits |
@@@ -1450,6 -1453,7 +1451,6 @@@ error_close
        if (bdev)
                blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
  out:
 -      mutex_unlock(&root->fs_info->volume_mutex);
        mutex_unlock(&uuid_mutex);
        return ret;
  error_undo:
  /*
   * does all the dirty work required for changing file system's UUID.
   */
 -static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
 -                              struct btrfs_root *root)
 +static int btrfs_prepare_sprout(struct btrfs_root *root)
  {
        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
        struct btrfs_fs_devices *old_devices;
@@@ -1625,6 -1630,7 +1626,6 @@@ int btrfs_init_new_device(struct btrfs_
        }
  
        filemap_write_and_wait(bdev->bd_inode->i_mapping);
 -      mutex_lock(&root->fs_info->volume_mutex);
  
        devices = &root->fs_info->fs_devices->devices;
        /*
  
        if (seeding_dev) {
                sb->s_flags &= ~MS_RDONLY;
 -              ret = btrfs_prepare_sprout(trans, root);
 +              ret = btrfs_prepare_sprout(root);
                BUG_ON(ret);
        }
  
                ret = btrfs_relocate_sys_chunks(root);
                BUG_ON(ret);
        }
 -out:
 -      mutex_unlock(&root->fs_info->volume_mutex);
 +
        return ret;
  error:
        blkdev_put(bdev, FMODE_EXCL);
                mutex_unlock(&uuid_mutex);
                up_write(&sb->s_umount);
        }
 -      goto out;
 +      return ret;
  }
  
  static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@@ -2071,362 -2078,6 +2072,362 @@@ error
        return ret;
  }
  
 +static int insert_balance_item(struct btrfs_root *root,
 +                             struct btrfs_balance_control *bctl)
 +{
 +      struct btrfs_trans_handle *trans;
 +      struct btrfs_balance_item *item;
 +      struct btrfs_disk_balance_args disk_bargs;
 +      struct btrfs_path *path;
 +      struct extent_buffer *leaf;
 +      struct btrfs_key key;
 +      int ret, err;
 +
 +      path = btrfs_alloc_path();
 +      if (!path)
 +              return -ENOMEM;
 +
 +      trans = btrfs_start_transaction(root, 0);
 +      if (IS_ERR(trans)) {
 +              btrfs_free_path(path);
 +              return PTR_ERR(trans);
 +      }
 +
 +      key.objectid = BTRFS_BALANCE_OBJECTID;
 +      key.type = BTRFS_BALANCE_ITEM_KEY;
 +      key.offset = 0;
 +
 +      ret = btrfs_insert_empty_item(trans, root, path, &key,
 +                                    sizeof(*item));
 +      if (ret)
 +              goto out;
 +
 +      leaf = path->nodes[0];
 +      item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
 +
 +      memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
 +
 +      btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
 +      btrfs_set_balance_data(leaf, item, &disk_bargs);
 +      btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
 +      btrfs_set_balance_meta(leaf, item, &disk_bargs);
 +      btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
 +      btrfs_set_balance_sys(leaf, item, &disk_bargs);
 +
 +      btrfs_set_balance_flags(leaf, item, bctl->flags);
 +
 +      btrfs_mark_buffer_dirty(leaf);
 +out:
 +      btrfs_free_path(path);
 +      err = btrfs_commit_transaction(trans, root);
 +      if (err && !ret)
 +              ret = err;
 +      return ret;
 +}
 +
 +static int del_balance_item(struct btrfs_root *root)
 +{
 +      struct btrfs_trans_handle *trans;
 +      struct btrfs_path *path;
 +      struct btrfs_key key;
 +      int ret, err;
 +
 +      path = btrfs_alloc_path();
 +      if (!path)
 +              return -ENOMEM;
 +
 +      trans = btrfs_start_transaction(root, 0);
 +      if (IS_ERR(trans)) {
 +              btrfs_free_path(path);
 +              return PTR_ERR(trans);
 +      }
 +
 +      key.objectid = BTRFS_BALANCE_OBJECTID;
 +      key.type = BTRFS_BALANCE_ITEM_KEY;
 +      key.offset = 0;
 +
 +      ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 +      if (ret < 0)
 +              goto out;
 +      if (ret > 0) {
 +              ret = -ENOENT;
 +              goto out;
 +      }
 +
 +      ret = btrfs_del_item(trans, root, path);
 +out:
 +      btrfs_free_path(path);
 +      err = btrfs_commit_transaction(trans, root);
 +      if (err && !ret)
 +              ret = err;
 +      return ret;
 +}
 +
 +/*
 + * This is a heuristic used to reduce the number of chunks balanced on
 + * resume after balance was interrupted.
 + */
 +static void update_balance_args(struct btrfs_balance_control *bctl)
 +{
 +      /*
 +       * Turn on soft mode for chunk types that were being converted.
 +       */
 +      if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
 +              bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
 +      if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
 +              bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
 +      if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
 +              bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
 +
 +      /*
 +       * Turn on usage filter if it is not already used.  The idea is
 +       * that chunks that we have already balanced should be
 +       * reasonably full.  Don't do it for chunks that are being
 +       * converted - that will keep us from relocating unconverted
 +       * (albeit full) chunks.
 +       */
 +      if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
 +          !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
 +              bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
 +              bctl->data.usage = 90;
 +      }
 +      if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
 +          !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
 +              bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
 +              bctl->sys.usage = 90;
 +      }
 +      if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
 +          !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
 +              bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
 +              bctl->meta.usage = 90;
 +      }
 +}
 +
 +/*
 + * Should be called with both balance and volume mutexes held to
 + * serialize other volume operations (add_dev/rm_dev/resize) with
 + * restriper.  Same goes for unset_balance_control.
 + */
 +static void set_balance_control(struct btrfs_balance_control *bctl)
 +{
 +      struct btrfs_fs_info *fs_info = bctl->fs_info;
 +
 +      BUG_ON(fs_info->balance_ctl);
 +
 +      spin_lock(&fs_info->balance_lock);
 +      fs_info->balance_ctl = bctl;
 +      spin_unlock(&fs_info->balance_lock);
 +}
 +
 +static void unset_balance_control(struct btrfs_fs_info *fs_info)
 +{
 +      struct btrfs_balance_control *bctl = fs_info->balance_ctl;
 +
 +      BUG_ON(!fs_info->balance_ctl);
 +
 +      spin_lock(&fs_info->balance_lock);
 +      fs_info->balance_ctl = NULL;
 +      spin_unlock(&fs_info->balance_lock);
 +
 +      kfree(bctl);
 +}
 +
 +/*
 + * Balance filters.  Return 1 if chunk should be filtered out
 + * (should not be balanced).
 + */
 +static int chunk_profiles_filter(u64 chunk_profile,
 +                               struct btrfs_balance_args *bargs)
 +{
 +      chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
 +
 +      if (chunk_profile == 0)
 +              chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
 +
 +      if (bargs->profiles & chunk_profile)
 +              return 0;
 +
 +      return 1;
 +}
 +
 +static u64 div_factor_fine(u64 num, int factor)
 +{
 +      if (factor <= 0)
 +              return 0;
 +      if (factor >= 100)
 +              return num;
 +
 +      num *= factor;
 +      do_div(num, 100);
 +      return num;
 +}
 +
 +static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
 +                            struct btrfs_balance_args *bargs)
 +{
 +      struct btrfs_block_group_cache *cache;
 +      u64 chunk_used, user_thresh;
 +      int ret = 1;
 +
 +      cache = btrfs_lookup_block_group(fs_info, chunk_offset);
 +      chunk_used = btrfs_block_group_used(&cache->item);
 +
 +      user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
 +      if (chunk_used < user_thresh)
 +              ret = 0;
 +
 +      btrfs_put_block_group(cache);
 +      return ret;
 +}
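
Worked numbers for the usage filter above, as a standalone sketch with do_div() replaced by plain 64-bit division (the values are illustrative): with bargs->usage = 90, a 1 GiB chunk is relocated only while it holds less than roughly 921.6 MiB.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t div_factor_fine(uint64_t num, int factor)
{
        if (factor <= 0)
                return 0;
        if (factor >= 100)
                return num;
        return num * factor / 100;      /* do_div() in the kernel */
}

int main(void)
{
        uint64_t chunk_size = 1024ULL << 20;    /* 1 GiB chunk */
        uint64_t chunk_used = 800ULL << 20;     /* 800 MiB used */
        uint64_t thresh = div_factor_fine(chunk_size, 90);

        /* the chunk is balanced iff used < threshold */
        printf("thresh=%" PRIu64 " balance=%d\n", thresh,
               chunk_used < thresh);
        return 0;
}
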
 +
 +static int chunk_devid_filter(struct extent_buffer *leaf,
 +                            struct btrfs_chunk *chunk,
 +                            struct btrfs_balance_args *bargs)
 +{
 +      struct btrfs_stripe *stripe;
 +      int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
 +      int i;
 +
 +      for (i = 0; i < num_stripes; i++) {
 +              stripe = btrfs_stripe_nr(chunk, i);
 +              if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
 +                      return 0;
 +      }
 +
 +      return 1;
 +}
 +
 +/* [pstart, pend) */
 +static int chunk_drange_filter(struct extent_buffer *leaf,
 +                             struct btrfs_chunk *chunk,
 +                             u64 chunk_offset,
 +                             struct btrfs_balance_args *bargs)
 +{
 +      struct btrfs_stripe *stripe;
 +      int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
 +      u64 stripe_offset;
 +      u64 stripe_length;
 +      int factor;
 +      int i;
 +
 +      if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
 +              return 0;
 +
 +      if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
 +           BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
 +              factor = 2;
 +      else
 +              factor = 1;
 +      factor = num_stripes / factor;
 +
 +      for (i = 0; i < num_stripes; i++) {
 +              stripe = btrfs_stripe_nr(chunk, i);
 +              if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
 +                      continue;
 +
 +              stripe_offset = btrfs_stripe_offset(leaf, stripe);
 +              stripe_length = btrfs_chunk_length(leaf, chunk);
 +              do_div(stripe_length, factor);
 +
 +              if (stripe_offset < bargs->pend &&
 +                  stripe_offset + stripe_length > bargs->pstart)
 +                      return 0;
 +      }
 +
 +      return 1;
 +}
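
Both the drange and vrange filters reduce to the same half-open interval test: [a, b) intersects [lo, hi) iff a < hi and b > lo. A standalone sketch with illustrative offsets:

#include <stdint.h>
#include <stdio.h>

static int ranges_overlap(uint64_t a, uint64_t b, uint64_t lo, uint64_t hi)
{
        return a < hi && b > lo;
}

int main(void)
{
        /* stripe [0, 256M) vs requested range [128M, 512M): overlaps */
        printf("%d\n", ranges_overlap(0, 256ULL << 20,
                                      128ULL << 20, 512ULL << 20));
        /* stripe [512M, 768M) vs [128M, 512M): no overlap (half-open) */
        printf("%d\n", ranges_overlap(512ULL << 20, 768ULL << 20,
                                      128ULL << 20, 512ULL << 20));
        return 0;
}
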
 +
 +/* [vstart, vend) */
 +static int chunk_vrange_filter(struct extent_buffer *leaf,
 +                             struct btrfs_chunk *chunk,
 +                             u64 chunk_offset,
 +                             struct btrfs_balance_args *bargs)
 +{
 +      if (chunk_offset < bargs->vend &&
 +          chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
 +              /* at least part of the chunk is inside this vrange */
 +              return 0;
 +
 +      return 1;
 +}
 +
 +static int chunk_soft_convert_filter(u64 chunk_profile,
 +                                   struct btrfs_balance_args *bargs)
 +{
 +      if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
 +              return 0;
 +
 +      chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
 +
 +      if (chunk_profile == 0)
 +              chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
 +
 +      if (bargs->target & chunk_profile)
 +              return 1;
 +
 +      return 0;
 +}
 +
 +static int should_balance_chunk(struct btrfs_root *root,
 +                              struct extent_buffer *leaf,
 +                              struct btrfs_chunk *chunk, u64 chunk_offset)
 +{
 +      struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
 +      struct btrfs_balance_args *bargs = NULL;
 +      u64 chunk_type = btrfs_chunk_type(leaf, chunk);
 +
 +      /* type filter */
 +      if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
 +            (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
 +              return 0;
 +      }
 +
 +      if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
 +              bargs = &bctl->data;
 +      else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
 +              bargs = &bctl->sys;
 +      else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
 +              bargs = &bctl->meta;
 +
 +      /* profiles filter */
 +      if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
 +          chunk_profiles_filter(chunk_type, bargs)) {
 +              return 0;
 +      }
 +
 +      /* usage filter */
 +      if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
 +          chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
 +              return 0;
 +      }
 +
 +      /* devid filter */
 +      if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
 +          chunk_devid_filter(leaf, chunk, bargs)) {
 +              return 0;
 +      }
 +
 +      /* drange filter, makes sense only with devid filter */
 +      if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
 +          chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
 +              return 0;
 +      }
 +
 +      /* vrange filter */
 +      if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
 +          chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
 +              return 0;
 +      }
 +
 +      /* soft profile changing mode */
 +      if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
 +          chunk_soft_convert_filter(chunk_type, bargs)) {
 +              return 0;
 +      }
 +
 +      return 1;
 +}
 +
  static u64 div_factor(u64 num, int factor)
  {
        if (factor == 10)
        return num;
  }
  
 -int btrfs_balance(struct btrfs_root *dev_root)
 +static int __btrfs_balance(struct btrfs_fs_info *fs_info)
  {
 -      int ret;
 -      struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
 +      struct btrfs_balance_control *bctl = fs_info->balance_ctl;
 +      struct btrfs_root *chunk_root = fs_info->chunk_root;
 +      struct btrfs_root *dev_root = fs_info->dev_root;
 +      struct list_head *devices;
        struct btrfs_device *device;
        u64 old_size;
        u64 size_to_free;
 +      struct btrfs_chunk *chunk;
        struct btrfs_path *path;
        struct btrfs_key key;
 -      struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
 -      struct btrfs_trans_handle *trans;
        struct btrfs_key found_key;
 -
 -      if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
 -              return -EROFS;
 -
 -      if (!capable(CAP_SYS_ADMIN))
 -              return -EPERM;
 -
 -      mutex_lock(&dev_root->fs_info->volume_mutex);
 -      dev_root = dev_root->fs_info->dev_root;
 +      struct btrfs_trans_handle *trans;
 +      struct extent_buffer *leaf;
 +      int slot;
 +      int ret;
 +      int enospc_errors = 0;
 +      bool counting = true;
  
        /* step one make some room on all the devices */
 +      devices = &fs_info->fs_devices->devices;
        list_for_each_entry(device, devices, dev_list) {
                old_size = device->total_bytes;
                size_to_free = div_factor(old_size, 1);
                ret = -ENOMEM;
                goto error;
        }
 +
 +      /* zero out stat counters */
 +      spin_lock(&fs_info->balance_lock);
 +      memset(&bctl->stat, 0, sizeof(bctl->stat));
 +      spin_unlock(&fs_info->balance_lock);
 +again:
        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
        key.offset = (u64)-1;
        key.type = BTRFS_CHUNK_ITEM_KEY;
  
        while (1) {
 +              if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
 +                  atomic_read(&fs_info->balance_cancel_req)) {
 +                      ret = -ECANCELED;
 +                      goto error;
 +              }
 +
                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
                if (ret < 0)
                        goto error;
                 * failed
                 */
                if (ret == 0)
 -                      break;
 +                      BUG(); /* FIXME break ? */
  
                ret = btrfs_previous_item(chunk_root, path, 0,
                                          BTRFS_CHUNK_ITEM_KEY);
 -              if (ret)
 +              if (ret) {
 +                      ret = 0;
                        break;
 +              }
 +
 +              leaf = path->nodes[0];
 +              slot = path->slots[0];
 +              btrfs_item_key_to_cpu(leaf, &found_key, slot);
  
 -              btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 -                                    path->slots[0]);
                if (found_key.objectid != key.objectid)
                        break;
  
                if (found_key.offset == 0)
                        break;
  
 +              chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
 +
 +              if (!counting) {
 +                      spin_lock(&fs_info->balance_lock);
 +                      bctl->stat.considered++;
 +                      spin_unlock(&fs_info->balance_lock);
 +              }
 +
 +              ret = should_balance_chunk(chunk_root, leaf, chunk,
 +                                         found_key.offset);
                btrfs_release_path(path);
 +              if (!ret)
 +                      goto loop;
 +
 +              if (counting) {
 +                      spin_lock(&fs_info->balance_lock);
 +                      bctl->stat.expected++;
 +                      spin_unlock(&fs_info->balance_lock);
 +                      goto loop;
 +              }
 +
                ret = btrfs_relocate_chunk(chunk_root,
                                           chunk_root->root_key.objectid,
                                           found_key.objectid,
                                           found_key.offset);
                if (ret && ret != -ENOSPC)
                        goto error;
 +              if (ret == -ENOSPC) {
 +                      enospc_errors++;
 +              } else {
 +                      spin_lock(&fs_info->balance_lock);
 +                      bctl->stat.completed++;
 +                      spin_unlock(&fs_info->balance_lock);
 +              }
 +loop:
                key.offset = found_key.offset - 1;
        }
 -      ret = 0;
 +
 +      if (counting) {
 +              btrfs_release_path(path);
 +              counting = false;
 +              goto again;
 +      }
  error:
        btrfs_free_path(path);
 -      mutex_unlock(&dev_root->fs_info->volume_mutex);
 +      if (enospc_errors) {
 +              printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
 +                     enospc_errors);
 +              if (!ret)
 +                      ret = -ENOSPC;
 +      }
 +
 +      return ret;
 +}
 +
 +static inline int balance_need_close(struct btrfs_fs_info *fs_info)
 +{
 +      /* cancel requested || normal exit path */
 +      return atomic_read(&fs_info->balance_cancel_req) ||
 +              (atomic_read(&fs_info->balance_pause_req) == 0 &&
 +               atomic_read(&fs_info->balance_cancel_req) == 0);
 +}
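
balance_need_close() collapses to "tear the balance state down unless a pure pause was requested": a cancel always closes, normal completion closes, and only an outstanding pause with no cancel keeps the control structure (and the on-disk item) around for a later resume. A standalone sketch enumerating the four cases:

#include <stdio.h>

static int need_close(int pause_req, int cancel_req)
{
        /* cancel requested || normal exit path */
        return cancel_req || (pause_req == 0 && cancel_req == 0);
}

int main(void)
{
        for (int p = 0; p <= 1; p++)
                for (int c = 0; c <= 1; c++)
                        printf("pause=%d cancel=%d -> close=%d\n",
                               p, c, need_close(p, c));
        return 0;       /* only pause=1 cancel=0 leaves state intact */
}
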
 +
 +static void __cancel_balance(struct btrfs_fs_info *fs_info)
 +{
 +      int ret;
 +
 +      unset_balance_control(fs_info);
 +      ret = del_balance_item(fs_info->tree_root);
 +      BUG_ON(ret);
 +}
 +
 +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
 +                             struct btrfs_ioctl_balance_args *bargs);
 +
 +/*
 + * Should be called with both balance and volume mutexes held
 + */
 +int btrfs_balance(struct btrfs_balance_control *bctl,
 +                struct btrfs_ioctl_balance_args *bargs)
 +{
 +      struct btrfs_fs_info *fs_info = bctl->fs_info;
 +      u64 allowed;
 +      int ret;
 +
 +      if (btrfs_fs_closing(fs_info) ||
 +          atomic_read(&fs_info->balance_pause_req) ||
 +          atomic_read(&fs_info->balance_cancel_req)) {
 +              ret = -EINVAL;
 +              goto out;
 +      }
 +
 +      /*
 +       * In case of mixed groups both data and meta should be picked,
 +       * and identical options should be given for both of them.
 +       */
 +      allowed = btrfs_super_incompat_flags(fs_info->super_copy);
 +      if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
 +          (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
 +              if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
 +                  !(bctl->flags & BTRFS_BALANCE_METADATA) ||
 +                  memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
 +                      printk(KERN_ERR "btrfs: with mixed groups data and "
 +                             "metadata balance options must be the same\n");
 +                      ret = -EINVAL;
 +                      goto out;
 +              }
 +      }
 +
 +      /*
 +       * Profile changing sanity checks.  Skip them if a simple
 +       * balance is requested.
 +       */
 +      if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
 +            BTRFS_BALANCE_ARGS_CONVERT))
 +              goto do_balance;
 +
 +      allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
 +      if (fs_info->fs_devices->num_devices == 1)
 +              allowed |= BTRFS_BLOCK_GROUP_DUP;
 +      else if (fs_info->fs_devices->num_devices < 4)
 +              allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
 +      else
 +              allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
 +                              BTRFS_BLOCK_GROUP_RAID10);
 +
 +      if (!profile_is_valid(bctl->data.target, 1) ||
 +          bctl->data.target & ~allowed) {
 +              printk(KERN_ERR "btrfs: unable to start balance with target "
 +                     "data profile %llu\n",
 +                     (unsigned long long)bctl->data.target);
 +              ret = -EINVAL;
 +              goto out;
 +      }
 +      if (!profile_is_valid(bctl->meta.target, 1) ||
 +          bctl->meta.target & ~allowed) {
 +              printk(KERN_ERR "btrfs: unable to start balance with target "
 +                     "metadata profile %llu\n",
 +                     (unsigned long long)bctl->meta.target);
 +              ret = -EINVAL;
 +              goto out;
 +      }
 +      if (!profile_is_valid(bctl->sys.target, 1) ||
 +          bctl->sys.target & ~allowed) {
 +              printk(KERN_ERR "btrfs: unable to start balance with target "
 +                     "system profile %llu\n",
 +                     (unsigned long long)bctl->sys.target);
 +              ret = -EINVAL;
 +              goto out;
 +      }
 +
 +      if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) {
 +              printk(KERN_ERR "btrfs: dup for data is not allowed\n");
 +              ret = -EINVAL;
 +              goto out;
 +      }
 +
 +      /* allow to reduce meta or sys integrity only if force set */
 +      allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
 +                      BTRFS_BLOCK_GROUP_RAID10;
 +      if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
 +           (fs_info->avail_system_alloc_bits & allowed) &&
 +           !(bctl->sys.target & allowed)) ||
 +          ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
 +           (fs_info->avail_metadata_alloc_bits & allowed) &&
 +           !(bctl->meta.target & allowed))) {
 +              if (bctl->flags & BTRFS_BALANCE_FORCE) {
 +                      printk(KERN_INFO "btrfs: force reducing metadata "
 +                             "integrity\n");
 +              } else {
 +                      printk(KERN_ERR "btrfs: balance will reduce metadata "
 +                             "integrity, use force if you want this\n");
 +                      ret = -EINVAL;
 +                      goto out;
 +              }
 +      }
 +
 +do_balance:
 +      ret = insert_balance_item(fs_info->tree_root, bctl);
 +      if (ret && ret != -EEXIST)
 +              goto out;
 +
 +      if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
 +              BUG_ON(ret == -EEXIST);
 +              set_balance_control(bctl);
 +      } else {
 +              BUG_ON(ret != -EEXIST);
 +              spin_lock(&fs_info->balance_lock);
 +              update_balance_args(bctl);
 +              spin_unlock(&fs_info->balance_lock);
 +      }
 +
 +      atomic_inc(&fs_info->balance_running);
 +      mutex_unlock(&fs_info->balance_mutex);
 +
 +      ret = __btrfs_balance(fs_info);
 +
 +      mutex_lock(&fs_info->balance_mutex);
 +      atomic_dec(&fs_info->balance_running);
 +
 +      if (bargs) {
 +              memset(bargs, 0, sizeof(*bargs));
 +              update_ioctl_balance_args(fs_info, 0, bargs);
 +      }
 +
 +      if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
 +          balance_need_close(fs_info)) {
 +              __cancel_balance(fs_info);
 +      }
 +
 +      wake_up(&fs_info->balance_wait_q);
 +
 +      return ret;
 +out:
 +      if (bctl->flags & BTRFS_BALANCE_RESUME)
 +              __cancel_balance(fs_info);
 +      else
 +              kfree(bctl);
 +      return ret;
 +}
 +
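 +/*
 + * kthread that resumes a balance interrupted by crash or unmount; with
 + * the skip_balance mount option the balance is only restored, not run
 + */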
 +static int balance_kthread(void *data)
 +{
 +      struct btrfs_balance_control *bctl =
 +                      (struct btrfs_balance_control *)data;
 +      struct btrfs_fs_info *fs_info = bctl->fs_info;
 +      int ret = 0;
 +
 +      mutex_lock(&fs_info->volume_mutex);
 +      mutex_lock(&fs_info->balance_mutex);
 +
 +      set_balance_control(bctl);
 +
 +      if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
 +              printk(KERN_INFO "btrfs: force skipping balance\n");
 +      } else {
 +              printk(KERN_INFO "btrfs: continuing balance\n");
 +              ret = btrfs_balance(bctl, NULL);
 +      }
 +
 +      mutex_unlock(&fs_info->balance_mutex);
 +      mutex_unlock(&fs_info->volume_mutex);
 +      return ret;
 +}
 +
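 +/*
 + * called on mount: look for a balance item in the tree root and, if
 + * there is one, reconstruct the balance control and kick off
 + * balance_kthread to resume the interrupted operation
 + */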
 +int btrfs_recover_balance(struct btrfs_root *tree_root)
 +{
 +      struct task_struct *tsk;
 +      struct btrfs_balance_control *bctl;
 +      struct btrfs_balance_item *item;
 +      struct btrfs_disk_balance_args disk_bargs;
 +      struct btrfs_path *path;
 +      struct extent_buffer *leaf;
 +      struct btrfs_key key;
 +      int ret;
 +
 +      path = btrfs_alloc_path();
 +      if (!path)
 +              return -ENOMEM;
 +
 +      bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
 +      if (!bctl) {
 +              ret = -ENOMEM;
 +              goto out;
 +      }
 +
 +      key.objectid = BTRFS_BALANCE_OBJECTID;
 +      key.type = BTRFS_BALANCE_ITEM_KEY;
 +      key.offset = 0;
 +
 +      ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
 +      if (ret < 0)
 +              goto out_bctl;
 +      if (ret > 0) { /* ret = -ENOENT; */
 +              ret = 0;
 +              goto out_bctl;
 +      }
 +
 +      leaf = path->nodes[0];
 +      item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
 +
 +      bctl->fs_info = tree_root->fs_info;
 +      bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
 +
 +      btrfs_balance_data(leaf, item, &disk_bargs);
 +      btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
 +      btrfs_balance_meta(leaf, item, &disk_bargs);
 +      btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
 +      btrfs_balance_sys(leaf, item, &disk_bargs);
 +      btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
 +
 +      tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
 +      if (IS_ERR(tsk))
 +              ret = PTR_ERR(tsk);
 +      else
 +              goto out;
 +
 +out_bctl:
 +      kfree(bctl);
 +out:
 +      btrfs_free_path(path);
        return ret;
  }
  
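 +/*
 + * pause a running balance: flag the request, then wait for the
 + * relocation loop to drop balance_running; -ENOTCONN means there was
 + * no balance running to pause
 + */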
 +int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
 +{
 +      int ret = 0;
 +
 +      mutex_lock(&fs_info->balance_mutex);
 +      if (!fs_info->balance_ctl) {
 +              mutex_unlock(&fs_info->balance_mutex);
 +              return -ENOTCONN;
 +      }
 +
 +      if (atomic_read(&fs_info->balance_running)) {
 +              atomic_inc(&fs_info->balance_pause_req);
 +              mutex_unlock(&fs_info->balance_mutex);
 +
 +              wait_event(fs_info->balance_wait_q,
 +                         atomic_read(&fs_info->balance_running) == 0);
 +
 +              mutex_lock(&fs_info->balance_mutex);
 +              /* we are fine with balance_ctl being ripped away from under us */
 +              BUG_ON(atomic_read(&fs_info->balance_running));
 +              atomic_dec(&fs_info->balance_pause_req);
 +      } else {
 +              ret = -ENOTCONN;
 +      }
 +
 +      mutex_unlock(&fs_info->balance_mutex);
 +      return ret;
 +}
 +
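 +/*
 + * cancel a balance, running or paused: a running balance deletes its
 + * item on the way out of btrfs_balance(), a paused one is torn down
 + * here with __cancel_balance()
 + */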
 +int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
 +{
 +      mutex_lock(&fs_info->balance_mutex);
 +      if (!fs_info->balance_ctl) {
 +              mutex_unlock(&fs_info->balance_mutex);
 +              return -ENOTCONN;
 +      }
 +
 +      atomic_inc(&fs_info->balance_cancel_req);
 +      /*
 +       * if balance is currently running, just wait and return; the
 +       * balance item is deleted by btrfs_balance() in that case
 +       */
 +      if (atomic_read(&fs_info->balance_running)) {
 +              mutex_unlock(&fs_info->balance_mutex);
 +              wait_event(fs_info->balance_wait_q,
 +                         atomic_read(&fs_info->balance_running) == 0);
 +              mutex_lock(&fs_info->balance_mutex);
 +      } else {
 +              /* __cancel_balance needs volume_mutex */
 +              mutex_unlock(&fs_info->balance_mutex);
 +              mutex_lock(&fs_info->volume_mutex);
 +              mutex_lock(&fs_info->balance_mutex);
 +
 +              if (fs_info->balance_ctl)
 +                      __cancel_balance(fs_info);
 +
 +              mutex_unlock(&fs_info->volume_mutex);
 +      }
 +
 +      BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
 +      atomic_dec(&fs_info->balance_cancel_req);
 +      mutex_unlock(&fs_info->balance_mutex);
 +      return 0;
 +}
 +
  /*
   * shrinking a device means finding all of the device extents past
   * the new size, and then following the back refs to the chunks.
@@@ -3041,7 -2324,8 +3042,7 @@@ done
        return ret;
  }
  
 -static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 -                         struct btrfs_root *root,
 +static int btrfs_add_system_chunk(struct btrfs_root *root,
                           struct btrfs_key *key,
                           struct btrfs_chunk *chunk, int item_size)
  {
@@@ -3158,11 -2442,7 +3159,11 @@@ static int __btrfs_alloc_chunk(struct b
                max_stripe_size = 1024 * 1024 * 1024;
                max_chunk_size = 10 * max_stripe_size;
        } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
 -              max_stripe_size = 256 * 1024 * 1024;
 +              /* for larger filesystems, use larger metadata chunks */
 +              if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
 +                      max_stripe_size = 1024 * 1024 * 1024;
 +              else
 +                      max_stripe_size = 256 * 1024 * 1024;
                max_chunk_size = max_stripe_size;
        } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
                max_stripe_size = 8 * 1024 * 1024;
                if (total_avail == 0)
                        continue;
  
 -              ret = find_free_dev_extent(trans, device,
 +              ret = find_free_dev_extent(device,
                                           max_stripe_size * dev_stripes,
                                           &dev_offset, &max_avail);
                if (ret && ret != -ENOSPC)
@@@ -3408,7 -2688,7 +3409,7 @@@ static int __finish_chunk_alloc(struct 
        BUG_ON(ret);
  
        if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
 -              ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
 +              ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
                                             item_size);
                BUG_ON(ret);
        }
@@@ -3473,7 -2753,8 +3474,7 @@@ static noinline int init_first_rw_devic
                return ret;
  
        alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
 -                      (fs_info->metadata_alloc_profile &
 -                       fs_info->avail_metadata_alloc_bits);
 +                              fs_info->avail_metadata_alloc_bits;
        alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
  
        ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
        sys_chunk_offset = chunk_offset + chunk_size;
  
        alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
 -                      (fs_info->system_alloc_profile &
 -                       fs_info->avail_system_alloc_bits);
 +                              fs_info->avail_system_alloc_bits;
        alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
  
        ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
@@@ -3620,13 -2902,26 +3621,13 @@@ static int __btrfs_map_block(struct btr
        u64 stripe_nr;
        u64 stripe_nr_orig;
        u64 stripe_nr_end;
 -      int stripes_allocated = 8;
 -      int stripes_required = 1;
        int stripe_index;
        int i;
 +      int ret = 0;
        int num_stripes;
        int max_errors = 0;
        struct btrfs_bio *bbio = NULL;
  
 -      if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
 -              stripes_allocated = 1;
 -again:
 -      if (bbio_ret) {
 -              bbio = kzalloc(btrfs_bio_size(stripes_allocated),
 -                              GFP_NOFS);
 -              if (!bbio)
 -                      return -ENOMEM;
 -
 -              atomic_set(&bbio->error, 0);
 -      }
 -
        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, logical, *length);
        read_unlock(&em_tree->lock);
        if (mirror_num > map->num_stripes)
                mirror_num = 0;
  
 -      /* if our btrfs_bio struct is too small, back off and try again */
 -      if (rw & REQ_WRITE) {
 -              if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
 -                               BTRFS_BLOCK_GROUP_DUP)) {
 -                      stripes_required = map->num_stripes;
 -                      max_errors = 1;
 -              } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 -                      stripes_required = map->sub_stripes;
 -                      max_errors = 1;
 -              }
 -      }
 -      if (rw & REQ_DISCARD) {
 -              if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
 -                               BTRFS_BLOCK_GROUP_RAID1 |
 -                               BTRFS_BLOCK_GROUP_DUP |
 -                               BTRFS_BLOCK_GROUP_RAID10)) {
 -                      stripes_required = map->num_stripes;
 -              }
 -      }
 -      if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
 -          stripes_allocated < stripes_required) {
 -              stripes_allocated = map->num_stripes;
 -              free_extent_map(em);
 -              kfree(bbio);
 -              goto again;
 -      }
        stripe_nr = offset;
        /*
         * stripe_nr counts the total number of stripes we have to stride
  
        if (rw & REQ_DISCARD)
                *length = min_t(u64, em->len - offset, *length);
 -      else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
 -                            BTRFS_BLOCK_GROUP_RAID1 |
 -                            BTRFS_BLOCK_GROUP_RAID10 |
 -                            BTRFS_BLOCK_GROUP_DUP)) {
 +      else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
                /* we limit the length of each bio to what fits in a stripe */
                *length = min_t(u64, em->len - offset,
                                map->stripe_len - stripe_offset);
        }
        BUG_ON(stripe_index >= map->num_stripes);
  
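 +      /* num_stripes is final at this point, size the btrfs_bio to match */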
 +      bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
 +      if (!bbio) {
 +              ret = -ENOMEM;
 +              goto out;
 +      }
 +      atomic_set(&bbio->error, 0);
 +
        if (rw & REQ_DISCARD) {
 +              int factor = 0;
 +              int sub_stripes = 0;
 +              u64 stripes_per_dev = 0;
 +              u32 remaining_stripes = 0;
 +
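 +              /*
 +               * split the discard evenly across stripes: stripes_per_dev
 +               * full stripes go to every device (or RAID10 mirror group),
 +               * and the first remaining_stripes of them get one extra
 +               */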
 +              if (map->type &
 +                  (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
 +                      if (map->type & BTRFS_BLOCK_GROUP_RAID0)
 +                              sub_stripes = 1;
 +                      else
 +                              sub_stripes = map->sub_stripes;
 +
 +                      factor = map->num_stripes / sub_stripes;
 +                      stripes_per_dev = div_u64_rem(stripe_nr_end -
 +                                                    stripe_nr_orig,
 +                                                    factor,
 +                                                    &remaining_stripes);
 +              }
 +
                for (i = 0; i < num_stripes; i++) {
                        bbio->stripes[i].physical =
                                map->stripes[stripe_index].physical +
                                stripe_offset + stripe_nr * map->stripe_len;
                        bbio->stripes[i].dev = map->stripes[stripe_index].dev;
  
 -                      if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 -                              u64 stripes;
 -                              u32 last_stripe = 0;
 -                              int j;
 -
 -                              div_u64_rem(stripe_nr_end - 1,
 -                                          map->num_stripes,
 -                                          &last_stripe);
 -
 -                              for (j = 0; j < map->num_stripes; j++) {
 -                                      u32 test;
 -
 -                                      div_u64_rem(stripe_nr_end - 1 - j,
 -                                                  map->num_stripes, &test);
 -                                      if (test == stripe_index)
 -                                              break;
 -                              }
 -                              stripes = stripe_nr_end - 1 - j;
 -                              do_div(stripes, map->num_stripes);
 -                              bbio->stripes[i].length = map->stripe_len *
 -                                      (stripes - stripe_nr + 1);
 -
 -                              if (i == 0) {
 -                                      bbio->stripes[i].length -=
 -                                              stripe_offset;
 -                                      stripe_offset = 0;
 -                              }
 -                              if (stripe_index == last_stripe)
 -                                      bbio->stripes[i].length -=
 -                                              stripe_end_offset;
 -                      } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 -                              u64 stripes;
 -                              int j;
 -                              int factor = map->num_stripes /
 -                                           map->sub_stripes;
 -                              u32 last_stripe = 0;
 -
 -                              div_u64_rem(stripe_nr_end - 1,
 -                                          factor, &last_stripe);
 -                              last_stripe *= map->sub_stripes;
 -
 -                              for (j = 0; j < factor; j++) {
 -                                      u32 test;
 -
 -                                      div_u64_rem(stripe_nr_end - 1 - j,
 -                                                  factor, &test);
 -
 -                                      if (test ==
 -                                          stripe_index / map->sub_stripes)
 -                                              break;
 -                              }
 -                              stripes = stripe_nr_end - 1 - j;
 -                              do_div(stripes, factor);
 -                              bbio->stripes[i].length = map->stripe_len *
 -                                      (stripes - stripe_nr + 1);
 -
 -                              if (i < map->sub_stripes) {
 +                      if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
 +                                       BTRFS_BLOCK_GROUP_RAID10)) {
 +                              bbio->stripes[i].length = stripes_per_dev *
 +                                                        map->stripe_len;
 +                              if (i / sub_stripes < remaining_stripes)
 +                                      bbio->stripes[i].length +=
 +                                              map->stripe_len;
 +                              if (i < sub_stripes)
                                        bbio->stripes[i].length -=
                                                stripe_offset;
 -                                      if (i == map->sub_stripes - 1)
 -                                              stripe_offset = 0;
 -                              }
 -                              if (stripe_index >= last_stripe &&
 -                                  stripe_index <= (last_stripe +
 -                                                   map->sub_stripes - 1)) {
 +                              if ((i / sub_stripes + 1) %
 +                                  sub_stripes == remaining_stripes)
                                        bbio->stripes[i].length -=
                                                stripe_end_offset;
 -                              }
 +                              if (i == sub_stripes - 1)
 +                                      stripe_offset = 0;
                        } else
                                bbio->stripes[i].length = *length;
  
                        stripe_index++;
                }
        }
 -      if (bbio_ret) {
 -              *bbio_ret = bbio;
 -              bbio->num_stripes = num_stripes;
 -              bbio->max_errors = max_errors;
 -              bbio->mirror_num = mirror_num;
 +
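 +      /* mirrored profiles have redundant copies, so one write may fail */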
 +      if (rw & REQ_WRITE) {
 +              if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
 +                               BTRFS_BLOCK_GROUP_RAID10 |
 +                               BTRFS_BLOCK_GROUP_DUP)) {
 +                      max_errors = 1;
 +              }
        }
 +
 +      *bbio_ret = bbio;
 +      bbio->num_stripes = num_stripes;
 +      bbio->max_errors = max_errors;
 +      bbio->mirror_num = mirror_num;
  out:
        free_extent_map(em);
 -      return 0;
 +      return ret;
  }
  
  int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
@@@ -3962,7 -3305,7 +3963,7 @@@ static noinline int schedule_bio(struc
        /* don't bother with additional async steps for reads, right now */
        if (!(rw & REQ_WRITE)) {
                bio_get(bio);
-               submit_bio(rw, bio);
+               btrfsic_submit_bio(rw, bio);
                bio_put(bio);
                return 0;
        }
@@@ -4057,7 -3400,7 +4058,7 @@@ int btrfs_map_bio(struct btrfs_root *ro
                        if (async_submit)
                                schedule_bio(root, dev, rw, bio);
                        else
-                               submit_bio(rw, bio);
+                               btrfsic_submit_bio(rw, bio);
                } else {
                        bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
                        bio->bi_sector = logical >> 9;
@@@ -4226,7 -3569,7 +4227,7 @@@ static int open_seed_devices(struct btr
        struct btrfs_fs_devices *fs_devices;
        int ret;
  
 -      mutex_lock(&uuid_mutex);
 +      BUG_ON(!mutex_is_locked(&uuid_mutex));
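 +      /* uuid_mutex is held by our caller (taken in btrfs_read_chunk_tree()) */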
  
        fs_devices = root->fs_info->fs_devices->seed;
        while (fs_devices) {
        fs_devices->seed = root->fs_info->fs_devices->seed;
        root->fs_info->fs_devices->seed = fs_devices;
  out:
 -      mutex_unlock(&uuid_mutex);
        return ret;
  }
  
@@@ -4406,9 -3750,6 +4407,9 @@@ int btrfs_read_chunk_tree(struct btrfs_
        if (!path)
                return -ENOMEM;
  
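 +      /*
 +       * hold uuid_mutex and the chunk mutex for the whole scan so the
 +       * device list and chunk mappings are built up consistently
 +       */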
 +      mutex_lock(&uuid_mutex);
 +      lock_chunks(root);
 +
        /* first we search for all of the device items, and then we
         * read in all of the chunk items.  This way we can create chunk
         * mappings that reference all of the devices that are found
@@@ -4459,9 -3800,6 +4460,9 @@@ again
        }
        ret = 0;
  error:
 +      unlock_chunks(root);
 +      mutex_unlock(&uuid_mutex);
 +
        btrfs_free_path(path);
        return ret;
  }