Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/josef/btrfs...
author     Chris Mason <chris.mason@fusionio.com>
           Wed, 20 Feb 2013 19:05:45 +0000 (14:05 -0500)
committer  Chris Mason <chris.mason@fusionio.com>
           Wed, 20 Feb 2013 19:05:45 +0000 (14:05 -0500)
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
Conflicts:
fs/btrfs/disk-io.c

38 files changed:
fs/btrfs/backref.h
fs/btrfs/btrfs_inode.h
fs/btrfs/check-integrity.c
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-inode.c
fs/btrfs/delayed-inode.h
fs/btrfs/delayed-ref.c
fs/btrfs/delayed-ref.h
fs/btrfs/dev-replace.c
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/file-item.c
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/locking.c
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/print-tree.c
fs/btrfs/qgroup.c
fs/btrfs/relocation.c
fs/btrfs/scrub.c
fs/btrfs/send.c
fs/btrfs/send.h
fs/btrfs/super.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h
fs/btrfs/tree-defrag.c
fs/btrfs/tree-log.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h
include/linux/btrfs.h [new file with mode: 0644]
include/uapi/linux/Kbuild
include/uapi/linux/btrfs.h [moved from fs/btrfs/ioctl.h with 96% similarity]

fs/btrfs/backref.h
index d61feca79455bda94308c9ab3608b23409a84c73..310a7f6d09b1716c6e1658d77a6f2b1ab42e0536 100644 (file)
@@ -19,7 +19,7 @@
 #ifndef __BTRFS_BACKREF__
 #define __BTRFS_BACKREF__
 
-#include "ioctl.h"
+#include <linux/btrfs.h>
 #include "ulist.h"
 #include "extent_io.h"
 
fs/btrfs/btrfs_inode.h
index 2a8c242bc4f5486f8173bbc0c1f41eeef92b81e7..d9b97d4960e654754c3e976c101618d8fc5668b0 100644 (file)
@@ -40,6 +40,8 @@
 #define BTRFS_INODE_HAS_ASYNC_EXTENT           6
 #define BTRFS_INODE_NEEDS_FULL_SYNC            7
 #define BTRFS_INODE_COPY_EVERYTHING            8
+#define BTRFS_INODE_IN_DELALLOC_LIST           9
+#define BTRFS_INODE_READDIO_NEED_LOCK          10
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -216,4 +218,22 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
        return 0;
 }
 
+/*
+ * Disable DIO read nolock optimization, so new dio readers will be forced
+ * to grab i_mutex. It is used to avoid the endless truncate due to
+ * nonlocked dio read.
+ */
+static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
+{
+       set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags);
+       smp_mb();
+}
+
+static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
+{
+       smp_mb__before_clear_bit();
+       clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+                 &BTRFS_I(inode)->runtime_flags);
+}
+
 #endif
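The two helpers above pair set_bit()/clear_bit() on BTRFS_INODE_READDIO_NEED_LOCK with memory barriers so a truncate can temporarily force direct-IO readers back onto i_mutex. The reader side is not part of this hunk; the sketch below shows one way a DIO read path could consult the flag. The helper name is hypothetical.

static inline int btrfs_dio_read_needs_lock(struct inode *inode)
{
	/*
	 * Pairs with the smp_mb() in btrfs_inode_block_unlocked_dio(): a
	 * reader that starts after the bit is set must observe it and take
	 * i_mutex instead of doing a nolock read.
	 */
	smp_mb();
	return test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
			&BTRFS_I(inode)->runtime_flags);
}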
fs/btrfs/check-integrity.c
index 11d47bfb62b418f6f4d5459d8c02a18b05c3731b..18af6f48781a1f31e1d41c23bb08a2e1b6ea12a7 100644 (file)
@@ -813,8 +813,7 @@ static int btrfsic_process_superblock_dev_mirror(
            (bh->b_data + (dev_bytenr & 4095));
 
        if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
-           strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
-                   sizeof(super_tmp->magic)) ||
+           super_tmp->magic != cpu_to_le64(BTRFS_MAGIC) ||
            memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
            btrfs_super_nodesize(super_tmp) != state->metablock_size ||
            btrfs_super_leafsize(super_tmp) != state->metablock_size ||
fs/btrfs/ctree.c
index eea5da7a2b9aa38ce28ee9b55d1c8ff5c141fa16..ecd25a1b4e519562ff874112515052403877fbc4 100644 (file)
@@ -1138,6 +1138,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
                switch (tm->op) {
                case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
                        BUG_ON(tm->slot < n);
+                       /* Fallthrough */
                case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
                case MOD_LOG_KEY_REMOVE:
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
@@ -1222,7 +1223,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 
        __tree_mod_log_rewind(eb_rewin, time_seq, tm);
        WARN_ON(btrfs_header_nritems(eb_rewin) >
-               BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root));
+               BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
 
        return eb_rewin;
 }
@@ -1441,7 +1442,7 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
  */
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct extent_buffer *parent,
-                      int start_slot, int cache_only, u64 *last_ret,
+                      int start_slot, u64 *last_ret,
                       struct btrfs_key *progress)
 {
        struct extent_buffer *cur;
@@ -1461,8 +1462,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
        struct btrfs_disk_key disk_key;
 
        parent_level = btrfs_header_level(parent);
-       if (cache_only && parent_level != 1)
-               return 0;
 
        WARN_ON(trans->transaction != root->fs_info->running_transaction);
        WARN_ON(trans->transid != root->fs_info->generation);
@@ -1508,10 +1507,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                else
                        uptodate = 0;
                if (!cur || !uptodate) {
-                       if (cache_only) {
-                               free_extent_buffer(cur);
-                               continue;
-                       }
                        if (!cur) {
                                cur = read_tree_block(root, blocknr,
                                                         blocksize, gen);
@@ -4825,8 +4820,8 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 
 /*
  * A helper function to walk down the tree starting at min_key, and looking
- * for nodes or leaves that are either in cache or have a minimum
- * transaction id.  This is used by the btree defrag code, and tree logging
+ * for nodes or leaves that have a minimum transaction id.
+ * This is used by the btree defrag code, and tree logging
  *
  * This does not cow, but it does stuff the starting key it finds back
  * into min_key, so you can call btrfs_search_slot with cow=1 on the
@@ -4847,7 +4842,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
  */
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
                         struct btrfs_key *max_key,
-                        struct btrfs_path *path, int cache_only,
+                        struct btrfs_path *path,
                         u64 min_trans)
 {
        struct extent_buffer *cur;
@@ -4887,15 +4882,12 @@ again:
                if (sret && slot > 0)
                        slot--;
                /*
-                * check this node pointer against the cache_only and
-                * min_trans parameters.  If it isn't in cache or is too
-                * old, skip to the next one.
+                * check this node pointer against the min_trans parameter.
+                * If it is too old, skip to the next one.
                 */
                while (slot < nritems) {
                        u64 blockptr;
                        u64 gen;
-                       struct extent_buffer *tmp;
-                       struct btrfs_disk_key disk_key;
 
                        blockptr = btrfs_node_blockptr(cur, slot);
                        gen = btrfs_node_ptr_generation(cur, slot);
@@ -4903,27 +4895,7 @@ again:
                                slot++;
                                continue;
                        }
-                       if (!cache_only)
-                               break;
-
-                       if (max_key) {
-                               btrfs_node_key(cur, &disk_key, slot);
-                               if (comp_keys(&disk_key, max_key) >= 0) {
-                                       ret = 1;
-                                       goto out;
-                               }
-                       }
-
-                       tmp = btrfs_find_tree_block(root, blockptr,
-                                           btrfs_level_size(root, level - 1));
-
-                       if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
-                               free_extent_buffer(tmp);
-                               break;
-                       }
-                       if (tmp)
-                               free_extent_buffer(tmp);
-                       slot++;
+                       break;
                }
 find_next_key:
                /*
@@ -4934,7 +4906,7 @@ find_next_key:
                        path->slots[level] = slot;
                        btrfs_set_path_blocking(path);
                        sret = btrfs_find_next_key(root, path, min_key, level,
-                                                 cache_only, min_trans);
+                                                 min_trans);
                        if (sret == 0) {
                                btrfs_release_path(path);
                                goto again;
@@ -5399,8 +5371,7 @@ out:
 /*
  * this is similar to btrfs_next_leaf, but does not try to preserve
  * and fixup the path.  It looks for and returns the next key in the
- * tree based on the current path and the cache_only and min_trans
- * parameters.
+ * tree based on the current path and the min_trans parameters.
  *
  * 0 is returned if another key is found, < 0 if there are any errors
  * and 1 is returned if there are no higher keys in the tree
@@ -5409,8 +5380,7 @@ out:
  * calling this function.
  */
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
-                       struct btrfs_key *key, int level,
-                       int cache_only, u64 min_trans)
+                       struct btrfs_key *key, int level, u64 min_trans)
 {
        int slot;
        struct extent_buffer *c;
@@ -5461,22 +5431,8 @@ next:
                if (level == 0)
                        btrfs_item_key_to_cpu(c, key, slot);
                else {
-                       u64 blockptr = btrfs_node_blockptr(c, slot);
                        u64 gen = btrfs_node_ptr_generation(c, slot);
 
-                       if (cache_only) {
-                               struct extent_buffer *cur;
-                               cur = btrfs_find_tree_block(root, blockptr,
-                                           btrfs_level_size(root, level - 1));
-                               if (!cur ||
-                                   btrfs_buffer_uptodate(cur, gen, 1) <= 0) {
-                                       slot++;
-                                       if (cur)
-                                               free_extent_buffer(cur);
-                                       goto next;
-                               }
-                               free_extent_buffer(cur);
-                       }
                        if (gen < min_trans) {
                                slot++;
                                goto next;
fs/btrfs/ctree.h
index 547b7b05727f917dfc2bad6516f5c5b30ea12c68..1679051f4d3971f176e393fd93847f597d9e61e4 100644 (file)
 #include <trace/events/btrfs.h>
 #include <asm/kmap_types.h>
 #include <linux/pagemap.h>
+#include <linux/btrfs.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
-#include "ioctl.h"
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
@@ -46,7 +46,7 @@ extern struct kmem_cache *btrfs_path_cachep;
 extern struct kmem_cache *btrfs_free_space_cachep;
 struct btrfs_ordered_sum;
 
-#define BTRFS_MAGIC "_BHRfS_M"
+#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
 
 #define BTRFS_MAX_MIRRORS 3
 
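The new BTRFS_MAGIC encodes the old "_BHRfS_M" string as the little-endian integer that is actually stored in the superblock, which is what lets the strncmp() checks elsewhere in this merge become a single cpu_to_le64() comparison. A small stand-alone check (user space, assuming a little-endian host) that the bytes of 0x4D5F53665248425F spell the old string:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	uint64_t magic = 0x4D5F53665248425FULL;	/* BTRFS_MAGIC */
	char buf[9] = { 0 };

	/* on a little-endian host the in-memory byte order is the on-disk order */
	memcpy(buf, &magic, sizeof(magic));
	printf("%s\n", buf);	/* prints: _BHRfS_M */
	return 0;
}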
@@ -191,6 +191,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 /* ioprio of readahead is set to idle */
 #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
 
+#define BTRFS_DIRTY_METADATA_THRESH    (32 * 1024 * 1024)
+
 /*
  * The key defines the order in the tree, and so it also defines (optimal)
  * block layout.
@@ -336,7 +338,9 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 /*
  * File system states
  */
+#define BTRFS_FS_STATE_ERROR           0
 
+/* Super block flags */
 /* Errors detected */
 #define BTRFS_SUPER_FLAG_ERROR         (1ULL << 2)
 
@@ -953,7 +957,15 @@ struct btrfs_dev_replace_item {
 #define BTRFS_BLOCK_GROUP_DUP          (1ULL << 5)
 #define BTRFS_BLOCK_GROUP_RAID10       (1ULL << 6)
 #define BTRFS_BLOCK_GROUP_RESERVED     BTRFS_AVAIL_ALLOC_BIT_SINGLE
-#define BTRFS_NR_RAID_TYPES            5
+
+enum btrfs_raid_types {
+       BTRFS_RAID_RAID10,
+       BTRFS_RAID_RAID1,
+       BTRFS_RAID_DUP,
+       BTRFS_RAID_RAID0,
+       BTRFS_RAID_SINGLE,
+       BTRFS_NR_RAID_TYPES
+};
 
 #define BTRFS_BLOCK_GROUP_TYPE_MASK    (BTRFS_BLOCK_GROUP_DATA |    \
                                         BTRFS_BLOCK_GROUP_SYSTEM |  \
@@ -1225,6 +1237,11 @@ struct seq_list {
        u64 seq;
 };
 
+enum btrfs_orphan_cleanup_state {
+       ORPHAN_CLEANUP_STARTED  = 1,
+       ORPHAN_CLEANUP_DONE     = 2,
+};
+
 /* fs_info */
 struct reloc_control;
 struct btrfs_device;
@@ -1250,6 +1267,7 @@ struct btrfs_fs_info {
 
        /* block group cache stuff */
        spinlock_t block_group_cache_lock;
+       u64 first_logical_byte;
        struct rb_root block_group_cache_tree;
 
        /* keep track of unallocated space */
@@ -1288,7 +1306,23 @@ struct btrfs_fs_info {
        u64 last_trans_log_full_commit;
        unsigned long mount_opt;
        unsigned long compress_type:4;
+       /*
+        * This is a suggestive number: the read side is safe even if it gets
+        * a wrong number, because we will write out the data into a regular
+        * extent. The write side (mount/remount) is under the ->s_umount lock,
+        * so it is also safe.
+        */
        u64 max_inline;
+       /*
+        * Protected by ->chunk_mutex and sb->s_umount.
+        *
+        * The reason we use two locks to protect it is that only mount and
+        * remount operations can change it, and those operations are already
+        * under sb->s_umount, while the read side (chunk allocation) cannot
+        * take sb->s_umount or a deadlock would happen. So we use two locks:
+        * on the write side we must acquire both, and on the read side we
+        * only need to acquire one of them.
+        */
        u64 alloc_start;
        struct btrfs_transaction *running_transaction;
        wait_queue_head_t transaction_throttle;
@@ -1365,6 +1399,7 @@ struct btrfs_fs_info {
         */
        struct list_head ordered_extents;
 
+       spinlock_t delalloc_lock;
        /*
         * all of the inodes that have delalloc bytes.  It is possible for
         * this list to be empty even when there is still dirty data=ordered
@@ -1372,13 +1407,6 @@ struct btrfs_fs_info {
         */
        struct list_head delalloc_inodes;
 
-       /*
-        * special rename and truncate targets that must be on disk before
-        * we're allowed to commit.  This is basically the ext3 style
-        * data=ordered list.
-        */
-       struct list_head ordered_operations;
-
        /*
         * there is a pool of worker threads for checksumming during writes
         * and a pool for checksumming after reads.  This is because readers
@@ -1423,10 +1451,12 @@ struct btrfs_fs_info {
 
        u64 total_pinned;
 
-       /* protected by the delalloc lock, used to keep from writing
-        * metadata until there is a nice batch
-        */
-       u64 dirty_metadata_bytes;
+       /* used to keep from writing metadata until there is a nice batch */
+       struct percpu_counter dirty_metadata_bytes;
+       struct percpu_counter delalloc_bytes;
+       s32 dirty_metadata_batch;
+       s32 delalloc_batch;
+
        struct list_head dirty_cowonly_roots;
 
        struct btrfs_fs_devices *fs_devices;
@@ -1442,9 +1472,6 @@ struct btrfs_fs_info {
 
        struct reloc_control *reloc_ctl;
 
-       spinlock_t delalloc_lock;
-       u64 delalloc_bytes;
-
        /* data_alloc_cluster is only used in ssd mode */
        struct btrfs_free_cluster data_alloc_cluster;
 
@@ -1456,6 +1483,8 @@ struct btrfs_fs_info {
        struct rb_root defrag_inodes;
        atomic_t defrag_running;
 
+       /* Used to protect avail_{data, metadata, system}_alloc_bits */
+       seqlock_t profiles_lock;
        /*
         * these three are in extended format (availability of single
         * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
@@ -1520,7 +1549,7 @@ struct btrfs_fs_info {
        u64 qgroup_seq;
 
        /* filesystem state */
-       u64 fs_state;
+       unsigned long fs_state;
 
        struct btrfs_delayed_root *delayed_root;
 
@@ -1623,6 +1652,9 @@ struct btrfs_root {
 
        struct list_head root_list;
 
+       spinlock_t log_extents_lock[2];
+       struct list_head logged_list[2];
+
        spinlock_t orphan_lock;
        atomic_t orphan_inodes;
        struct btrfs_block_rsv *orphan_block_rsv;
@@ -2936,8 +2968,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
                             u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
                     u64 bytenr, u64 num, int reserved);
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
-                                   struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
                                    u64 bytenr, u64 num_bytes);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
@@ -3092,10 +3123,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
                        struct btrfs_key *key, int lowest_level,
-                       int cache_only, u64 min_trans);
+                       u64 min_trans);
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
                         struct btrfs_key *max_key,
-                        struct btrfs_path *path, int cache_only,
+                        struct btrfs_path *path,
                         u64 min_trans);
 enum btrfs_compare_tree_result {
        BTRFS_COMPARE_TREE_NEW,
@@ -3148,7 +3179,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,
                               int find_higher, int return_any);
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct extent_buffer *parent,
-                      int start_slot, int cache_only, u64 *last_ret,
+                      int start_slot, u64 *last_ret,
                       struct btrfs_key *progress);
 void btrfs_release_path(struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
@@ -3543,7 +3574,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
-                       struct btrfs_root *root, int cache_only);
+                       struct btrfs_root *root);
 
 /* sysfs.c */
 int btrfs_init_sysfs(void);
@@ -3620,11 +3651,14 @@ __printf(5, 6)
 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
                   unsigned int line, int errno, const char *fmt, ...);
 
+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic().  Otherwise we BUG() here.
+ */
 #define btrfs_panic(fs_info, errno, fmt, args...)                      \
 do {                                                                   \
-       struct btrfs_fs_info *_i = (fs_info);                           \
-       __btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args);      \
-       BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR));    \
+       __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
+       BUG();                                                          \
 } while (0)
 
 /* acl.c */
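The comment above explains why the rewritten btrfs_panic() can call BUG() unconditionally: __btrfs_panic() only returns when the panic-on-fatal-error mount option is not set. A rough sketch of that control flow follows, under that assumption; this is not the actual __btrfs_panic() body and the sketch name is made up.

/* Sketch only: why the macro may BUG() unconditionally after the call. */
static void __btrfs_panic_sketch(struct btrfs_fs_info *fs_info, int errno)
{
	if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)
		panic("btrfs fatal error %d", errno);	/* never returns */

	/* otherwise only log; btrfs_panic() then falls through to BUG() */
	printk(KERN_CRIT "btrfs fatal error %d\n", errno);
}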
@@ -3745,4 +3779,11 @@ static inline int is_fstree(u64 rootid)
                return 1;
        return 0;
 }
+
+static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
+{
+       return signal_pending(current);
+}
+
+
 #endif
fs/btrfs/delayed-inode.c
index 34836036f01bc5bec6fff8768e1365cf69d500c0..0b278b117cbe609611764cf45127f89a133eed5f 100644 (file)
@@ -875,7 +875,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
                                     struct btrfs_delayed_item *delayed_item)
 {
        struct extent_buffer *leaf;
-       struct btrfs_item *item;
        char *ptr;
        int ret;
 
@@ -886,7 +885,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
 
        leaf = path->nodes[0];
 
-       item = btrfs_item_nr(leaf, path->slots[0]);
        ptr = btrfs_item_ptr(leaf, path->slots[0], char);
 
        write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
@@ -1065,32 +1063,25 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
        }
 }
 
-static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
-                                     struct btrfs_root *root,
-                                     struct btrfs_path *path,
-                                     struct btrfs_delayed_node *node)
+static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+                                       struct btrfs_root *root,
+                                       struct btrfs_path *path,
+                                       struct btrfs_delayed_node *node)
 {
        struct btrfs_key key;
        struct btrfs_inode_item *inode_item;
        struct extent_buffer *leaf;
        int ret;
 
-       mutex_lock(&node->mutex);
-       if (!node->inode_dirty) {
-               mutex_unlock(&node->mutex);
-               return 0;
-       }
-
        key.objectid = node->inode_id;
        btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
        key.offset = 0;
+
        ret = btrfs_lookup_inode(trans, root, path, &key, 1);
        if (ret > 0) {
                btrfs_release_path(path);
-               mutex_unlock(&node->mutex);
                return -ENOENT;
        } else if (ret < 0) {
-               mutex_unlock(&node->mutex);
                return ret;
        }
 
@@ -1105,11 +1096,47 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 
        btrfs_delayed_inode_release_metadata(root, node);
        btrfs_release_delayed_inode(node);
-       mutex_unlock(&node->mutex);
 
        return 0;
 }
 
+static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+                                            struct btrfs_root *root,
+                                            struct btrfs_path *path,
+                                            struct btrfs_delayed_node *node)
+{
+       int ret;
+
+       mutex_lock(&node->mutex);
+       if (!node->inode_dirty) {
+               mutex_unlock(&node->mutex);
+               return 0;
+       }
+
+       ret = __btrfs_update_delayed_inode(trans, root, path, node);
+       mutex_unlock(&node->mutex);
+       return ret;
+}
+
+static inline int
+__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+                                  struct btrfs_path *path,
+                                  struct btrfs_delayed_node *node)
+{
+       int ret;
+
+       ret = btrfs_insert_delayed_items(trans, path, node->root, node);
+       if (ret)
+               return ret;
+
+       ret = btrfs_delete_delayed_items(trans, path, node->root, node);
+       if (ret)
+               return ret;
+
+       ret = btrfs_update_delayed_inode(trans, node->root, path, node);
+       return ret;
+}
+
 /*
  * Called when committing the transaction.
  * Returns 0 on success.
@@ -1119,7 +1146,6 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root, int nr)
 {
-       struct btrfs_root *curr_root = root;
        struct btrfs_delayed_root *delayed_root;
        struct btrfs_delayed_node *curr_node, *prev_node;
        struct btrfs_path *path;
@@ -1142,15 +1168,8 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
 
        curr_node = btrfs_first_delayed_node(delayed_root);
        while (curr_node && (!count || (count && nr--))) {
-               curr_root = curr_node->root;
-               ret = btrfs_insert_delayed_items(trans, path, curr_root,
-                                                curr_node);
-               if (!ret)
-                       ret = btrfs_delete_delayed_items(trans, path,
-                                               curr_root, curr_node);
-               if (!ret)
-                       ret = btrfs_update_delayed_inode(trans, curr_root,
-                                               path, curr_node);
+               ret = __btrfs_commit_inode_delayed_items(trans, path,
+                                                        curr_node);
                if (ret) {
                        btrfs_release_delayed_node(curr_node);
                        curr_node = NULL;
@@ -1183,51 +1202,93 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
        return __btrfs_run_delayed_items(trans, root, nr);
 }
 
-static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
-                                             struct btrfs_delayed_node *node)
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+                                    struct inode *inode)
 {
+       struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
        struct btrfs_path *path;
        struct btrfs_block_rsv *block_rsv;
        int ret;
 
+       if (!delayed_node)
+               return 0;
+
+       mutex_lock(&delayed_node->mutex);
+       if (!delayed_node->count) {
+               mutex_unlock(&delayed_node->mutex);
+               btrfs_release_delayed_node(delayed_node);
+               return 0;
+       }
+       mutex_unlock(&delayed_node->mutex);
+
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->leave_spinning = 1;
 
        block_rsv = trans->block_rsv;
-       trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
+       trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
 
-       ret = btrfs_insert_delayed_items(trans, path, node->root, node);
-       if (!ret)
-               ret = btrfs_delete_delayed_items(trans, path, node->root, node);
-       if (!ret)
-               ret = btrfs_update_delayed_inode(trans, node->root, path, node);
-       btrfs_free_path(path);
+       ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
 
+       btrfs_release_delayed_node(delayed_node);
+       btrfs_free_path(path);
        trans->block_rsv = block_rsv;
+
        return ret;
 }
 
-int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
-                                    struct inode *inode)
+int btrfs_commit_inode_delayed_inode(struct inode *inode)
 {
+       struct btrfs_trans_handle *trans;
        struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+       struct btrfs_path *path;
+       struct btrfs_block_rsv *block_rsv;
        int ret;
 
        if (!delayed_node)
                return 0;
 
        mutex_lock(&delayed_node->mutex);
-       if (!delayed_node->count) {
+       if (!delayed_node->inode_dirty) {
                mutex_unlock(&delayed_node->mutex);
                btrfs_release_delayed_node(delayed_node);
                return 0;
        }
        mutex_unlock(&delayed_node->mutex);
 
-       ret = __btrfs_commit_inode_delayed_items(trans, delayed_node);
+       trans = btrfs_join_transaction(delayed_node->root);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out;
+       }
+
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto trans_out;
+       }
+       path->leave_spinning = 1;
+
+       block_rsv = trans->block_rsv;
+       trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
+
+       mutex_lock(&delayed_node->mutex);
+       if (delayed_node->inode_dirty)
+               ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
+                                                  path, delayed_node);
+       else
+               ret = 0;
+       mutex_unlock(&delayed_node->mutex);
+
+       btrfs_free_path(path);
+       trans->block_rsv = block_rsv;
+trans_out:
+       btrfs_end_transaction(trans, delayed_node->root);
+       btrfs_btree_balance_dirty(delayed_node->root);
+out:
        btrfs_release_delayed_node(delayed_node);
+
        return ret;
 }
 
@@ -1258,7 +1319,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
        struct btrfs_root *root;
        struct btrfs_block_rsv *block_rsv;
        int need_requeue = 0;
-       int ret;
 
        async_node = container_of(work, struct btrfs_async_delayed_node, work);
 
@@ -1277,14 +1337,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
        block_rsv = trans->block_rsv;
        trans->block_rsv = &root->fs_info->delayed_block_rsv;
 
-       ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
-       if (!ret)
-               ret = btrfs_delete_delayed_items(trans, path, root,
-                                                delayed_node);
-
-       if (!ret)
-               btrfs_update_delayed_inode(trans, root, path, delayed_node);
-
+       __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
        /*
         * Maybe new delayed items have been inserted, so we need requeue
         * the work. Besides that, we must dequeue the empty delayed nodes
fs/btrfs/delayed-inode.h
index 4f808e1baeed06a3f71d1938159c57c291f55ea0..78b6ad0fc6699c5c59d5f9ccddef641aa80c5d09 100644 (file)
@@ -117,6 +117,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
 /* Used for evicting the inode. */
 void btrfs_remove_delayed_node(struct inode *inode);
 void btrfs_kill_delayed_inode_items(struct inode *inode);
+int btrfs_commit_inode_delayed_inode(struct inode *inode);
 
 
 int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
fs/btrfs/delayed-ref.c
index ae94117733973e2d7aa8ea590c171fcf2e6bd26a..b7a0641ead7729bd25b21ad3f0d1a44b76799172 100644 (file)
 #include "delayed-ref.h"
 #include "transaction.h"
 
+struct kmem_cache *btrfs_delayed_ref_head_cachep;
+struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+struct kmem_cache *btrfs_delayed_data_ref_cachep;
+struct kmem_cache *btrfs_delayed_extent_op_cachep;
 /*
  * delayed back reference update tracking.  For subvolume trees
  * we queue up extent allocations and backref maintenance for
@@ -422,6 +426,14 @@ again:
        return 1;
 }
 
+void btrfs_release_ref_cluster(struct list_head *cluster)
+{
+       struct list_head *pos, *q;
+
+       list_for_each_safe(pos, q, cluster)
+               list_del_init(pos);
+}
+
 /*
  * helper function to update an extent delayed ref in the
  * rbtree.  existing and update must both have the same
@@ -511,7 +523,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
                                        ref->extent_op->flags_to_set;
                                existing_ref->extent_op->update_flags = 1;
                        }
-                       kfree(ref->extent_op);
+                       btrfs_free_delayed_extent_op(ref->extent_op);
                }
        }
        /*
@@ -592,7 +604,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
                 * we've updated the existing ref, free the newly
                 * allocated ref
                 */
-               kfree(head_ref);
+               kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
        } else {
                delayed_refs->num_heads++;
                delayed_refs->num_heads_ready++;
@@ -653,7 +665,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
                 * we've updated the existing ref, free the newly
                 * allocated ref
                 */
-               kfree(full_ref);
+               kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
        } else {
                delayed_refs->num_entries++;
                trans->delayed_ref_updates++;
@@ -714,7 +726,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
                 * we've updated the existing ref, free the newly
                 * allocated ref
                 */
-               kfree(full_ref);
+               kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
        } else {
                delayed_refs->num_entries++;
                trans->delayed_ref_updates++;
@@ -738,13 +750,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
        struct btrfs_delayed_ref_root *delayed_refs;
 
        BUG_ON(extent_op && extent_op->is_data);
-       ref = kmalloc(sizeof(*ref), GFP_NOFS);
+       ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
        if (!ref)
                return -ENOMEM;
 
-       head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+       head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
        if (!head_ref) {
-               kfree(ref);
+               kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
                return -ENOMEM;
        }
 
@@ -786,13 +798,13 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
        struct btrfs_delayed_ref_root *delayed_refs;
 
        BUG_ON(extent_op && !extent_op->is_data);
-       ref = kmalloc(sizeof(*ref), GFP_NOFS);
+       ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
        if (!ref)
                return -ENOMEM;
 
-       head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+       head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
        if (!head_ref) {
-               kfree(ref);
+               kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
                return -ENOMEM;
        }
 
@@ -826,7 +838,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
        struct btrfs_delayed_ref_head *head_ref;
        struct btrfs_delayed_ref_root *delayed_refs;
 
-       head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+       head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
        if (!head_ref)
                return -ENOMEM;
 
@@ -860,3 +872,51 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
                return btrfs_delayed_node_to_head(ref);
        return NULL;
 }
+
+void btrfs_delayed_ref_exit(void)
+{
+       if (btrfs_delayed_ref_head_cachep)
+               kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
+       if (btrfs_delayed_tree_ref_cachep)
+               kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
+       if (btrfs_delayed_data_ref_cachep)
+               kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
+       if (btrfs_delayed_extent_op_cachep)
+               kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
+}
+
+int btrfs_delayed_ref_init(void)
+{
+       btrfs_delayed_ref_head_cachep = kmem_cache_create(
+                               "btrfs_delayed_ref_head",
+                               sizeof(struct btrfs_delayed_ref_head), 0,
+                               SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+       if (!btrfs_delayed_ref_head_cachep)
+               goto fail;
+
+       btrfs_delayed_tree_ref_cachep = kmem_cache_create(
+                               "btrfs_delayed_tree_ref",
+                               sizeof(struct btrfs_delayed_tree_ref), 0,
+                               SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+       if (!btrfs_delayed_tree_ref_cachep)
+               goto fail;
+
+       btrfs_delayed_data_ref_cachep = kmem_cache_create(
+                               "btrfs_delayed_data_ref",
+                               sizeof(struct btrfs_delayed_data_ref), 0,
+                               SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+       if (!btrfs_delayed_data_ref_cachep)
+               goto fail;
+
+       btrfs_delayed_extent_op_cachep = kmem_cache_create(
+                               "btrfs_delayed_extent_op",
+                               sizeof(struct btrfs_delayed_extent_op), 0,
+                               SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+       if (!btrfs_delayed_extent_op_cachep)
+               goto fail;
+
+       return 0;
+fail:
+       btrfs_delayed_ref_exit();
+       return -ENOMEM;
+}
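btrfs_delayed_ref_init() and btrfs_delayed_ref_exit() create and destroy the four slab caches once per module load, so the hot delayed-ref paths above can switch from kmalloc()/kfree() to kmem_cache_alloc()/kmem_cache_free(). The call sites are not part of this hunk; below is a hypothetical sketch of the expected module-init wiring (function names invented for illustration).

static int __init init_btrfs_fs_sketch(void)
{
	int err;

	err = btrfs_delayed_ref_init();
	if (err)
		return err;

	/* ... create other caches, register the filesystem ... */
	return 0;
}

static void __exit exit_btrfs_fs_sketch(void)
{
	/* ... unregister the filesystem ... */
	btrfs_delayed_ref_exit();
}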
fs/btrfs/delayed-ref.h
index c9d703693df0b91e4c01a9181f93d2ab64572dcf..7939149f8f274c96813f7097a7825af018aec154 100644 (file)
@@ -141,12 +141,47 @@ struct btrfs_delayed_ref_root {
        u64 run_delayed_start;
 };
 
+extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
+extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
+
+int btrfs_delayed_ref_init(void);
+void btrfs_delayed_ref_exit(void);
+
+static inline struct btrfs_delayed_extent_op *
+btrfs_alloc_delayed_extent_op(void)
+{
+       return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS);
+}
+
+static inline void
+btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
+{
+       if (op)
+               kmem_cache_free(btrfs_delayed_extent_op_cachep, op);
+}
+
 static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
 {
        WARN_ON(atomic_read(&ref->refs) == 0);
        if (atomic_dec_and_test(&ref->refs)) {
                WARN_ON(ref->in_tree);
-               kfree(ref);
+               switch (ref->type) {
+               case BTRFS_TREE_BLOCK_REF_KEY:
+               case BTRFS_SHARED_BLOCK_REF_KEY:
+                       kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+                       break;
+               case BTRFS_EXTENT_DATA_REF_KEY:
+               case BTRFS_SHARED_DATA_REF_KEY:
+                       kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+                       break;
+               case 0:
+                       kmem_cache_free(btrfs_delayed_ref_head_cachep, ref);
+                       break;
+               default:
+                       BUG();
+               }
        }
 }
 
@@ -176,8 +211,14 @@ struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
                           struct btrfs_delayed_ref_head *head);
+static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
+{
+       mutex_unlock(&head->mutex);
+}
+
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
                           struct list_head *cluster, u64 search_start);
+void btrfs_release_ref_cluster(struct list_head *cluster);
 
 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
                            struct btrfs_delayed_ref_root *delayed_refs,
fs/btrfs/dev-replace.c
index 66dbc8dbddf7584a7c4034203c896cf37fcecb39..7ba7b3900cb8eb749b82241cc4533b4fc978ef8a 100644 (file)
@@ -465,7 +465,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
         * flush all outstanding I/O and inode extent mappings before the
         * copy operation is declared as being finished
         */
-       btrfs_start_delalloc_inodes(root, 0);
+       ret = btrfs_start_delalloc_inodes(root, 0);
+       if (ret) {
+               mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+               return ret;
+       }
        btrfs_wait_ordered_extents(root, 0);
 
        trans = btrfs_start_transaction(root, 0);
fs/btrfs/disk-io.c
index a8f652dc940bd85148dad48c11d2d893aefe32ba..779b401cd952526c6495fada08941611d304e7c8 100644 (file)
@@ -56,7 +56,8 @@ static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
                                    int read_only);
-static void btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
+                                            struct btrfs_root *root);
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                                      struct btrfs_root *root);
@@ -420,7 +421,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 {
        struct extent_io_tree *tree;
-       u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+       u64 start = page_offset(page);
        u64 found_start;
        struct extent_buffer *eb;
 
@@ -946,18 +947,20 @@ static int btree_writepages(struct address_space *mapping,
                            struct writeback_control *wbc)
 {
        struct extent_io_tree *tree;
+       struct btrfs_fs_info *fs_info;
+       int ret;
+
        tree = &BTRFS_I(mapping->host)->io_tree;
        if (wbc->sync_mode == WB_SYNC_NONE) {
-               struct btrfs_root *root = BTRFS_I(mapping->host)->root;
-               u64 num_dirty;
-               unsigned long thresh = 32 * 1024 * 1024;
 
                if (wbc->for_kupdate)
                        return 0;
 
+               fs_info = BTRFS_I(mapping->host)->root->fs_info;
                /* this is a bit racy, but that's ok */
-               num_dirty = root->fs_info->dirty_metadata_bytes;
-               if (num_dirty < thresh)
+               ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
+                                            BTRFS_DIRTY_METADATA_THRESH);
+               if (ret < 0)
                        return 0;
        }
        return btree_write_cache_pages(mapping, wbc);
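dirty_metadata_bytes becomes a batched per-cpu counter here: writers fold a per-cpu delta into the shared count only once it exceeds dirty_metadata_batch, and percpu_counter_compare() reports whether the (approximate) total is below, at, or above BTRFS_DIRTY_METADATA_THRESH. A condensed view of the two halves of the pattern as used later in this patch:

/* account newly dirtied metadata; mostly lock-free thanks to the batch */
__percpu_counter_add(&fs_info->dirty_metadata_bytes, buf->len,
		     fs_info->dirty_metadata_batch);

/* only push back once roughly 32MB of metadata is dirty */
if (percpu_counter_compare(&fs_info->dirty_metadata_bytes,
			   BTRFS_DIRTY_METADATA_THRESH) > 0)
	balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);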
@@ -1125,24 +1128,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      struct extent_buffer *buf)
 {
+       struct btrfs_fs_info *fs_info = root->fs_info;
+
        if (btrfs_header_generation(buf) ==
-           root->fs_info->running_transaction->transid) {
+           fs_info->running_transaction->transid) {
                btrfs_assert_tree_locked(buf);
 
                if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
-                       spin_lock(&root->fs_info->delalloc_lock);
-                       if (root->fs_info->dirty_metadata_bytes >= buf->len)
-                               root->fs_info->dirty_metadata_bytes -= buf->len;
-                       else {
-                               spin_unlock(&root->fs_info->delalloc_lock);
-                               btrfs_panic(root->fs_info, -EOVERFLOW,
-                                         "Can't clear %lu bytes from "
-                                         " dirty_mdatadata_bytes (%llu)",
-                                         buf->len,
-                                         root->fs_info->dirty_metadata_bytes);
-                       }
-                       spin_unlock(&root->fs_info->delalloc_lock);
-
+                       __percpu_counter_add(&fs_info->dirty_metadata_bytes,
+                                            -buf->len,
+                                            fs_info->dirty_metadata_batch);
                        /* ugh, clear_extent_buffer_dirty needs to lock the page */
                        btrfs_set_lock_blocking(buf);
                        clear_extent_buffer_dirty(buf);
@@ -1178,9 +1173,13 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 
        INIT_LIST_HEAD(&root->dirty_list);
        INIT_LIST_HEAD(&root->root_list);
+       INIT_LIST_HEAD(&root->logged_list[0]);
+       INIT_LIST_HEAD(&root->logged_list[1]);
        spin_lock_init(&root->orphan_lock);
        spin_lock_init(&root->inode_lock);
        spin_lock_init(&root->accounting_lock);
+       spin_lock_init(&root->log_extents_lock[0]);
+       spin_lock_init(&root->log_extents_lock[1]);
        mutex_init(&root->objectid_mutex);
        mutex_init(&root->log_mutex);
        init_waitqueue_head(&root->log_writer_wait);
@@ -2004,10 +2003,24 @@ int open_ctree(struct super_block *sb,
                goto fail_srcu;
        }
 
+       ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
+       if (ret) {
+               err = ret;
+               goto fail_bdi;
+       }
+       fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
+                                       (1 + ilog2(nr_cpu_ids));
+
+       ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
+       if (ret) {
+               err = ret;
+               goto fail_dirty_metadata_bytes;
+       }
+
        fs_info->btree_inode = new_inode(sb);
        if (!fs_info->btree_inode) {
                err = -ENOMEM;
-               goto fail_bdi;
+               goto fail_delalloc_bytes;
        }
 
        mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2017,7 +2030,6 @@ int open_ctree(struct super_block *sb,
        INIT_LIST_HEAD(&fs_info->dead_roots);
        INIT_LIST_HEAD(&fs_info->delayed_iputs);
        INIT_LIST_HEAD(&fs_info->delalloc_inodes);
-       INIT_LIST_HEAD(&fs_info->ordered_operations);
        INIT_LIST_HEAD(&fs_info->caching_block_groups);
        spin_lock_init(&fs_info->delalloc_lock);
        spin_lock_init(&fs_info->trans_lock);
@@ -2028,6 +2040,7 @@ int open_ctree(struct super_block *sb,
        spin_lock_init(&fs_info->tree_mod_seq_lock);
        rwlock_init(&fs_info->tree_mod_log_lock);
        mutex_init(&fs_info->reloc_mutex);
+       seqlock_init(&fs_info->profiles_lock);
 
        init_completion(&fs_info->kobj_unregister);
        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2126,6 +2139,7 @@ int open_ctree(struct super_block *sb,
 
        spin_lock_init(&fs_info->block_group_cache_lock);
        fs_info->block_group_cache_tree = RB_ROOT;
+       fs_info->first_logical_byte = (u64)-1;
 
        extent_io_tree_init(&fs_info->freed_extents[0],
                             fs_info->btree_inode->i_mapping);
@@ -2187,7 +2201,8 @@ int open_ctree(struct super_block *sb,
                goto fail_alloc;
 
        /* check FS state, whether FS is broken. */
-       fs_info->fs_state |= btrfs_super_flags(disk_super);
+       if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
+               set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
 
        ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
        if (ret) {
@@ -2261,6 +2276,8 @@ int open_ctree(struct super_block *sb,
        leafsize = btrfs_super_leafsize(disk_super);
        sectorsize = btrfs_super_sectorsize(disk_super);
        stripesize = btrfs_super_stripesize(disk_super);
+       fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
+       fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
 
        /*
         * mixed block groups end up with duplicate but slightly offset
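The batch sizes are recomputed here from the on-disk leaf and sector sizes (dirty_metadata_batch was provisionally set from PAGE_CACHE_SIZE earlier in open_ctree(), before the super block was read) and scale with the number of possible CPUs so that per-cpu drift stays bounded. For example, assuming 4KiB leaves, 4KiB sectors and nr_cpu_ids = 8, ilog2(8) = 3, so dirty_metadata_batch = 4096 * (1 + 3) = 16KiB and delalloc_batch = 4096 * 512 * (1 + 3) = 8MiB of per-cpu slack before the shared counter is updated.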
@@ -2390,8 +2407,7 @@ int open_ctree(struct super_block *sb,
        sb->s_blocksize = sectorsize;
        sb->s_blocksize_bits = blksize_bits(sectorsize);
 
-       if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-                   sizeof(disk_super->magic))) {
+       if (disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) {
                printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
                goto fail_sb_buffer;
        }
@@ -2694,13 +2710,13 @@ fail_cleaner:
         * kthreads
         */
        filemap_write_and_wait(fs_info->btree_inode->i_mapping);
-       invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
 fail_block_groups:
        btrfs_free_block_groups(fs_info);
 
 fail_tree_roots:
        free_root_pointers(fs_info, 1);
+       invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
 fail_sb_buffer:
        btrfs_stop_workers(&fs_info->generic_worker);
@@ -2721,8 +2737,11 @@ fail_alloc:
 fail_iput:
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
-       invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
        iput(fs_info->btree_inode);
+fail_delalloc_bytes:
+       percpu_counter_destroy(&fs_info->delalloc_bytes);
+fail_dirty_metadata_bytes:
+       percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
 fail_bdi:
        bdi_destroy(&fs_info->bdi);
 fail_srcu:
@@ -2795,8 +2814,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
 
                super = (struct btrfs_super_block *)bh->b_data;
                if (btrfs_super_bytenr(super) != bytenr ||
-                   strncmp((char *)(&super->magic), BTRFS_MAGIC,
-                           sizeof(super->magic))) {
+                   super->magic != cpu_to_le64(BTRFS_MAGIC)) {
                        brelse(bh);
                        continue;
                }
@@ -3339,7 +3357,7 @@ int close_ctree(struct btrfs_root *root)
                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
        }
 
-       if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+       if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
                btrfs_error_commit_super(root);
 
        btrfs_put_block_group_cache(fs_info);
@@ -3352,9 +3370,9 @@ int close_ctree(struct btrfs_root *root)
 
        btrfs_free_qgroup_config(root->fs_info);
 
-       if (fs_info->delalloc_bytes) {
-               printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
-                      (unsigned long long)fs_info->delalloc_bytes);
+       if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
+               printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n",
+                      percpu_counter_sum(&fs_info->delalloc_bytes));
        }
 
        free_extent_buffer(fs_info->extent_root->node);
@@ -3401,6 +3419,8 @@ int close_ctree(struct btrfs_root *root)
        btrfs_close_devices(fs_info->fs_devices);
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
+       percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+       percpu_counter_destroy(&fs_info->delalloc_bytes);
        bdi_destroy(&fs_info->bdi);
        cleanup_srcu_struct(&fs_info->subvol_srcu);
 
@@ -3443,11 +3463,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
                        (unsigned long long)transid,
                        (unsigned long long)root->fs_info->generation);
        was_dirty = set_extent_buffer_dirty(buf);
-       if (!was_dirty) {
-               spin_lock(&root->fs_info->delalloc_lock);
-               root->fs_info->dirty_metadata_bytes += buf->len;
-               spin_unlock(&root->fs_info->delalloc_lock);
-       }
+       if (!was_dirty)
+               __percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
+                                    buf->len,
+                                    root->fs_info->dirty_metadata_batch);
 }
 
 static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
@@ -3457,8 +3476,7 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
         * looks as though older kernels can get into trouble with
         * this code, they end up stuck in balance_dirty_pages forever
         */
-       u64 num_dirty;
-       unsigned long thresh = 32 * 1024 * 1024;
+       int ret;
 
        if (current->flags & PF_MEMALLOC)
                return;
@@ -3466,9 +3484,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
        if (flush_delayed)
                btrfs_balance_delayed_items(root);
 
-       num_dirty = root->fs_info->dirty_metadata_bytes;
-
-       if (num_dirty > thresh) {
+       ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
+                                    BTRFS_DIRTY_METADATA_THRESH);
+       if (ret > 0) {
                balance_dirty_pages_ratelimited(
                                   root->fs_info->btree_inode->i_mapping);
        }
@@ -3518,7 +3536,8 @@ void btrfs_error_commit_super(struct btrfs_root *root)
        btrfs_cleanup_transaction(root);
 }
 
-static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
+static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
+                                            struct btrfs_root *root)
 {
        struct btrfs_inode *btrfs_inode;
        struct list_head splice;
@@ -3528,7 +3547,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
        mutex_lock(&root->fs_info->ordered_operations_mutex);
        spin_lock(&root->fs_info->ordered_extent_lock);
 
-       list_splice_init(&root->fs_info->ordered_operations, &splice);
+       list_splice_init(&t->ordered_operations, &splice);
        while (!list_empty(&splice)) {
                btrfs_inode = list_entry(splice.next, struct btrfs_inode,
                                         ordered_operations);
@@ -3544,35 +3563,16 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
 
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
 {
-       struct list_head splice;
        struct btrfs_ordered_extent *ordered;
-       struct inode *inode;
-
-       INIT_LIST_HEAD(&splice);
 
        spin_lock(&root->fs_info->ordered_extent_lock);
-
-       list_splice_init(&root->fs_info->ordered_extents, &splice);
-       while (!list_empty(&splice)) {
-               ordered = list_entry(splice.next, struct btrfs_ordered_extent,
-                                    root_extent_list);
-
-               list_del_init(&ordered->root_extent_list);
-               atomic_inc(&ordered->refs);
-
-               /* the inode may be getting freed (in sys_unlink path). */
-               inode = igrab(ordered->inode);
-
-               spin_unlock(&root->fs_info->ordered_extent_lock);
-               if (inode)
-                       iput(inode);
-
-               atomic_set(&ordered->refs, 1);
-               btrfs_put_ordered_extent(ordered);
-
-               spin_lock(&root->fs_info->ordered_extent_lock);
-       }
-
+       /*
+        * This will just short circuit the ordered completion stuff which will
+        * make sure the ordered extent gets properly cleaned up.
+        */
+       list_for_each_entry(ordered, &root->fs_info->ordered_extents,
+                           root_extent_list)
+               set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
        spin_unlock(&root->fs_info->ordered_extent_lock);
 }
 
@@ -3594,11 +3594,11 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
        }
 
        while ((node = rb_first(&delayed_refs->root)) != NULL) {
-               ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+               struct btrfs_delayed_ref_head *head = NULL;
 
+               ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
                atomic_set(&ref->refs, 1);
                if (btrfs_delayed_ref_is_head(ref)) {
-                       struct btrfs_delayed_ref_head *head;
 
                        head = btrfs_delayed_node_to_head(ref);
                        if (!mutex_trylock(&head->mutex)) {
@@ -3614,16 +3614,18 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                                continue;
                        }
 
-                       kfree(head->extent_op);
+                       btrfs_free_delayed_extent_op(head->extent_op);
                        delayed_refs->num_heads--;
                        if (list_empty(&head->cluster))
                                delayed_refs->num_heads_ready--;
                        list_del_init(&head->cluster);
                }
+
                ref->in_tree = 0;
                rb_erase(&ref->rb_node, &delayed_refs->root);
                delayed_refs->num_entries--;
-
+               if (head)
+                       mutex_unlock(&head->mutex);
                spin_unlock(&delayed_refs->lock);
                btrfs_put_delayed_ref(ref);
 
@@ -3671,6 +3673,8 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
                                    delalloc_inodes);
 
                list_del_init(&btrfs_inode->delalloc_inodes);
+               clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                         &btrfs_inode->runtime_flags);
 
                btrfs_invalidate_inodes(btrfs_inode->root);
        }
@@ -3823,10 +3827,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
 
        while (!list_empty(&list)) {
                t = list_entry(list.next, struct btrfs_transaction, list);
-               if (!t)
-                       break;
 
-               btrfs_destroy_ordered_operations(root);
+               btrfs_destroy_ordered_operations(t, root);
 
                btrfs_destroy_ordered_extents(root);
 
index 5a3327b8f90d557db144b360b3df4ec9bce9ff50..5cd44e239595f701c4681689780c82ce37f92aac 100644 (file)
@@ -72,8 +72,7 @@ enum {
        RESERVE_ALLOC_NO_ACCOUNT = 2,
 };
 
-static int update_block_group(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *root,
+static int update_block_group(struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
@@ -103,6 +102,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups);
 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
                                       u64 num_bytes, int reserve);
+static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+                              u64 num_bytes);
 
 static noinline int
 block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -162,6 +163,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
        rb_link_node(&block_group->cache_node, parent, p);
        rb_insert_color(&block_group->cache_node,
                        &info->block_group_cache_tree);
+
+       if (info->first_logical_byte > block_group->key.objectid)
+               info->first_logical_byte = block_group->key.objectid;
+
        spin_unlock(&info->block_group_cache_lock);
 
        return 0;
@@ -203,8 +208,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
                        break;
                }
        }
-       if (ret)
+       if (ret) {
                btrfs_get_block_group(ret);
+               if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
+                       info->first_logical_byte = ret->key.objectid;
+       }
        spin_unlock(&info->block_group_cache_lock);
 
        return ret;
@@ -468,8 +476,6 @@ out:
 }
 
 static int cache_block_group(struct btrfs_block_group_cache *cache,
-                            struct btrfs_trans_handle *trans,
-                            struct btrfs_root *root,
                             int load_cache_only)
 {
        DEFINE_WAIT(wait);
@@ -527,12 +533,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
        cache->cached = BTRFS_CACHE_FAST;
        spin_unlock(&cache->lock);
 
-       /*
-        * We can't do the read from on-disk cache during a commit since we need
-        * to have the normal tree locking.  Also if we are currently trying to
-        * allocate blocks for the tree root we can't do the fast caching since
-        * we likely hold important locks.
-        */
        if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
                ret = load_free_space_cache(fs_info, cache);
 
@@ -2143,7 +2143,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                                                      node->num_bytes);
                        }
                }
-               mutex_unlock(&head->mutex);
                return ret;
        }
 
@@ -2258,7 +2257,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                         * process of being added. Don't run this ref yet.
                         */
                        list_del_init(&locked_ref->cluster);
-                       mutex_unlock(&locked_ref->mutex);
+                       btrfs_delayed_ref_unlock(locked_ref);
                        locked_ref = NULL;
                        delayed_refs->num_heads_ready++;
                        spin_unlock(&delayed_refs->lock);
@@ -2285,7 +2284,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                        ref = &locked_ref->node;
 
                        if (extent_op && must_insert_reserved) {
-                               kfree(extent_op);
+                               btrfs_free_delayed_extent_op(extent_op);
                                extent_op = NULL;
                        }
 
@@ -2294,28 +2293,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 
                                ret = run_delayed_extent_op(trans, root,
                                                            ref, extent_op);
-                               kfree(extent_op);
+                               btrfs_free_delayed_extent_op(extent_op);
 
                                if (ret) {
-                                       list_del_init(&locked_ref->cluster);
-                                       mutex_unlock(&locked_ref->mutex);
-
-                                       printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
+                                       printk(KERN_DEBUG
+                                              "btrfs: run_delayed_extent_op "
+                                              "returned %d\n", ret);
                                        spin_lock(&delayed_refs->lock);
+                                       btrfs_delayed_ref_unlock(locked_ref);
                                        return ret;
                                }
 
                                goto next;
                        }
-
-                       list_del_init(&locked_ref->cluster);
-                       locked_ref = NULL;
                }
 
                ref->in_tree = 0;
                rb_erase(&ref->rb_node, &delayed_refs->root);
                delayed_refs->num_entries--;
-               if (locked_ref) {
+               if (!btrfs_delayed_ref_is_head(ref)) {
                        /*
                         * when we play the delayed ref, also correct the
                         * ref_mod on head
@@ -2337,20 +2333,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                ret = run_one_delayed_ref(trans, root, ref, extent_op,
                                          must_insert_reserved);
 
-               btrfs_put_delayed_ref(ref);
-               kfree(extent_op);
-               count++;
-
+               btrfs_free_delayed_extent_op(extent_op);
                if (ret) {
-                       if (locked_ref) {
-                               list_del_init(&locked_ref->cluster);
-                               mutex_unlock(&locked_ref->mutex);
-                       }
-                       printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
+                       btrfs_delayed_ref_unlock(locked_ref);
+                       btrfs_put_delayed_ref(ref);
+                       printk(KERN_DEBUG
+                              "btrfs: run_one_delayed_ref returned %d\n", ret);
                        spin_lock(&delayed_refs->lock);
                        return ret;
                }
 
+               /*
+                * If this node is a head, that means all the refs in this head
+                * have been dealt with, and we will pick the next head to deal
+                * with, so we must unlock the head and drop it from the cluster
+                * list before we release it.
+                */
+               if (btrfs_delayed_ref_is_head(ref)) {
+                       list_del_init(&locked_ref->cluster);
+                       btrfs_delayed_ref_unlock(locked_ref);
+                       locked_ref = NULL;
+               }
+               btrfs_put_delayed_ref(ref);
+               count++;
 next:
                cond_resched();
                spin_lock(&delayed_refs->lock);
@@ -2500,6 +2505,7 @@ again:
 
                ret = run_clustered_refs(trans, root, &cluster);
                if (ret < 0) {
+                       btrfs_release_ref_cluster(&cluster);
                        spin_unlock(&delayed_refs->lock);
                        btrfs_abort_transaction(trans, root, ret);
                        return ret;
@@ -2586,7 +2592,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_extent_op *extent_op;
        int ret;
 
-       extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+       extent_op = btrfs_alloc_delayed_extent_op();
        if (!extent_op)
                return -ENOMEM;
 
@@ -2598,7 +2604,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
        ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
                                          num_bytes, extent_op);
        if (ret)
-               kfree(extent_op);
+               btrfs_free_delayed_extent_op(extent_op);
        return ret;
 }
 
@@ -3223,12 +3229,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
        u64 extra_flags = chunk_to_extended(flags) &
                                BTRFS_EXTENDED_PROFILE_MASK;
 
+       write_seqlock(&fs_info->profiles_lock);
        if (flags & BTRFS_BLOCK_GROUP_DATA)
                fs_info->avail_data_alloc_bits |= extra_flags;
        if (flags & BTRFS_BLOCK_GROUP_METADATA)
                fs_info->avail_metadata_alloc_bits |= extra_flags;
        if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
                fs_info->avail_system_alloc_bits |= extra_flags;
+       write_sequnlock(&fs_info->profiles_lock);
 }
 
 /*
@@ -3320,12 +3328,18 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 
 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-       if (flags & BTRFS_BLOCK_GROUP_DATA)
-               flags |= root->fs_info->avail_data_alloc_bits;
-       else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-               flags |= root->fs_info->avail_system_alloc_bits;
-       else if (flags & BTRFS_BLOCK_GROUP_METADATA)
-               flags |= root->fs_info->avail_metadata_alloc_bits;
+       unsigned seq;
+
+       do {
+               seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+               if (flags & BTRFS_BLOCK_GROUP_DATA)
+                       flags |= root->fs_info->avail_data_alloc_bits;
+               else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+                       flags |= root->fs_info->avail_system_alloc_bits;
+               else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+                       flags |= root->fs_info->avail_metadata_alloc_bits;
+       } while (read_seqretry(&root->fs_info->profiles_lock, seq));
 
        return btrfs_reduce_alloc_profile(root, flags);
 }
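
Illustrative sketch (not btrfs code) of the seqlock pattern the two profile hunks switch to: the writer takes the seqlock around updates, and readers loop until they observe a stable sequence. The lock and variable names below are invented for the example.

#include <linux/seqlock.h>
#include <linux/types.h>

static DEFINE_SEQLOCK(profile_lock);	/* stands in for fs_info->profiles_lock */
static u64 avail_bits;			/* stands in for the avail_*_alloc_bits */

static void set_extra_bits(u64 extra)
{
	write_seqlock(&profile_lock);
	avail_bits |= extra;
	write_sequnlock(&profile_lock);
}

static u64 get_bits(void)
{
	unsigned int seq;
	u64 bits;

	do {
		seq = read_seqbegin(&profile_lock);
		bits = avail_bits;
	} while (read_seqretry(&profile_lock, seq));

	return bits;
}
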
@@ -3564,6 +3578,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
        int wait_for_alloc = 0;
        int ret = 0;
 
+       /* Don't re-enter if we're already allocating a chunk */
+       if (trans->allocating_chunk)
+               return -ENOSPC;
+
        space_info = __find_space_info(extent_root->fs_info, flags);
        if (!space_info) {
                ret = update_space_info(extent_root->fs_info, flags,
@@ -3606,6 +3624,8 @@ again:
                goto again;
        }
 
+       trans->allocating_chunk = true;
+
        /*
         * If we have mixed data/metadata chunks we want to make sure we keep
         * allocating mixed chunks instead of individual chunks.
@@ -3632,6 +3652,7 @@ again:
        check_system_chunk(trans, extent_root, flags);
 
        ret = btrfs_alloc_chunk(trans, extent_root, flags);
+       trans->allocating_chunk = false;
        if (ret < 0 && ret != -ENOSPC)
                goto out;
 
@@ -3653,13 +3674,31 @@ static int can_overcommit(struct btrfs_root *root,
                          struct btrfs_space_info *space_info, u64 bytes,
                          enum btrfs_reserve_flush_enum flush)
 {
+       struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
        u64 profile = btrfs_get_alloc_profile(root, 0);
+       u64 rsv_size = 0;
        u64 avail;
        u64 used;
+       u64 to_add;
 
        used = space_info->bytes_used + space_info->bytes_reserved +
-               space_info->bytes_pinned + space_info->bytes_readonly +
-               space_info->bytes_may_use;
+               space_info->bytes_pinned + space_info->bytes_readonly;
+
+       spin_lock(&global_rsv->lock);
+       rsv_size = global_rsv->size;
+       spin_unlock(&global_rsv->lock);
+
+       /*
+        * We only want to allow overcommitting if we have lots of actual space
+        * free, but if we don't have enough space to handle the global reserve
+        * then we could end up with a real ENOSPC problem when trying to
+        * allocate a chunk or some other important allocation.
+        */
+       rsv_size <<= 1;
+       if (used + rsv_size >= space_info->total_bytes)
+               return 0;
+
+       used += space_info->bytes_may_use;
 
        spin_lock(&root->fs_info->free_chunk_lock);
        avail = root->fs_info->free_chunk_space;
@@ -3674,27 +3713,38 @@ static int can_overcommit(struct btrfs_root *root,
                       BTRFS_BLOCK_GROUP_RAID10))
                avail >>= 1;
 
+       to_add = space_info->total_bytes;
+
        /*
         * If we aren't flushing all things, let us overcommit up to
         * 1/2th of the space. If we can flush, don't let us overcommit
         * too much, let it overcommit up to 1/8 of the space.
         */
        if (flush == BTRFS_RESERVE_FLUSH_ALL)
-               avail >>= 3;
+               to_add >>= 3;
        else
-               avail >>= 1;
+               to_add >>= 1;
 
-       if (used + bytes < space_info->total_bytes + avail)
+       /*
+        * Limit the overcommit to the amount of free space we could possibly
+        * allocate for chunks.
+        */
+       to_add = min(avail, to_add);
+
+       if (used + bytes < space_info->total_bytes + to_add)
                return 1;
        return 0;
 }
 
-static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
-                                              unsigned long nr_pages,
-                                              enum wb_reason reason)
+static inline int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
+                                                     unsigned long nr_pages,
+                                                     enum wb_reason reason)
 {
-       if (!writeback_in_progress(sb->s_bdi) &&
-           down_read_trylock(&sb->s_umount)) {
+       /* the flusher is dealing with the dirty inodes now. */
+       if (writeback_in_progress(sb->s_bdi))
+               return 1;
+
+       if (down_read_trylock(&sb->s_umount)) {
                writeback_inodes_sb_nr(sb, nr_pages, reason);
                up_read(&sb->s_umount);
                return 1;
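
The can_overcommit() hunk above first refuses to overcommit when used space plus twice the global reserve already reaches the space_info size, and then caps the overcommit at the smaller of the remaining chunk space and 1/8 (or 1/2) of the total. A userspace-style sketch of that arithmetic, with invented field names:

#include <stdbool.h>
#include <stdint.h>

struct space {
	uint64_t total;		/* space_info->total_bytes */
	uint64_t used;		/* used + reserved + pinned + readonly */
	uint64_t may_use;	/* bytes_may_use */
	uint64_t global_rsv;	/* global reserve size */
	uint64_t free_chunk;	/* unallocated device space, already profile-scaled */
};

static bool can_overcommit(const struct space *s, uint64_t bytes, bool flush_all)
{
	uint64_t used = s->used;
	uint64_t to_add;

	/* keep enough headroom for twice the global reserve */
	if (used + 2 * s->global_rsv >= s->total)
		return false;

	used += s->may_use;

	/* overcommit up to 1/8 of total when we can flush, 1/2 otherwise */
	to_add = flush_all ? s->total >> 3 : s->total >> 1;
	if (to_add > s->free_chunk)
		to_add = s->free_chunk;

	return used + bytes < s->total + to_add;
}
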
@@ -3703,6 +3753,28 @@ static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
        return 0;
 }
 
+void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
+                                 unsigned long nr_pages)
+{
+       struct super_block *sb = root->fs_info->sb;
+       int started;
+
+       /* If we cannot start writeback, just sync all the delalloc files. */
+       started = writeback_inodes_sb_nr_if_idle_safe(sb, nr_pages,
+                                                     WB_REASON_FS_FREE_SPACE);
+       if (!started) {
+               /*
+                * We needn't worry about the filesystem going from r/w to r/o even
+                * though we don't acquire the ->s_umount mutex, because the
+                * filesystem should guarantee that the delalloc inode list is empty
+                * after the filesystem becomes read-only (all dirty pages have been
+                * written to disk).
+                */
+               btrfs_start_delalloc_inodes(root, 0);
+               btrfs_wait_ordered_extents(root, 0);
+       }
+}
+
 /*
  * shrink metadata reservation for delalloc
  */
@@ -3724,7 +3796,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
        space_info = block_rsv->space_info;
 
        smp_mb();
-       delalloc_bytes = root->fs_info->delalloc_bytes;
+       delalloc_bytes = percpu_counter_sum_positive(
+                                               &root->fs_info->delalloc_bytes);
        if (delalloc_bytes == 0) {
                if (trans)
                        return;
@@ -3735,10 +3808,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
        while (delalloc_bytes && loops < 3) {
                max_reclaim = min(delalloc_bytes, to_reclaim);
                nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
-               writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
-                                                   nr_pages,
-                                                   WB_REASON_FS_FREE_SPACE);
-
+               btrfs_writeback_inodes_sb_nr(root, nr_pages);
                /*
                 * We need to wait for the async pages to actually start before
                 * we do anything.
@@ -3766,7 +3836,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
                                break;
                }
                smp_mb();
-               delalloc_bytes = root->fs_info->delalloc_bytes;
+               delalloc_bytes = percpu_counter_sum_positive(
+                                               &root->fs_info->delalloc_bytes);
        }
 }
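
Illustrative sketch (not the btrfs code) of the percpu_counter usage that fs_info->delalloc_bytes is converted to in this series: batched per-cpu adds on the hot path, and an exact non-negative sum only where an accurate total is needed. The counter name and batch value here are placeholders.

#include <linux/percpu_counter.h>

static struct percpu_counter delalloc_ctr;

static int delalloc_ctr_init(void)
{
	return percpu_counter_init(&delalloc_ctr, 0);
}

static void delalloc_add(s64 len, s32 batch)
{
	/* folds per-cpu deltas into the shared count only every 'batch' units */
	__percpu_counter_add(&delalloc_ctr, len, batch);
}

static s64 delalloc_total(void)
{
	/* walks all cpus; transiently negative results are clamped to 0 */
	return percpu_counter_sum_positive(&delalloc_ctr);
}
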
 
@@ -4030,6 +4101,15 @@ again:
                goto again;
 
 out:
+       if (ret == -ENOSPC &&
+           unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
+               struct btrfs_block_rsv *global_rsv =
+                       &root->fs_info->global_block_rsv;
+
+               if (block_rsv != global_rsv &&
+                   !block_rsv_use_bytes(global_rsv, orig_bytes))
+                       ret = 0;
+       }
        if (flushing) {
                spin_lock(&space_info->lock);
                space_info->flush = 0;
@@ -4668,7 +4748,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
        spin_lock(&BTRFS_I(inode)->lock);
        dropped = drop_outstanding_extent(inode);
 
-       to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+       if (num_bytes)
+               to_free = calc_csum_metadata_size(inode, num_bytes, 0);
        spin_unlock(&BTRFS_I(inode)->lock);
        if (dropped > 0)
                to_free += btrfs_calc_trans_metadata_size(root, dropped);
@@ -4735,8 +4816,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
        btrfs_free_reserved_data_space(inode, num_bytes);
 }
 
-static int update_block_group(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *root,
+static int update_block_group(struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc)
 {
        struct btrfs_block_group_cache *cache = NULL;
@@ -4773,7 +4853,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                 * space back to the block group, otherwise we will leak space.
                 */
                if (!alloc && cache->cached == BTRFS_CACHE_NO)
-                       cache_block_group(cache, trans, NULL, 1);
+                       cache_block_group(cache, 1);
 
                byte_in_group = bytenr - cache->key.objectid;
                WARN_ON(byte_in_group > cache->key.offset);
@@ -4823,6 +4903,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
        struct btrfs_block_group_cache *cache;
        u64 bytenr;
 
+       spin_lock(&root->fs_info->block_group_cache_lock);
+       bytenr = root->fs_info->first_logical_byte;
+       spin_unlock(&root->fs_info->block_group_cache_lock);
+
+       if (bytenr < (u64)-1)
+               return bytenr;
+
        cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
        if (!cache)
                return 0;
@@ -4873,8 +4960,7 @@ int btrfs_pin_extent(struct btrfs_root *root,
 /*
  * this function must be called within transaction
  */
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
-                                   struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
                                    u64 bytenr, u64 num_bytes)
 {
        struct btrfs_block_group_cache *cache;
@@ -4888,7 +4974,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
         * to one because the slow code to read in the free extents does check
         * the pinned extents.
         */
-       cache_block_group(cache, trans, root, 1);
+       cache_block_group(cache, 1);
 
        pin_down_extent(root, cache, bytenr, num_bytes, 0);
 
@@ -5285,7 +5371,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                        }
                }
 
-               ret = update_block_group(trans, root, bytenr, num_bytes, 0);
+               ret = update_block_group(root, bytenr, num_bytes, 0);
                if (ret) {
                        btrfs_abort_transaction(trans, extent_root, ret);
                        goto out;
@@ -5330,7 +5416,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
        if (head->extent_op) {
                if (!head->must_insert_reserved)
                        goto out;
-               kfree(head->extent_op);
+               btrfs_free_delayed_extent_op(head->extent_op);
                head->extent_op = NULL;
        }
 
@@ -5476,7 +5562,6 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
                                u64 num_bytes)
 {
        struct btrfs_caching_control *caching_ctl;
-       DEFINE_WAIT(wait);
 
        caching_ctl = get_caching_control(cache);
        if (!caching_ctl)
@@ -5493,7 +5578,6 @@ static noinline int
 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
 {
        struct btrfs_caching_control *caching_ctl;
-       DEFINE_WAIT(wait);
 
        caching_ctl = get_caching_control(cache);
        if (!caching_ctl)
@@ -5507,20 +5591,16 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
 
 int __get_raid_index(u64 flags)
 {
-       int index;
-
        if (flags & BTRFS_BLOCK_GROUP_RAID10)
-               index = 0;
+               return BTRFS_RAID_RAID10;
        else if (flags & BTRFS_BLOCK_GROUP_RAID1)
-               index = 1;
+               return BTRFS_RAID_RAID1;
        else if (flags & BTRFS_BLOCK_GROUP_DUP)
-               index = 2;
+               return BTRFS_RAID_DUP;
        else if (flags & BTRFS_BLOCK_GROUP_RAID0)
-               index = 3;
+               return BTRFS_RAID_RAID0;
        else
-               index = 4;
-
-       return index;
+               return BTRFS_RAID_SINGLE;
 }
 
 static int get_block_group_index(struct btrfs_block_group_cache *cache)
@@ -5678,8 +5758,7 @@ have_block_group:
                cached = block_group_cache_done(block_group);
                if (unlikely(!cached)) {
                        found_uncached_bg = true;
-                       ret = cache_block_group(block_group, trans,
-                                               orig_root, 0);
+                       ret = cache_block_group(block_group, 0);
                        BUG_ON(ret < 0);
                        ret = 0;
                }
@@ -6108,7 +6187,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_free_path(path);
 
-       ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
+       ret = update_block_group(root, ins->objectid, ins->offset, 1);
        if (ret) { /* -ENOENT, logic error */
                printk(KERN_ERR "btrfs update block group failed for %llu "
                       "%llu\n", (unsigned long long)ins->objectid,
@@ -6172,7 +6251,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);
 
-       ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
+       ret = update_block_group(root, ins->objectid, ins->offset, 1);
        if (ret) { /* -ENOENT, logic error */
                printk(KERN_ERR "btrfs update block group failed for %llu "
                       "%llu\n", (unsigned long long)ins->objectid,
@@ -6215,7 +6294,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
        u64 num_bytes = ins->offset;
 
        block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-       cache_block_group(block_group, trans, NULL, 0);
+       cache_block_group(block_group, 0);
        caching_ctl = get_caching_control(block_group);
 
        if (!caching_ctl) {
@@ -6329,12 +6408,14 @@ use_block_rsv(struct btrfs_trans_handle *trans,
        if (!ret)
                return block_rsv;
        if (ret && !block_rsv->failfast) {
-               static DEFINE_RATELIMIT_STATE(_rs,
-                               DEFAULT_RATELIMIT_INTERVAL,
-                               /*DEFAULT_RATELIMIT_BURST*/ 2);
-               if (__ratelimit(&_rs))
-                       WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
-                            ret);
+               if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+                       static DEFINE_RATELIMIT_STATE(_rs,
+                                       DEFAULT_RATELIMIT_INTERVAL * 10,
+                                       /*DEFAULT_RATELIMIT_BURST*/ 1);
+                       if (__ratelimit(&_rs))
+                               WARN(1, KERN_DEBUG
+                                       "btrfs: block rsv returned %d\n", ret);
+               }
                ret = reserve_metadata_bytes(root, block_rsv, blocksize,
                                             BTRFS_RESERVE_NO_FLUSH);
                if (!ret) {
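
Illustrative sketch of the rate-limited warning pattern the hunk above moves to, now gated behind a debug option so a failing reservation only logs occasionally. The flag below is a placeholder standing in for btrfs_test_opt(root, ENOSPC_DEBUG).

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/ratelimit.h>

static bool enospc_debug;	/* placeholder for the ENOSPC_DEBUG mount option */

static void warn_block_rsv(int ret)
{
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL * 10, 1);

	if (enospc_debug && __ratelimit(&rs))
		WARN(1, KERN_DEBUG "block rsv returned %d\n", ret);
}
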
@@ -6400,7 +6481,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 
        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
                struct btrfs_delayed_extent_op *extent_op;
-               extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+               extent_op = btrfs_alloc_delayed_extent_op();
                BUG_ON(!extent_op); /* -ENOMEM */
                if (key)
                        memcpy(&extent_op->key, key, sizeof(extent_op->key));
@@ -7481,16 +7562,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
                index = get_block_group_index(block_group);
        }
 
-       if (index == 0) {
+       if (index == BTRFS_RAID_RAID10) {
                dev_min = 4;
                /* Divide by 2 */
                min_free >>= 1;
-       } else if (index == 1) {
+       } else if (index == BTRFS_RAID_RAID1) {
                dev_min = 2;
-       } else if (index == 2) {
+       } else if (index == BTRFS_RAID_DUP) {
                /* Multiply by 2 */
                min_free <<= 1;
-       } else if (index == 3) {
+       } else if (index == BTRFS_RAID_RAID0) {
                dev_min = fs_devices->rw_devices;
                do_div(min_free, dev_min);
        }
@@ -7651,11 +7732,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
                space_info = list_entry(info->space_info.next,
                                        struct btrfs_space_info,
                                        list);
-               if (space_info->bytes_pinned > 0 ||
-                   space_info->bytes_reserved > 0 ||
-                   space_info->bytes_may_use > 0) {
-                       WARN_ON(1);
-                       dump_space_info(space_info, 0, 0);
+               if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
+                       if (space_info->bytes_pinned > 0 ||
+                           space_info->bytes_reserved > 0 ||
+                           space_info->bytes_may_use > 0) {
+                               WARN_ON(1);
+                               dump_space_info(space_info, 0, 0);
+                       }
                }
                list_del(&space_info->list);
                kfree(space_info);
@@ -7932,12 +8015,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
        u64 extra_flags = chunk_to_extended(flags) &
                                BTRFS_EXTENDED_PROFILE_MASK;
 
+       write_seqlock(&fs_info->profiles_lock);
        if (flags & BTRFS_BLOCK_GROUP_DATA)
                fs_info->avail_data_alloc_bits &= ~extra_flags;
        if (flags & BTRFS_BLOCK_GROUP_METADATA)
                fs_info->avail_metadata_alloc_bits &= ~extra_flags;
        if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
                fs_info->avail_system_alloc_bits &= ~extra_flags;
+       write_sequnlock(&fs_info->profiles_lock);
 }
 
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
@@ -8036,6 +8121,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        spin_lock(&root->fs_info->block_group_cache_lock);
        rb_erase(&block_group->cache_node,
                 &root->fs_info->block_group_cache_tree);
+
+       if (root->fs_info->first_logical_byte == block_group->key.objectid)
+               root->fs_info->first_logical_byte = (u64)-1;
        spin_unlock(&root->fs_info->block_group_cache_lock);
 
        down_write(&block_group->space_info->groups_sem);
@@ -8158,7 +8246,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
 
                if (end - start >= range->minlen) {
                        if (!block_group_cache_done(cache)) {
-                               ret = cache_block_group(cache, NULL, root, 0);
+                               ret = cache_block_group(cache, 0);
                                if (!ret)
                                        wait_block_group_cache_done(cache);
                        }
index 1b319df29eeee30904bdaa165c1a2fc2b5a49103..5c00d6aeae755a45f28f5a81fab6a4c863444a7d 100644 (file)
@@ -1834,7 +1834,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
  */
 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
 {
-       u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+       u64 start = page_offset(page);
        u64 end = start + PAGE_CACHE_SIZE - 1;
        if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
                SetPageUptodate(page);
@@ -1846,7 +1846,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
  */
 static void check_page_locked(struct extent_io_tree *tree, struct page *page)
 {
-       u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+       u64 start = page_offset(page);
        u64 end = start + PAGE_CACHE_SIZE - 1;
        if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
                unlock_page(page);
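
The repeated change in this file replaces the open-coded "(u64)page->index << PAGE_CACHE_SHIFT" with the page_offset() helper. A trivial sketch of the equivalence (the helper is real, the wrapper below is only for illustration):

#include <linux/pagemap.h>

static loff_t page_start(struct page *page)
{
	/* identical to ((u64)page->index << PAGE_CACHE_SHIFT) */
	return page_offset(page);
}
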
@@ -1960,7 +1960,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
                return -EIO;
        }
        bio->bi_bdev = dev->bdev;
-       bio_add_page(bio, page, length, start-page_offset(page));
+       bio_add_page(bio, page, length, start - page_offset(page));
        btrfsic_submit_bio(WRITE_SYNC, bio);
        wait_for_completion(&compl);
 
@@ -2293,8 +2293,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
                struct page *page = bvec->bv_page;
                tree = &BTRFS_I(page->mapping->host)->io_tree;
 
-               start = ((u64)page->index << PAGE_CACHE_SHIFT) +
-                        bvec->bv_offset;
+               start = page_offset(page) + bvec->bv_offset;
                end = start + bvec->bv_len - 1;
 
                if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
@@ -2353,8 +2352,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                         (long int)bio->bi_bdev);
                tree = &BTRFS_I(page->mapping->host)->io_tree;
 
-               start = ((u64)page->index << PAGE_CACHE_SHIFT) +
-                       bvec->bv_offset;
+               start = page_offset(page) + bvec->bv_offset;
                end = start + bvec->bv_len - 1;
 
                if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
@@ -2471,7 +2469,7 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
        struct extent_io_tree *tree = bio->bi_private;
        u64 start;
 
-       start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
+       start = page_offset(page) + bvec->bv_offset;
 
        bio->bi_private = NULL;
 
@@ -2595,7 +2593,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                                   unsigned long *bio_flags)
 {
        struct inode *inode = page->mapping->host;
-       u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+       u64 start = page_offset(page);
        u64 page_end = start + PAGE_CACHE_SIZE - 1;
        u64 end;
        u64 cur = start;
@@ -2648,6 +2646,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                }
        }
        while (cur <= end) {
+               unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+
                if (cur >= last_byte) {
                        char *userpage;
                        struct extent_state *cached = NULL;
@@ -2735,26 +2735,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                        continue;
                }
 
-               ret = 0;
-               if (tree->ops && tree->ops->readpage_io_hook) {
-                       ret = tree->ops->readpage_io_hook(page, cur,
-                                                         cur + iosize - 1);
-               }
-               if (!ret) {
-                       unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
-                       pnr -= page->index;
-                       ret = submit_extent_page(READ, tree, page,
+               pnr -= page->index;
+               ret = submit_extent_page(READ, tree, page,
                                         sector, disk_io_size, pg_offset,
                                         bdev, bio, pnr,
                                         end_bio_extent_readpage, mirror_num,
                                         *bio_flags,
                                         this_bio_flag);
-                       if (!ret) {
-                               nr++;
-                               *bio_flags = this_bio_flag;
-                       }
-               }
-               if (ret) {
+               if (!ret) {
+                       nr++;
+                       *bio_flags = this_bio_flag;
+               } else {
                        SetPageError(page);
                        unlock_extent(tree, cur, cur + iosize - 1);
                }
@@ -2806,7 +2797,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        struct inode *inode = page->mapping->host;
        struct extent_page_data *epd = data;
        struct extent_io_tree *tree = epd->tree;
-       u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+       u64 start = page_offset(page);
        u64 delalloc_start;
        u64 page_end = start + PAGE_CACHE_SIZE - 1;
        u64 end;
@@ -3124,12 +3115,9 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
                set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
                spin_unlock(&eb->refs_lock);
                btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-               spin_lock(&fs_info->delalloc_lock);
-               if (fs_info->dirty_metadata_bytes >= eb->len)
-                       fs_info->dirty_metadata_bytes -= eb->len;
-               else
-                       WARN_ON(1);
-               spin_unlock(&fs_info->delalloc_lock);
+               __percpu_counter_add(&fs_info->dirty_metadata_bytes,
+                                    -eb->len,
+                                    fs_info->dirty_metadata_batch);
                ret = 1;
        } else {
                spin_unlock(&eb->refs_lock);
@@ -3446,15 +3434,9 @@ retry:
                         * swizzled back from swapper_space to tmpfs file
                         * mapping
                         */
-                       if (tree->ops &&
-                           tree->ops->write_cache_pages_lock_hook) {
-                               tree->ops->write_cache_pages_lock_hook(page,
-                                                              data, flush_fn);
-                       } else {
-                               if (!trylock_page(page)) {
-                                       flush_fn(data);
-                                       lock_page(page);
-                               }
+                       if (!trylock_page(page)) {
+                               flush_fn(data);
+                               lock_page(page);
                        }
 
                        if (unlikely(page->mapping != mapping)) {
@@ -3674,7 +3656,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
                          struct page *page, unsigned long offset)
 {
        struct extent_state *cached_state = NULL;
-       u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
+       u64 start = page_offset(page);
        u64 end = start + PAGE_CACHE_SIZE - 1;
        size_t blocksize = page->mapping->host->i_sb->s_blocksize;
 
@@ -3700,7 +3682,7 @@ int try_release_extent_state(struct extent_map_tree *map,
                             struct extent_io_tree *tree, struct page *page,
                             gfp_t mask)
 {
-       u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+       u64 start = page_offset(page);
        u64 end = start + PAGE_CACHE_SIZE - 1;
        int ret = 1;
 
@@ -3739,7 +3721,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
                               gfp_t mask)
 {
        struct extent_map *em;
-       u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+       u64 start = page_offset(page);
        u64 end = start + PAGE_CACHE_SIZE - 1;
 
        if ((mask & __GFP_WAIT) &&
index 2eacfabd32632e90056e76cc5a678ecce6c6504a..ff182322d112bfbe8827710dec4ddf464670cda5 100644 (file)
@@ -75,7 +75,6 @@ struct extent_io_ops {
        int (*merge_bio_hook)(struct page *page, unsigned long offset,
                              size_t size, struct bio *bio,
                              unsigned long bio_flags);
-       int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
        int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
        int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
                                    struct extent_state *state, int mirror);
@@ -90,8 +89,6 @@ struct extent_io_ops {
                                  struct extent_state *other);
        void (*split_extent_hook)(struct inode *inode,
                                  struct extent_state *orig, u64 split);
-       int (*write_cache_pages_lock_hook)(struct page *page, void *data,
-                                          void (*flush_fn)(void *));
 };
 
 struct extent_io_tree {
index 94aa53b387213bbfc13181163368e5e168574d77..ec160202be3e38057c26498558e2f309af7859b7 100644 (file)
@@ -684,6 +684,24 @@ out:
        return ret;
 }
 
+static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums,
+                                struct btrfs_sector_sum *sector_sum,
+                                u64 total_bytes, u64 sectorsize)
+{
+       u64 tmp = sectorsize;
+       u64 next_sector = sector_sum->bytenr;
+       struct btrfs_sector_sum *next = sector_sum + 1;
+
+       while ((tmp + total_bytes) < sums->len) {
+               if (next_sector + sectorsize != next->bytenr)
+                       break;
+               tmp += sectorsize;
+               next_sector = next->bytenr;
+               next++;
+       }
+       return tmp;
+}
+
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct btrfs_ordered_sum *sums)
@@ -789,20 +807,32 @@ again:
                goto insert;
        }
 
-       if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
+       if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) /
            csum_size) {
-               u32 diff = (csum_offset + 1) * csum_size;
+               int extend_nr;
+               u64 tmp;
+               u32 diff;
+               u32 free_space;
 
-               /*
-                * is the item big enough already?  we dropped our lock
-                * before and need to recheck
-                */
-               if (diff < btrfs_item_size_nr(leaf, path->slots[0]))
-                       goto csum;
+               if (btrfs_leaf_free_space(root, leaf) <
+                                sizeof(struct btrfs_item) + csum_size * 2)
+                       goto insert;
+
+               free_space = btrfs_leaf_free_space(root, leaf) -
+                                        sizeof(struct btrfs_item) - csum_size;
+               tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
+                                           root->sectorsize);
+               tmp >>= root->fs_info->sb->s_blocksize_bits;
+               WARN_ON(tmp < 1);
+
+               extend_nr = max_t(int, 1, (int)tmp);
+               diff = (csum_offset + extend_nr) * csum_size;
+               diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size);
 
                diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
-               if (diff != csum_size)
-                       goto insert;
+               diff = min(free_space, diff);
+               diff /= csum_size;
+               diff *= csum_size;
 
                btrfs_extend_item(trans, root, path, diff);
                goto csum;
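
A userspace-style sketch of the item-extension sizing introduced above: grow the existing checksum item by as many following contiguous sectors as possible, bounded by the maximum csum item size and the leaf's free space, and rounded down to whole checksums. It assumes, as the caller does, that the existing item ends exactly on a checksum boundary; all names are local to the example.

#include <stdint.h>

static uint32_t csum_extend_bytes(uint32_t csum_offset, uint32_t item_size,
				  uint32_t free_space, uint32_t max_csums,
				  uint32_t extend_nr, uint32_t csum_size)
{
	uint32_t diff = (csum_offset + extend_nr) * csum_size;

	/* never grow past the largest allowed checksum item */
	if (diff > max_csums * csum_size)
		diff = max_csums * csum_size;

	/* only the part beyond the current item has to be added */
	diff -= item_size;

	/* and it has to fit in the leaf... */
	if (diff > free_space)
		diff = free_space;

	/* ...as a whole number of checksums */
	diff -= diff % csum_size;
	return diff;
}
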
@@ -812,19 +842,14 @@ insert:
        btrfs_release_path(path);
        csum_offset = 0;
        if (found_next) {
-               u64 tmp = total_bytes + root->sectorsize;
-               u64 next_sector = sector_sum->bytenr;
-               struct btrfs_sector_sum *next = sector_sum + 1;
+               u64 tmp;
 
-               while (tmp < sums->len) {
-                       if (next_sector + root->sectorsize != next->bytenr)
-                               break;
-                       tmp += root->sectorsize;
-                       next_sector = next->bytenr;
-                       next++;
-               }
-               tmp = min(tmp, next_offset - file_key.offset);
+               tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
+                                           root->sectorsize);
                tmp >>= root->fs_info->sb->s_blocksize_bits;
+               tmp = min(tmp, (next_offset - file_key.offset) >>
+                                        root->fs_info->sb->s_blocksize_bits);
+
                tmp = max((u64)1, tmp);
                tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
                ins_size = csum_size * tmp;
index aeb84469d2c4c0621b002084617578f7ac5f49b1..9f67e623206d90c7ef279a6291b116f5211007b4 100644 (file)
 #include <linux/statfs.h>
 #include <linux/compat.h>
 #include <linux/slab.h>
+#include <linux/btrfs.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
-#include "ioctl.h"
 #include "print-tree.h"
 #include "tree-log.h"
 #include "locking.h"
@@ -1544,7 +1544,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
         * although we have opened a file as writable, we have
         * to stop this write operation to ensure FS consistency.
         */
-       if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+       if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
                mutex_unlock(&inode->i_mutex);
                err = -EROFS;
                goto out;
@@ -1627,7 +1627,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
         */
        if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
                               &BTRFS_I(inode)->runtime_flags)) {
-               btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+               struct btrfs_trans_handle *trans;
+               struct btrfs_root *root = BTRFS_I(inode)->root;
+
+               /*
+                * We need to block on a committing transaction to keep us from
+                * throwing an ordered operation onto the list and causing
+                * something like sync to deadlock trying to flush out this
+                * inode.
+                */
+               trans = btrfs_start_transaction(root, 0);
+               if (IS_ERR(trans))
+                       return PTR_ERR(trans);
+               btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
+               btrfs_end_transaction(trans, root);
                if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
                        filemap_flush(inode->i_mapping);
        }
@@ -1654,16 +1667,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
        struct btrfs_trans_handle *trans;
+       bool full_sync = 0;
 
        trace_btrfs_sync_file(file, datasync);
 
        /*
         * We write the dirty pages in the range and wait until they complete
         * out of the ->i_mutex. If so, we can flush the dirty pages by
-        * multi-task, and make the performance up.
+        * multi-task, and make the performance up.  See
+        * btrfs_wait_ordered_range for an explanation of the ASYNC check.
         */
        atomic_inc(&BTRFS_I(inode)->sync_writers);
-       ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+       ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+       if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+                            &BTRFS_I(inode)->runtime_flags))
+               ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
        atomic_dec(&BTRFS_I(inode)->sync_writers);
        if (ret)
                return ret;
@@ -1675,7 +1693,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
         * range being left.
         */
        atomic_inc(&root->log_batch);
-       btrfs_wait_ordered_range(inode, start, end - start + 1);
+       full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                            &BTRFS_I(inode)->runtime_flags);
+       if (full_sync)
+               btrfs_wait_ordered_range(inode, start, end - start + 1);
        atomic_inc(&root->log_batch);
 
        /*
@@ -1742,13 +1763,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
        if (ret != BTRFS_NO_LOG_SYNC) {
                if (ret > 0) {
+                       /*
+                        * If we didn't already wait for ordered extents we need
+                        * to do that now.
+                        */
+                       if (!full_sync)
+                               btrfs_wait_ordered_range(inode, start,
+                                                        end - start + 1);
                        ret = btrfs_commit_transaction(trans, root);
                } else {
                        ret = btrfs_sync_log(trans, root);
-                       if (ret == 0)
+                       if (ret == 0) {
                                ret = btrfs_end_transaction(trans, root);
-                       else
+                       } else {
+                               if (!full_sync)
+                                       btrfs_wait_ordered_range(inode, start,
+                                                                end -
+                                                                start + 1);
                                ret = btrfs_commit_transaction(trans, root);
+                       }
                }
        } else {
                ret = btrfs_end_transaction(trans, root);
index 0be7a8742a43bb502c9b9294661baaef6e5514e1..c8090f18c217e03bda557ef9f90efb21aa45f121 100644 (file)
@@ -1356,6 +1356,8 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
        u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
        int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
 
+       max_bitmaps = max(max_bitmaps, 1);
+
        BUG_ON(ctl->total_bitmaps > max_bitmaps);
 
        /*
@@ -1636,10 +1638,14 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
        }
 
        /*
-        * some block groups are so tiny they can't be enveloped by a bitmap, so
-        * don't even bother to create a bitmap for this
+        * The original block groups from mkfs can be really small, like 8
+        * megabytes, so don't bother with a bitmap for those entries.  However
+        * some block groups can be smaller than what a bitmap would cover but
+        * are still large enough that they could overflow the 32k memory limit,
+        * so still allow those block groups to have a bitmap entry.
         */
-       if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset)
+       if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
                return false;
 
        return true;
index cc93b23ca3520c9ad7bf2919e77f61cf400c2e4c..1aa98be54ce02aefbc105ba5f8579d824a17aad6 100644 (file)
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
 #include <linux/mount.h>
+#include <linux/btrfs.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
-#include "ioctl.h"
 #include "print-tree.h"
 #include "ordered-data.h"
 #include "xattr.h"
@@ -608,7 +608,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
        if (list_empty(&async_cow->extents))
                return 0;
 
-
+again:
        while (!list_empty(&async_cow->extents)) {
                async_extent = list_entry(async_cow->extents.next,
                                          struct async_extent, list);
@@ -648,6 +648,8 @@ retry:
                                                  async_extent->ram_size - 1,
                                                  btrfs_get_extent,
                                                  WB_SYNC_ALL);
+                       else if (ret)
+                               unlock_page(async_cow->locked_page);
                        kfree(async_extent);
                        cond_resched();
                        continue;
@@ -672,6 +674,7 @@ retry:
 
                if (ret) {
                        int i;
+
                        for (i = 0; i < async_extent->nr_pages; i++) {
                                WARN_ON(async_extent->pages[i]->mapping);
                                page_cache_release(async_extent->pages[i]);
@@ -679,12 +682,10 @@ retry:
                        kfree(async_extent->pages);
                        async_extent->nr_pages = 0;
                        async_extent->pages = NULL;
-                       unlock_extent(io_tree, async_extent->start,
-                                     async_extent->start +
-                                     async_extent->ram_size - 1);
+
                        if (ret == -ENOSPC)
                                goto retry;
-                       goto out_free; /* JDM: Requeue? */
+                       goto out_free;
                }
 
                /*
@@ -696,10 +697,13 @@ retry:
                                        async_extent->ram_size - 1, 0);
 
                em = alloc_extent_map();
-               BUG_ON(!em); /* -ENOMEM */
+               if (!em)
+                       goto out_free_reserve;
                em->start = async_extent->start;
                em->len = async_extent->ram_size;
                em->orig_start = em->start;
+               em->mod_start = em->start;
+               em->mod_len = em->len;
 
                em->block_start = ins.objectid;
                em->block_len = ins.offset;
@@ -726,6 +730,9 @@ retry:
                                                async_extent->ram_size - 1, 0);
                }
 
+               if (ret)
+                       goto out_free_reserve;
+
                ret = btrfs_add_ordered_extent_compress(inode,
                                                async_extent->start,
                                                ins.objectid,
@@ -733,7 +740,8 @@ retry:
                                                ins.offset,
                                                BTRFS_ORDERED_COMPRESSED,
                                                async_extent->compress_type);
-               BUG_ON(ret); /* -ENOMEM */
+               if (ret)
+                       goto out_free_reserve;
 
                /*
                 * clear dirty, set writeback and unlock the pages.
@@ -754,18 +762,30 @@ retry:
                                    ins.objectid,
                                    ins.offset, async_extent->pages,
                                    async_extent->nr_pages);
-
-               BUG_ON(ret); /* -ENOMEM */
                alloc_hint = ins.objectid + ins.offset;
                kfree(async_extent);
+               if (ret)
+                       goto out;
                cond_resched();
        }
        ret = 0;
 out:
        return ret;
+out_free_reserve:
+       btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
 out_free:
+       extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+                                    async_extent->start,
+                                    async_extent->start +
+                                    async_extent->ram_size - 1,
+                                    NULL, EXTENT_CLEAR_UNLOCK_PAGE |
+                                    EXTENT_CLEAR_UNLOCK |
+                                    EXTENT_CLEAR_DELALLOC |
+                                    EXTENT_CLEAR_DIRTY |
+                                    EXTENT_SET_WRITEBACK |
+                                    EXTENT_END_WRITEBACK);
        kfree(async_extent);
-       goto out;
+       goto again;
 }
 
 static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
@@ -892,6 +912,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
                em->orig_start = em->start;
                ram_size = ins.offset;
                em->len = ins.offset;
+               em->mod_start = em->start;
+               em->mod_len = em->len;
 
                em->block_start = ins.objectid;
                em->block_len = ins.offset;
@@ -1338,6 +1360,8 @@ out_check:
                        em->block_start = disk_bytenr;
                        em->orig_block_len = disk_num_bytes;
                        em->bdev = root->fs_info->fs_devices->latest_bdev;
+                       em->mod_start = em->start;
+                       em->mod_len = em->len;
                        set_bit(EXTENT_FLAG_PINNED, &em->flags);
                        set_bit(EXTENT_FLAG_FILLING, &em->flags);
                        em->generation = -1;
@@ -1508,14 +1532,22 @@ static void btrfs_set_bit_hook(struct inode *inode,
                        spin_unlock(&BTRFS_I(inode)->lock);
                }
 
-               spin_lock(&root->fs_info->delalloc_lock);
+               __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
+                                    root->fs_info->delalloc_batch);
+               spin_lock(&BTRFS_I(inode)->lock);
                BTRFS_I(inode)->delalloc_bytes += len;
-               root->fs_info->delalloc_bytes += len;
-               if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-                       list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
-                                     &root->fs_info->delalloc_inodes);
+               if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                                        &BTRFS_I(inode)->runtime_flags)) {
+                       spin_lock(&root->fs_info->delalloc_lock);
+                       if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+                               list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+                                             &root->fs_info->delalloc_inodes);
+                               set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                                       &BTRFS_I(inode)->runtime_flags);
+                       }
+                       spin_unlock(&root->fs_info->delalloc_lock);
                }
-               spin_unlock(&root->fs_info->delalloc_lock);
+               spin_unlock(&BTRFS_I(inode)->lock);
        }
 }
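
Illustrative sketch (invented names) of the locking pattern the delalloc hooks adopt above: a per-object flag bit, checked under the object's own lock, so the global list lock is only taken when the object actually has to be added to or removed from the shared list.

#include <linux/bitops.h>
#include <linux/list.h>
#include <linux/spinlock.h>

#define OBJ_ON_LIST 0	/* stand-in for BTRFS_INODE_IN_DELALLOC_LIST */

struct obj {
	spinlock_t lock;
	unsigned long flags;
	struct list_head entry;
};

static DEFINE_SPINLOCK(global_lock);
static LIST_HEAD(global_list);

static void obj_mark_dirty(struct obj *o)
{
	spin_lock(&o->lock);
	if (!test_bit(OBJ_ON_LIST, &o->flags)) {
		/* only now pay for the global lock */
		spin_lock(&global_lock);
		if (list_empty(&o->entry)) {
			list_add_tail(&o->entry, &global_list);
			set_bit(OBJ_ON_LIST, &o->flags);
		}
		spin_unlock(&global_lock);
	}
	spin_unlock(&o->lock);
}
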
 
@@ -1550,15 +1582,22 @@ static void btrfs_clear_bit_hook(struct inode *inode,
                    && do_list)
                        btrfs_free_reserved_data_space(inode, len);
 
-               spin_lock(&root->fs_info->delalloc_lock);
-               root->fs_info->delalloc_bytes -= len;
+               __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
+                                    root->fs_info->delalloc_batch);
+               spin_lock(&BTRFS_I(inode)->lock);
                BTRFS_I(inode)->delalloc_bytes -= len;
-
                if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
-                   !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-                       list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+                   test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                            &BTRFS_I(inode)->runtime_flags)) {
+                       spin_lock(&root->fs_info->delalloc_lock);
+                       if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+                               list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+                               clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                                         &BTRFS_I(inode)->runtime_flags);
+                       }
+                       spin_unlock(&root->fs_info->delalloc_lock);
                }
-               spin_unlock(&root->fs_info->delalloc_lock);
+               spin_unlock(&BTRFS_I(inode)->lock);
        }
 }
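
The two hunks above move the fs-wide delalloc accounting off delalloc_lock and onto a per-cpu counter updated with a batch (delalloc_batch), so hot-path updates stay CPU-local and only spill into the shared count once the local delta exceeds the batch. A minimal sketch of that pattern, assuming the percpu_counter API as used in this tree (init/add/sum names are real; the surrounding variables are illustrative only):

	struct percpu_counter delalloc_bytes;
	s64 total, len = 4096;                               /* illustrative values */
	s32 batch = 128;                                     /* illustrative batch */

	percpu_counter_init(&delalloc_bytes, 0);             /* at mount time */
	__percpu_counter_add(&delalloc_bytes, len, batch);   /* hot path, stays per-cpu */
	total = percpu_counter_sum(&delalloc_bytes);         /* slow path, exact sum */
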
 
@@ -2001,11 +2040,23 @@ out:
        if (trans)
                btrfs_end_transaction(trans, root);
 
-       if (ret)
+       if (ret) {
                clear_extent_uptodate(io_tree, ordered_extent->file_offset,
                                      ordered_extent->file_offset +
                                      ordered_extent->len - 1, NULL, GFP_NOFS);
 
+               /*
+                * If the ordered extent had an IOERR or something else went
+                * wrong we need to return the space for this ordered extent
+                * back to the allocator.
+                */
+               if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
+                   !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
+                       btrfs_free_reserved_extent(root, ordered_extent->start,
+                                                  ordered_extent->disk_len);
+       }
+
+
        /*
         * This needs to be done to make sure anybody waiting knows we are done
         * updating everything for this ordered extent.
@@ -2062,7 +2113,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
                               struct extent_state *state, int mirror)
 {
-       size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
+       size_t offset = start - page_offset(page);
        struct inode *inode = page->mapping->host;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        char *kaddr;
@@ -2167,11 +2218,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
        }
 }
 
-enum btrfs_orphan_cleanup_state {
-       ORPHAN_CLEANUP_STARTED  = 1,
-       ORPHAN_CLEANUP_DONE     = 2,
-};
-
 /*
  * This is called in transaction commit time. If there are no orphan
  * files in the subvolume, it removes orphan item and frees block_rsv
@@ -2469,6 +2515,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                 */
                set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
                        &BTRFS_I(inode)->runtime_flags);
+               atomic_inc(&root->orphan_inodes);
 
                /* if we have links, this was a truncate, lets do that */
                if (inode->i_nlink) {
@@ -2491,6 +2538,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                                goto out;
 
                        ret = btrfs_truncate(inode);
+                       if (ret)
+                               btrfs_orphan_del(NULL, inode);
                } else {
                        nr_unlink++;
                }
@@ -2709,34 +2758,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
                            struct btrfs_inode_item *item,
                            struct inode *inode)
 {
-       btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
-       btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
-       btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
-       btrfs_set_inode_mode(leaf, item, inode->i_mode);
-       btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+       struct btrfs_map_token token;
+
+       btrfs_init_map_token(&token);
 
-       btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
-                              inode->i_atime.tv_sec);
-       btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
-                               inode->i_atime.tv_nsec);
+       btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
+       btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
+       btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
+                                  &token);
+       btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
+       btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
 
-       btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
-                              inode->i_mtime.tv_sec);
-       btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
-                               inode->i_mtime.tv_nsec);
+       btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+                                    inode->i_atime.tv_sec, &token);
+       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+                                     inode->i_atime.tv_nsec, &token);
 
-       btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
-                              inode->i_ctime.tv_sec);
-       btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
-                               inode->i_ctime.tv_nsec);
+       btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+                                    inode->i_mtime.tv_sec, &token);
+       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+                                     inode->i_mtime.tv_nsec, &token);
 
-       btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
-       btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
-       btrfs_set_inode_sequence(leaf, item, inode->i_version);
-       btrfs_set_inode_transid(leaf, item, trans->transid);
-       btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
-       btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
-       btrfs_set_inode_block_group(leaf, item, 0);
+       btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+                                    inode->i_ctime.tv_sec, &token);
+       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+                                     inode->i_ctime.tv_nsec, &token);
+
+       btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
+                                    &token);
+       btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
+                                        &token);
+       btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+       btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
+       btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
+       btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
+       btrfs_set_token_inode_block_group(leaf, item, 0, &token);
 }
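
The conversion above is purely a performance change: a btrfs_map_token caches the last extent_buffer page that was mapped, so the run of btrfs_set_token_*() calls filling one leaf item reuses that mapping instead of re-deriving it for every field. A hedged sketch of the pattern (accessor names as in the hunk; leaf, item, uid and gid stand in for the real arguments):

	struct btrfs_map_token token;

	btrfs_init_map_token(&token);                          /* nothing cached yet */
	btrfs_set_token_inode_uid(leaf, item, uid, &token);    /* maps the page */
	btrfs_set_token_inode_gid(leaf, item, gid, &token);    /* reuses the mapping */
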
 
 /*
@@ -3832,6 +3888,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 
                /* we don't support swapfiles, so vmtruncate shouldn't fail */
                truncate_setsize(inode, newsize);
+
+               /* Disable nonlocked read DIO to avoid the endless truncate */
+               btrfs_inode_block_unlocked_dio(inode);
+               inode_dio_wait(inode);
+               btrfs_inode_resume_unlocked_dio(inode);
+
                ret = btrfs_truncate(inode);
                if (ret && inode->i_nlink)
                        btrfs_orphan_del(NULL, inode);
@@ -3904,6 +3966,12 @@ void btrfs_evict_inode(struct inode *inode)
                goto no_delete;
        }
 
+       ret = btrfs_commit_inode_delayed_inode(inode);
+       if (ret) {
+               btrfs_orphan_del(NULL, inode);
+               goto no_delete;
+       }
+
        rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
        if (!rsv) {
                btrfs_orphan_del(NULL, inode);
@@ -3941,7 +4009,7 @@ void btrfs_evict_inode(struct inode *inode)
                        goto no_delete;
                }
 
-               trans = btrfs_start_transaction_lflush(root, 1);
+               trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        btrfs_orphan_del(NULL, inode);
                        btrfs_free_block_rsv(root, rsv);
@@ -3955,9 +4023,6 @@ void btrfs_evict_inode(struct inode *inode)
                        break;
 
                trans->block_rsv = &root->fs_info->trans_block_rsv;
-               ret = btrfs_update_inode(trans, root, inode);
-               BUG_ON(ret);
-
                btrfs_end_transaction(trans, root);
                trans = NULL;
                btrfs_btree_balance_dirty(root);
@@ -5006,12 +5071,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
                goto out_unlock;
        }
 
-       err = btrfs_update_inode(trans, root, inode);
-       if (err) {
-               drop_inode = 1;
-               goto out_unlock;
-       }
-
        /*
        * If the active LSM wants to access the inode during
        * d_instantiate it needs these. Smack checks to see
@@ -5949,6 +6008,8 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 
        em->start = start;
        em->orig_start = orig_start;
+       em->mod_start = start;
+       em->mod_len = len;
        em->len = len;
        em->block_len = block_len;
        em->block_start = block_start;
@@ -5990,16 +6051,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
        u64 len = bh_result->b_size;
        struct btrfs_trans_handle *trans;
        int unlock_bits = EXTENT_LOCKED;
-       int ret;
+       int ret = 0;
 
        if (create) {
-               ret = btrfs_delalloc_reserve_space(inode, len);
-               if (ret)
-                       return ret;
+               spin_lock(&BTRFS_I(inode)->lock);
+               BTRFS_I(inode)->outstanding_extents++;
+               spin_unlock(&BTRFS_I(inode)->lock);
                unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
-       } else {
+       } else
                len = min_t(u64, len, root->sectorsize);
-       }
 
        lockstart = start;
        lockend = start + len - 1;
@@ -6011,14 +6071,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
        if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
                return -ENOTBLK;
 
-       if (create) {
-               ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-                                    lockend, EXTENT_DELALLOC, NULL,
-                                    &cached_state, GFP_NOFS);
-               if (ret)
-                       goto unlock_err;
-       }
-
        em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
        if (IS_ERR(em)) {
                ret = PTR_ERR(em);
@@ -6050,7 +6102,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
        if (!create && (em->block_start == EXTENT_MAP_HOLE ||
                        test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
                free_extent_map(em);
-               ret = 0;
                goto unlock_err;
        }
 
@@ -6148,6 +6199,11 @@ unlock:
                 */
                if (start + len > i_size_read(inode))
                        i_size_write(inode, start + len);
+
+               ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                    lockstart + len - 1, EXTENT_DELALLOC, NULL,
+                                    &cached_state, GFP_NOFS);
+               BUG_ON(ret);
        }
 
        /*
@@ -6156,24 +6212,9 @@ unlock:
         * aren't using if there is any left over space.
         */
        if (lockstart < lockend) {
-               if (create && len < lockend - lockstart) {
-                       clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-                                        lockstart + len - 1,
-                                        unlock_bits | EXTENT_DEFRAG, 1, 0,
-                                        &cached_state, GFP_NOFS);
-                       /*
-                        * Beside unlock, we also need to cleanup reserved space
-                        * for the left range by attaching EXTENT_DO_ACCOUNTING.
-                        */
-                       clear_extent_bit(&BTRFS_I(inode)->io_tree,
-                                        lockstart + len, lockend,
-                                        unlock_bits | EXTENT_DO_ACCOUNTING |
-                                        EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);
-               } else {
-                       clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-                                        lockend, unlock_bits, 1, 0,
-                                        &cached_state, GFP_NOFS);
-               }
+               clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                lockend, unlock_bits, 1, 0,
+                                &cached_state, GFP_NOFS);
        } else {
                free_extent_state(cached_state);
        }
@@ -6183,9 +6224,6 @@ unlock:
        return 0;
 
 unlock_err:
-       if (create)
-               unlock_bits |= EXTENT_DO_ACCOUNTING;
-
        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                         unlock_bits, 1, 0, &cached_state, GFP_NOFS);
        return ret;
@@ -6623,15 +6661,63 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
+       size_t count = 0;
+       int flags = 0;
+       bool wakeup = true;
+       bool relock = false;
+       ssize_t ret;
 
        if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
                            offset, nr_segs))
                return 0;
 
-       return __blockdev_direct_IO(rw, iocb, inode,
-                  BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
-                  iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
-                  btrfs_submit_direct, 0);
+       atomic_inc(&inode->i_dio_count);
+       smp_mb__after_atomic_inc();
+
+       if (rw & WRITE) {
+               count = iov_length(iov, nr_segs);
+               /*
+                * If the write DIO is beyond the EOF, we need to update
+                * the isize, which is protected by i_mutex, so we cannot
+                * unlock the i_mutex in this case.
+                */
+               if (offset + count <= inode->i_size) {
+                       mutex_unlock(&inode->i_mutex);
+                       relock = true;
+               }
+               ret = btrfs_delalloc_reserve_space(inode, count);
+               if (ret)
+                       goto out;
+       } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+                                    &BTRFS_I(inode)->runtime_flags))) {
+               inode_dio_done(inode);
+               flags = DIO_LOCKING | DIO_SKIP_HOLES;
+               wakeup = false;
+       }
+
+       ret = __blockdev_direct_IO(rw, iocb, inode,
+                       BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+                       iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+                       btrfs_submit_direct, flags);
+       if (rw & WRITE) {
+               if (ret < 0 && ret != -EIOCBQUEUED)
+                       btrfs_delalloc_release_space(inode, count);
+               else if (ret > 0 && (size_t)ret < count) {
+                       spin_lock(&BTRFS_I(inode)->lock);
+                       BTRFS_I(inode)->outstanding_extents++;
+                       spin_unlock(&BTRFS_I(inode)->lock);
+                       btrfs_delalloc_release_space(inode,
+                                                    count - (size_t)ret);
+               }
+               btrfs_delalloc_release_metadata(inode, 0);
+       }
+out:
+       if (wakeup)
+               inode_dio_done(inode);
+       if (relock)
+               mutex_lock(&inode->i_mutex);
+
+       return ret;
 }
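
For reads, the rewrite above only falls back to the DIO_LOCKING slow path when BTRFS_INODE_READDIO_NEED_LOCK is set (i.e. while a truncate is in flight); for writes it drops i_mutex when the write cannot grow i_size, reserves data space up front, and releases whatever a short write leaves unused. A user-space sketch of an O_DIRECT read that exercises this path (path and sizes illustrative, error handling and includes omitted):

	void *buf;
	int fd = open("/mnt/btrfs/file", O_RDONLY | O_DIRECT);

	posix_memalign(&buf, 4096, 4096);      /* O_DIRECT wants aligned buffers */
	pread(fd, buf, 4096, 0);               /* ends up in btrfs_direct_IO(READ, ...) */
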
 
 #define BTRFS_FIEMAP_FLAGS     (FIEMAP_FLAG_SYNC)
@@ -6735,8 +6821,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
                return;
        }
        lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
-       ordered = btrfs_lookup_ordered_extent(inode,
-                                          page_offset(page));
+       ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
        if (ordered) {
                /*
                 * IO on this page will never be started, so we need
@@ -7216,8 +7301,9 @@ int btrfs_drop_inode(struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
 
+       /* the snap/subvol tree is being deleted */
        if (btrfs_root_refs(&root->root_item) == 0 &&
-           !btrfs_is_free_space_inode(inode))
+           root != root->fs_info->tree_root)
                return 1;
        else
                return generic_drop_inode(inode);
@@ -7299,14 +7385,19 @@ fail:
 static int btrfs_getattr(struct vfsmount *mnt,
                         struct dentry *dentry, struct kstat *stat)
 {
+       u64 delalloc_bytes;
        struct inode *inode = dentry->d_inode;
        u32 blocksize = inode->i_sb->s_blocksize;
 
        generic_fillattr(inode, stat);
        stat->dev = BTRFS_I(inode)->root->anon_dev;
        stat->blksize = PAGE_CACHE_SIZE;
+
+       spin_lock(&BTRFS_I(inode)->lock);
+       delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
+       spin_unlock(&BTRFS_I(inode)->lock);
        stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
-               ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
+                       ALIGN(delalloc_bytes, blocksize)) >> 9;
        return 0;
 }
 
@@ -7583,7 +7674,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 
        INIT_LIST_HEAD(&works);
        INIT_LIST_HEAD(&splice);
-again:
+
        spin_lock(&root->fs_info->delalloc_lock);
        list_splice_init(&root->fs_info->delalloc_inodes, &splice);
        while (!list_empty(&splice)) {
@@ -7593,8 +7684,11 @@ again:
                list_del_init(&binode->delalloc_inodes);
 
                inode = igrab(&binode->vfs_inode);
-               if (!inode)
+               if (!inode) {
+                       clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                                 &binode->runtime_flags);
                        continue;
+               }
 
                list_add_tail(&binode->delalloc_inodes,
                              &root->fs_info->delalloc_inodes);
@@ -7619,13 +7713,6 @@ again:
                btrfs_wait_and_free_delalloc_work(work);
        }
 
-       spin_lock(&root->fs_info->delalloc_lock);
-       if (!list_empty(&root->fs_info->delalloc_inodes)) {
-               spin_unlock(&root->fs_info->delalloc_lock);
-               goto again;
-       }
-       spin_unlock(&root->fs_info->delalloc_lock);
-
        /* the filemap_flush will queue IO into the worker threads, but
         * we have to make sure the IO is actually started and that
         * ordered extents get created before we return
index 338f2597bf7f8da2215e0d87289b01c94f9cfe7c..059546aa8fdf963c46c9ff188237bdf947d0ea06 100644 (file)
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <linux/uuid.h>
+#include <linux/btrfs.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
-#include "ioctl.h"
 #include "print-tree.h"
 #include "volumes.h"
 #include "locking.h"
@@ -367,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root *root,
                                  struct dentry *dentry,
                                  char *name, int namelen,
                                  u64 *async_transid,
-                                 struct btrfs_qgroup_inherit **inherit)
+                                 struct btrfs_qgroup_inherit *inherit)
 {
        struct btrfs_trans_handle *trans;
        struct btrfs_key key;
@@ -401,8 +401,7 @@ static noinline int create_subvol(struct btrfs_root *root,
        if (IS_ERR(trans))
                return PTR_ERR(trans);
 
-       ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid,
-                                  inherit ? *inherit : NULL);
+       ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit);
        if (ret)
                goto fail;
 
@@ -533,7 +532,7 @@ fail:
 
 static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
                           char *name, int namelen, u64 *async_transid,
-                          bool readonly, struct btrfs_qgroup_inherit **inherit)
+                          bool readonly, struct btrfs_qgroup_inherit *inherit)
 {
        struct inode *inode;
        struct btrfs_pending_snapshot *pending_snapshot;
@@ -552,10 +551,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
        pending_snapshot->dentry = dentry;
        pending_snapshot->root = root;
        pending_snapshot->readonly = readonly;
-       if (inherit) {
-               pending_snapshot->inherit = *inherit;
-               *inherit = NULL;        /* take responsibility to free it */
-       }
+       pending_snapshot->inherit = inherit;
 
        trans = btrfs_start_transaction(root->fs_info->extent_root, 6);
        if (IS_ERR(trans)) {
@@ -695,7 +691,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
                                   char *name, int namelen,
                                   struct btrfs_root *snap_src,
                                   u64 *async_transid, bool readonly,
-                                  struct btrfs_qgroup_inherit **inherit)
+                                  struct btrfs_qgroup_inherit *inherit)
 {
        struct inode *dir  = parent->dentry->d_inode;
        struct dentry *dentry;
@@ -818,7 +814,7 @@ static int find_new_extents(struct btrfs_root *root,
 
        while(1) {
                ret = btrfs_search_forward(root, &min_key, &max_key,
-                                          path, 0, newer_than);
+                                          path, newer_than);
                if (ret != 0)
                        goto none;
                if (min_key.objectid != ino)
@@ -1206,6 +1202,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
                if (!(inode->i_sb->s_flags & MS_ACTIVE))
                        break;
 
+               if (btrfs_defrag_cancelled(root->fs_info)) {
+                       printk(KERN_DEBUG "btrfs: defrag_file cancelled\n");
+                       ret = -EAGAIN;
+                       break;
+               }
+
                if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
                                         extent_thresh, &last_len, &skip,
                                         &defrag_end, range->flags &
@@ -1329,9 +1331,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,
        int ret = 0;
        int mod = 0;
 
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
-
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
@@ -1363,6 +1362,10 @@ static noinline int btrfs_ioctl_resize(struct file *file,
                *devstr = '\0';
                devstr = vol_args->name;
                devid = simple_strtoull(devstr, &end, 10);
+               if (!devid) {
+                       ret = -EINVAL;
+                       goto out_free;
+               }
                printk(KERN_INFO "btrfs: resizing devid %llu\n",
                       (unsigned long long)devid);
        }
@@ -1371,7 +1374,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
        if (!device) {
                printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
                       (unsigned long long)devid);
-               ret = -EINVAL;
+               ret = -ENODEV;
                goto out_free;
        }
 
@@ -1379,7 +1382,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
                printk(KERN_INFO "btrfs: resizer unable to apply on "
                       "readonly device %llu\n",
                       (unsigned long long)devid);
-               ret = -EINVAL;
+               ret = -EPERM;
                goto out_free;
        }
 
@@ -1401,7 +1404,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
        }
 
        if (device->is_tgtdev_for_dev_replace) {
-               ret = -EINVAL;
+               ret = -EPERM;
                goto out_free;
        }
 
@@ -1457,7 +1460,7 @@ out:
 static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
                                char *name, unsigned long fd, int subvol,
                                u64 *transid, bool readonly,
-                               struct btrfs_qgroup_inherit **inherit)
+                               struct btrfs_qgroup_inherit *inherit)
 {
        int namelen;
        int ret = 0;
@@ -1566,7 +1569,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
 
        ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
                                              vol_args->fd, subvol, ptr,
-                                             readonly, &inherit);
+                                             readonly, inherit);
 
        if (ret == 0 && ptr &&
            copy_to_user(arg +
@@ -1863,7 +1866,7 @@ static noinline int search_ioctl(struct inode *inode,
        path->keep_locks = 1;
 
        while(1) {
-               ret = btrfs_search_forward(root, &key, &max_key, path, 0,
+               ret = btrfs_search_forward(root, &key, &max_key, path,
                                           sk->min_transid);
                if (ret != 0) {
                        if (ret > 0)
@@ -2171,6 +2174,12 @@ out_unlock:
                shrink_dcache_sb(root->fs_info->sb);
                btrfs_invalidate_inodes(dest);
                d_delete(dentry);
+
+               /* the last ref */
+               if (dest->cache_inode) {
+                       iput(dest->cache_inode);
+                       dest->cache_inode = NULL;
+               }
        }
 out_dput:
        dput(dentry);
@@ -2211,10 +2220,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
                        ret = -EPERM;
                        goto out;
                }
-               ret = btrfs_defrag_root(root, 0);
+               ret = btrfs_defrag_root(root);
                if (ret)
                        goto out;
-               ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
+               ret = btrfs_defrag_root(root->fs_info->extent_root);
                break;
        case S_IFREG:
                if (!(file->f_mode & FMODE_WRITE)) {
@@ -3111,7 +3120,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
        u64 transid;
        int ret;
 
-       trans = btrfs_attach_transaction(root);
+       trans = btrfs_attach_transaction_barrier(root);
        if (IS_ERR(trans)) {
                if (PTR_ERR(trans) != -ENOENT)
                        return PTR_ERR(trans);
@@ -3289,7 +3298,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
        struct inode_fs_paths *ipath = NULL;
        struct btrfs_path *path;
 
-       if (!capable(CAP_SYS_ADMIN))
+       if (!capable(CAP_DAC_READ_SEARCH))
                return -EPERM;
 
        path = btrfs_alloc_path();
@@ -3914,6 +3923,65 @@ out:
        return ret;
 }
 
+static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
+{
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+       const char *label = root->fs_info->super_copy->label;
+       size_t len = strnlen(label, BTRFS_LABEL_SIZE);
+       int ret;
+
+       if (len == BTRFS_LABEL_SIZE) {
+               pr_warn("btrfs: label is too long, return the first %zu bytes\n",
+                       --len);
+       }
+
+       mutex_lock(&root->fs_info->volume_mutex);
+       ret = copy_to_user(arg, label, len);
+       mutex_unlock(&root->fs_info->volume_mutex);
+
+       return ret ? -EFAULT : 0;
+}
+
+static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
+{
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+       struct btrfs_super_block *super_block = root->fs_info->super_copy;
+       struct btrfs_trans_handle *trans;
+       char label[BTRFS_LABEL_SIZE];
+       int ret;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (copy_from_user(label, arg, sizeof(label)))
+               return -EFAULT;
+
+       if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
+               pr_err("btrfs: unable to set label with more than %d bytes\n",
+                      BTRFS_LABEL_SIZE - 1);
+               return -EINVAL;
+       }
+
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
+
+       mutex_lock(&root->fs_info->volume_mutex);
+       trans = btrfs_start_transaction(root, 0);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out_unlock;
+       }
+
+       strcpy(super_block->label, label);
+       ret = btrfs_end_transaction(trans, root);
+
+out_unlock:
+       mutex_unlock(&root->fs_info->volume_mutex);
+       mnt_drop_write_file(file);
+       return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
                cmd, unsigned long arg)
 {
@@ -4014,6 +4082,10 @@ long btrfs_ioctl(struct file *file, unsigned int
                return btrfs_ioctl_qgroup_limit(file, argp);
        case BTRFS_IOC_DEV_REPLACE:
                return btrfs_ioctl_dev_replace(root, argp);
+       case BTRFS_IOC_GET_FSLABEL:
+               return btrfs_ioctl_get_fslabel(file, argp);
+       case BTRFS_IOC_SET_FSLABEL:
+               return btrfs_ioctl_set_fslabel(file, argp);
        }
 
        return -ENOTTY;
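
The two label ioctls added above take a plain char[BTRFS_LABEL_SIZE] buffer. A hedged user-space sketch of driving them (mount point and label text are illustrative; setting the label needs CAP_SYS_ADMIN and a writable mount, error handling omitted):

	char label[BTRFS_LABEL_SIZE] = {0};
	int fd = open("/mnt/btrfs", O_RDONLY);

	ioctl(fd, BTRFS_IOC_GET_FSLABEL, label);            /* read current label */
	strncpy(label, "backups", BTRFS_LABEL_SIZE - 1);
	ioctl(fd, BTRFS_IOC_SET_FSLABEL, label);            /* write new label */
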
index 2a1762c660416c662d32f95060046ba2557ab903..e95df435d8976606730bb04faa3e7ceb2a87917c 100644 (file)
@@ -113,11 +113,10 @@ again:
                read_unlock(&eb->lock);
                return;
        }
-       read_unlock(&eb->lock);
-       wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
-       read_lock(&eb->lock);
        if (atomic_read(&eb->blocking_writers)) {
                read_unlock(&eb->lock);
+               wait_event(eb->write_lock_wq,
+                          atomic_read(&eb->blocking_writers) == 0);
                goto again;
        }
        atomic_inc(&eb->read_locks);
index e5ed56729607a82246cac22a229d105efa562509..dc08d77b717ea47f0eb7d43e351153f556c75eaa 100644 (file)
@@ -196,6 +196,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
        entry->file_offset = file_offset;
        entry->start = start;
        entry->len = len;
+       if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
+           !(type == BTRFS_ORDERED_NOCOW))
+               entry->csum_bytes_left = disk_len;
        entry->disk_len = disk_len;
        entry->bytes_left = len;
        entry->inode = igrab(inode);
@@ -213,6 +216,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
        INIT_LIST_HEAD(&entry->root_extent_list);
        INIT_LIST_HEAD(&entry->work_list);
        init_completion(&entry->completion);
+       INIT_LIST_HEAD(&entry->log_list);
 
        trace_btrfs_ordered_extent_add(inode, entry);
 
@@ -270,6 +274,10 @@ void btrfs_add_ordered_sum(struct inode *inode,
        tree = &BTRFS_I(inode)->ordered_tree;
        spin_lock_irq(&tree->lock);
        list_add_tail(&sum->list, &entry->list);
+       WARN_ON(entry->csum_bytes_left < sum->len);
+       entry->csum_bytes_left -= sum->len;
+       if (entry->csum_bytes_left == 0)
+               wake_up(&entry->wait);
        spin_unlock_irq(&tree->lock);
 }
 
@@ -405,6 +413,66 @@ out:
        return ret == 0;
 }
 
+/* Needs to be called either under a log transaction or with the log_mutex held */
+void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode)
+{
+       struct btrfs_ordered_inode_tree *tree;
+       struct btrfs_ordered_extent *ordered;
+       struct rb_node *n;
+       int index = log->log_transid % 2;
+
+       tree = &BTRFS_I(inode)->ordered_tree;
+       spin_lock_irq(&tree->lock);
+       for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+               ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+               spin_lock(&log->log_extents_lock[index]);
+               if (list_empty(&ordered->log_list)) {
+                       list_add_tail(&ordered->log_list, &log->logged_list[index]);
+                       atomic_inc(&ordered->refs);
+               }
+               spin_unlock(&log->log_extents_lock[index]);
+       }
+       spin_unlock_irq(&tree->lock);
+}
+
+void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
+{
+       struct btrfs_ordered_extent *ordered;
+       int index = transid % 2;
+
+       spin_lock_irq(&log->log_extents_lock[index]);
+       while (!list_empty(&log->logged_list[index])) {
+               ordered = list_first_entry(&log->logged_list[index],
+                                          struct btrfs_ordered_extent,
+                                          log_list);
+               list_del_init(&ordered->log_list);
+               spin_unlock_irq(&log->log_extents_lock[index]);
+               wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
+                                                  &ordered->flags));
+               btrfs_put_ordered_extent(ordered);
+               spin_lock_irq(&log->log_extents_lock[index]);
+       }
+       spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
+void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid)
+{
+       struct btrfs_ordered_extent *ordered;
+       int index = transid % 2;
+
+       spin_lock_irq(&log->log_extents_lock[index]);
+       while (!list_empty(&log->logged_list[index])) {
+               ordered = list_first_entry(&log->logged_list[index],
+                                          struct btrfs_ordered_extent,
+                                          log_list);
+               list_del_init(&ordered->log_list);
+               spin_unlock_irq(&log->log_extents_lock[index]);
+               btrfs_put_ordered_extent(ordered);
+               spin_lock_irq(&log->log_extents_lock[index]);
+       }
+       spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
 /*
  * used to drop a reference on an ordered extent.  This will free
  * the extent if the last reference is dropped
@@ -544,10 +612,12 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
  * extra check to make sure the ordered operation list really is empty
  * before we return
  */
-int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root, int wait)
 {
        struct btrfs_inode *btrfs_inode;
        struct inode *inode;
+       struct btrfs_transaction *cur_trans = trans->transaction;
        struct list_head splice;
        struct list_head works;
        struct btrfs_delalloc_work *work, *next;
@@ -558,14 +628,10 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
 
        mutex_lock(&root->fs_info->ordered_operations_mutex);
        spin_lock(&root->fs_info->ordered_extent_lock);
-again:
-       list_splice_init(&root->fs_info->ordered_operations, &splice);
-
+       list_splice_init(&cur_trans->ordered_operations, &splice);
        while (!list_empty(&splice)) {
-
                btrfs_inode = list_entry(splice.next, struct btrfs_inode,
                                   ordered_operations);
-
                inode = &btrfs_inode->vfs_inode;
 
                list_del_init(&btrfs_inode->ordered_operations);
@@ -574,24 +640,22 @@ again:
                 * the inode may be getting freed (in sys_unlink path).
                 */
                inode = igrab(inode);
-
-               if (!wait && inode) {
-                       list_add_tail(&BTRFS_I(inode)->ordered_operations,
-                             &root->fs_info->ordered_operations);
-               }
-
                if (!inode)
                        continue;
+
+               if (!wait)
+                       list_add_tail(&BTRFS_I(inode)->ordered_operations,
+                                     &cur_trans->ordered_operations);
                spin_unlock(&root->fs_info->ordered_extent_lock);
 
                work = btrfs_alloc_delalloc_work(inode, wait, 1);
                if (!work) {
+                       spin_lock(&root->fs_info->ordered_extent_lock);
                        if (list_empty(&BTRFS_I(inode)->ordered_operations))
                                list_add_tail(&btrfs_inode->ordered_operations,
                                              &splice);
-                       spin_lock(&root->fs_info->ordered_extent_lock);
                        list_splice_tail(&splice,
-                                        &root->fs_info->ordered_operations);
+                                        &cur_trans->ordered_operations);
                        spin_unlock(&root->fs_info->ordered_extent_lock);
                        ret = -ENOMEM;
                        goto out;
@@ -603,9 +667,6 @@ again:
                cond_resched();
                spin_lock(&root->fs_info->ordered_extent_lock);
        }
-       if (wait && !list_empty(&root->fs_info->ordered_operations))
-               goto again;
-
        spin_unlock(&root->fs_info->ordered_extent_lock);
 out:
        list_for_each_entry_safe(work, next, &works, list) {
@@ -974,6 +1035,7 @@ out:
 void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root, struct inode *inode)
 {
+       struct btrfs_transaction *cur_trans = trans->transaction;
        u64 last_mod;
 
        last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
@@ -988,7 +1050,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
        spin_lock(&root->fs_info->ordered_extent_lock);
        if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
                list_add_tail(&BTRFS_I(inode)->ordered_operations,
-                             &root->fs_info->ordered_operations);
+                             &cur_trans->ordered_operations);
        }
        spin_unlock(&root->fs_info->ordered_extent_lock);
 }
index f29d4bf5fbe70dee3990874be8d9975c937f3477..8eadfe406cdd345263fb0bcf60841cd6801a89bb 100644 (file)
@@ -79,6 +79,8 @@ struct btrfs_ordered_sum {
 #define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
                                       * has done its due diligence in updating
                                       * the isize. */
+#define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this
+                                      ordered extent */
 
 struct btrfs_ordered_extent {
        /* logical offset in the file */
@@ -96,6 +98,9 @@ struct btrfs_ordered_extent {
        /* number of bytes that still need writing */
        u64 bytes_left;
 
+       /* number of bytes that still need csumming */
+       u64 csum_bytes_left;
+
        /*
         * the end of the ordered extent which is behind it but
         * didn't update disk_i_size. Please see the comment of
@@ -118,6 +123,9 @@ struct btrfs_ordered_extent {
        /* list of checksums for insertion when the extent io is done */
        struct list_head list;
 
+       /* If we need to wait on this to be done */
+       struct list_head log_list;
+
        /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
        wait_queue_head_t wait;
 
@@ -189,11 +197,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
 int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                                struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root, int wait);
 void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct inode *inode);
 void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
+void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
+void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
+void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
 int __init ordered_data_init(void);
 void ordered_data_exit(void);
 #endif
index 50d95fd190a5932e165e9331313ca3e302635b5d..920957ecb27ee3761f9ee59f006a880884a3b882 100644 (file)
@@ -294,6 +294,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
                               btrfs_dev_extent_chunk_offset(l, dev_extent),
                               (unsigned long long)
                               btrfs_dev_extent_length(l, dev_extent));
+                       break;
                case BTRFS_DEV_STATS_KEY:
                        printk(KERN_INFO "\t\tdevice stats\n");
                        break;
index a5c856234323241c22dde18e1ee072c8bed3ccaa..88ab785bbd73181e64843d97838799619e1c140e 100644 (file)
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
+#include <linux/btrfs.h>
 
 #include "ctree.h"
 #include "transaction.h"
 #include "disk-io.h"
 #include "locking.h"
 #include "ulist.h"
-#include "ioctl.h"
 #include "backref.h"
 
 /* TODO XXX FIXME
@@ -847,6 +847,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
        int ret = 0;
 
        spin_lock(&fs_info->qgroup_lock);
+       if (!fs_info->quota_root) {
+               spin_unlock(&fs_info->qgroup_lock);
+               return 0;
+       }
        fs_info->quota_enabled = 0;
        fs_info->pending_quota_state = 0;
        quota_root = fs_info->quota_root;
index 300e09ac36599ae8b412284b43e677792005fe9e..ba5a3210da9a9ce17cd047d9a532cf676ff83125 100644 (file)
@@ -3017,7 +3017,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
                        }
                }
 
-               page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+               page_start = page_offset(page);
                page_end = page_start + PAGE_CACHE_SIZE - 1;
 
                lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
index 67783e03d1211bdcba1e84a20aa56a528916c370..c78b2a3fc3359e5625ff703f32f4fcc1cb3ae792 100644 (file)
@@ -2708,7 +2708,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
        int     ret;
        struct btrfs_root *root = sctx->dev_root;
 
-       if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+       if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
                return -EIO;
 
        gen = root->fs_info->last_trans_committed;
index 321b7fb4e4417573e9c404069c6b249afb8ab488..68da757615ae55b8b9eaec3a68d6ec346e856761 100644 (file)
@@ -85,6 +85,7 @@ struct send_ctx {
        u32 send_max_size;
        u64 total_send_size;
        u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
+       u64 flags;      /* 'flags' member of btrfs_ioctl_send_args is u64 */
 
        struct vfsmount *mnt;
 
@@ -3709,6 +3710,39 @@ out:
        return ret;
 }
 
+/*
+ * Send an update extent command to user space.
+ */
+static int send_update_extent(struct send_ctx *sctx,
+                             u64 offset, u32 len)
+{
+       int ret = 0;
+       struct fs_path *p;
+
+       p = fs_path_alloc(sctx);
+       if (!p)
+               return -ENOMEM;
+
+       ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
+       if (ret < 0)
+               goto out;
+
+       ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+       if (ret < 0)
+               goto out;
+
+       TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+       TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+       TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
+
+       ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+       fs_path_free(sctx, p);
+       return ret;
+}
+
 static int send_write_or_clone(struct send_ctx *sctx,
                               struct btrfs_path *path,
                               struct btrfs_key *key,
@@ -3744,7 +3778,11 @@ static int send_write_or_clone(struct send_ctx *sctx,
                goto out;
        }
 
-       if (!clone_root) {
+       if (clone_root) {
+               ret = send_clone(sctx, offset, len, clone_root);
+       } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
+               ret = send_update_extent(sctx, offset, len);
+       } else {
                while (pos < len) {
                        l = len - pos;
                        if (l > BTRFS_SEND_READ_SIZE)
@@ -3757,10 +3795,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
                        pos += ret;
                }
                ret = 0;
-       } else {
-               ret = send_clone(sctx, offset, len, clone_root);
        }
-
 out:
        return ret;
 }
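
With the new BTRFS_SEND_FLAG_NO_FILE_DATA flag, send_write_or_clone() above emits a BTRFS_SEND_C_UPDATE_EXTENT command carrying only the offset and length instead of streaming the file data. A hedged sketch of a caller requesting such a metadata-only stream (out_fd and subvol_fd are illustrative placeholders):

	struct btrfs_ioctl_send_args args = {0};

	args.send_fd = out_fd;                         /* stream is written here */
	args.flags = BTRFS_SEND_FLAG_NO_FILE_DATA;     /* skip file data payloads */
	ioctl(subvol_fd, BTRFS_IOC_SEND, &args);
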
@@ -4536,7 +4571,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
        struct btrfs_fs_info *fs_info;
        struct btrfs_ioctl_send_args *arg = NULL;
        struct btrfs_key key;
-       struct file *filp = NULL;
        struct send_ctx *sctx = NULL;
        u32 i;
        u64 *clone_sources_tmp = NULL;
@@ -4561,6 +4595,11 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
                goto out;
        }
 
+       if (arg->flags & ~BTRFS_SEND_FLAG_NO_FILE_DATA) {
+               ret = -EINVAL;
+               goto out;
+       }
+
        sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS);
        if (!sctx) {
                ret = -ENOMEM;
@@ -4572,6 +4611,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
        INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS);
        INIT_LIST_HEAD(&sctx->name_cache_list);
 
+       sctx->flags = arg->flags;
+
        sctx->send_filp = fget(arg->send_fd);
        if (IS_ERR(sctx->send_filp)) {
                ret = PTR_ERR(sctx->send_filp);
@@ -4673,8 +4714,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
                goto out;
 
 out:
-       if (filp)
-               fput(filp);
        kfree(arg);
        vfree(clone_sources_tmp);
 
index 1bf4f32fd4ef28582c6d2401571bf2f7f37617a6..8bb18f7ccaa6f500a6a63fe5aad5bb049720725b 100644 (file)
@@ -86,6 +86,7 @@ enum btrfs_send_cmd {
        BTRFS_SEND_C_UTIMES,
 
        BTRFS_SEND_C_END,
+       BTRFS_SEND_C_UPDATE_EXTENT,
        __BTRFS_SEND_C_MAX,
 };
 #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
index d8982e9601d37862e6405df95df15c1ba34eecb9..db1ba9a2ed64b0336649449d4292012b5406bf68 100644 (file)
 #include <linux/slab.h>
 #include <linux/cleancache.h>
 #include <linux/ratelimit.h>
+#include <linux/btrfs.h>
 #include "compat.h"
 #include "delayed-inode.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
-#include "ioctl.h"
 #include "print-tree.h"
 #include "xattr.h"
 #include "volumes.h"
@@ -63,8 +63,7 @@
 static const struct super_operations btrfs_super_ops;
 static struct file_system_type btrfs_fs_type;
 
-static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
-                                     char nbuf[16])
+static const char *btrfs_decode_error(int errno, char nbuf[16])
 {
        char *errstr = NULL;
 
@@ -98,7 +97,7 @@ static void __save_error_info(struct btrfs_fs_info *fs_info)
         * today we only save the error info into ram.  Long term we'll
         * also send it down to the disk
         */
-       fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
+       set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
 }
 
 static void save_error_info(struct btrfs_fs_info *fs_info)
@@ -114,7 +113,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
        if (sb->s_flags & MS_RDONLY)
                return;
 
-       if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+       if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
                sb->s_flags |= MS_RDONLY;
                printk(KERN_INFO "btrfs is forced readonly\n");
                /*
@@ -142,8 +141,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
        struct super_block *sb = fs_info->sb;
        char nbuf[16];
        const char *errstr;
-       va_list args;
-       va_start(args, fmt);
 
        /*
         * Special case: if the error is EROFS, and we're already
@@ -152,15 +149,18 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
        if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
                return;
 
-       errstr = btrfs_decode_error(fs_info, errno, nbuf);
+       errstr = btrfs_decode_error(errno, nbuf);
        if (fmt) {
-               struct va_format vaf = {
-                       .fmt = fmt,
-                       .va = &args,
-               };
+               struct va_format vaf;
+               va_list args;
+
+               va_start(args, fmt);
+               vaf.fmt = fmt;
+               vaf.va = &args;
 
                printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n",
                        sb->s_id, function, line, errstr, &vaf);
+               va_end(args);
        } else {
                printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
                        sb->s_id, function, line, errstr);
@@ -171,7 +171,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
                save_error_info(fs_info);
                btrfs_handle_error(fs_info);
        }
-       va_end(args);
 }
 
 static const char * const logtypes[] = {
@@ -261,7 +260,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
                char nbuf[16];
                const char *errstr;
 
-               errstr = btrfs_decode_error(root->fs_info, errno, nbuf);
+               errstr = btrfs_decode_error(errno, nbuf);
                btrfs_printk(root->fs_info,
                             "%s:%d: Aborting unused transaction(%s).\n",
                             function, line, errstr);
@@ -289,8 +288,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
        va_start(args, fmt);
        vaf.va = &args;
 
-       errstr = btrfs_decode_error(fs_info, errno, nbuf);
-       if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)
+       errstr = btrfs_decode_error(errno, nbuf);
+       if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR))
                panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
                        s_id, function, line, &vaf, errstr);
 
@@ -438,6 +437,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                case Opt_compress_force:
                case Opt_compress_force_type:
                        compress_force = true;
+                       /* Fallthrough */
                case Opt_compress:
                case Opt_compress_type:
                        if (token == Opt_compress ||
@@ -519,7 +519,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                case Opt_alloc_start:
                        num = match_strdup(&args[0]);
                        if (num) {
+                               mutex_lock(&info->chunk_mutex);
                                info->alloc_start = memparse(num, NULL);
+                               mutex_unlock(&info->chunk_mutex);
                                kfree(num);
                                printk(KERN_INFO
                                        "btrfs: allocations start at %llu\n",
@@ -876,7 +878,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 
        btrfs_wait_ordered_extents(root, 0);
 
-       trans = btrfs_attach_transaction(root);
+       trans = btrfs_attach_transaction_barrier(root);
        if (IS_ERR(trans)) {
                /* no transaction, don't bother */
                if (PTR_ERR(trans) == -ENOENT)
@@ -1289,7 +1291,9 @@ restore:
        fs_info->mount_opt = old_opts;
        fs_info->compress_type = old_compress_type;
        fs_info->max_inline = old_max_inline;
+       mutex_lock(&fs_info->chunk_mutex);
        fs_info->alloc_start = old_alloc_start;
+       mutex_unlock(&fs_info->chunk_mutex);
        btrfs_resize_thread_pool(fs_info,
                old_thread_pool_size, fs_info->thread_pool_size);
        fs_info->metadata_ratio = old_metadata_ratio;
@@ -1559,7 +1563,7 @@ static int btrfs_freeze(struct super_block *sb)
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = btrfs_sb(sb)->tree_root;
 
-       trans = btrfs_attach_transaction(root);
+       trans = btrfs_attach_transaction_barrier(root);
        if (IS_ERR(trans)) {
                /* no transaction, don't bother */
                if (PTR_ERR(trans) == -ENOENT)
@@ -1684,10 +1688,14 @@ static int __init init_btrfs_fs(void)
        if (err)
                goto free_delayed_inode;
 
-       err = btrfs_interface_init();
+       err = btrfs_delayed_ref_init();
        if (err)
                goto free_auto_defrag;
 
+       err = btrfs_interface_init();
+       if (err)
+               goto free_delayed_ref;
+
        err = register_filesystem(&btrfs_fs_type);
        if (err)
                goto unregister_ioctl;
@@ -1699,6 +1707,8 @@ static int __init init_btrfs_fs(void)
 
 unregister_ioctl:
        btrfs_interface_exit();
+free_delayed_ref:
+       btrfs_delayed_ref_exit();
 free_auto_defrag:
        btrfs_auto_defrag_exit();
 free_delayed_inode:
@@ -1720,6 +1730,7 @@ free_compress:
 static void __exit exit_btrfs_fs(void)
 {
        btrfs_destroy_cachep();
+       btrfs_delayed_ref_exit();
        btrfs_auto_defrag_exit();
        btrfs_delayed_inode_exit();
        ordered_data_exit();
index fc03aa60b68440862e4884f90cb88113d15610a7..955204ca04470dd5da2366efaf9917afec56fc57 100644 (file)
@@ -40,7 +40,6 @@ void put_transaction(struct btrfs_transaction *transaction)
        if (atomic_dec_and_test(&transaction->use_count)) {
                BUG_ON(!list_empty(&transaction->list));
                WARN_ON(transaction->delayed_refs.root.rb_node);
-               memset(transaction, 0, sizeof(*transaction));
                kmem_cache_free(btrfs_transaction_cachep, transaction);
        }
 }
@@ -51,6 +50,14 @@ static noinline void switch_commit_root(struct btrfs_root *root)
        root->commit_root = btrfs_root_node(root);
 }
 
+static inline int can_join_transaction(struct btrfs_transaction *trans,
+                                      int type)
+{
+       return !(trans->in_commit &&
+                type != TRANS_JOIN &&
+                type != TRANS_JOIN_NOLOCK);
+}
+
 /*
  * either allocate a new transaction or hop into the existing one
  */
@@ -62,7 +69,7 @@ static noinline int join_transaction(struct btrfs_root *root, int type)
        spin_lock(&fs_info->trans_lock);
 loop:
        /* The file system has been taken offline. No new transactions. */
-       if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+       if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
                spin_unlock(&fs_info->trans_lock);
                return -EROFS;
        }
@@ -86,6 +93,10 @@ loop:
                        spin_unlock(&fs_info->trans_lock);
                        return cur_trans->aborted;
                }
+               if (!can_join_transaction(cur_trans, type)) {
+                       spin_unlock(&fs_info->trans_lock);
+                       return -EBUSY;
+               }
                atomic_inc(&cur_trans->use_count);
                atomic_inc(&cur_trans->num_writers);
                cur_trans->num_joined++;
@@ -114,7 +125,7 @@ loop:
                kmem_cache_free(btrfs_transaction_cachep, cur_trans);
                cur_trans = fs_info->running_transaction;
                goto loop;
-       } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+       } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
                spin_unlock(&fs_info->trans_lock);
                kmem_cache_free(btrfs_transaction_cachep, cur_trans);
                return -EROFS;
@@ -158,6 +169,7 @@ loop:
        spin_lock_init(&cur_trans->delayed_refs.lock);
 
        INIT_LIST_HEAD(&cur_trans->pending_snapshots);
+       INIT_LIST_HEAD(&cur_trans->ordered_operations);
        list_add_tail(&cur_trans->list, &fs_info->trans_list);
        extent_io_tree_init(&cur_trans->dirty_pages,
                             fs_info->btree_inode->i_mapping);
@@ -302,7 +314,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
        int ret;
        u64 qgroup_reserved = 0;
 
-       if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+       if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
                return ERR_PTR(-EROFS);
 
        if (current->journal_info) {
@@ -360,8 +372,11 @@ again:
 
        do {
                ret = join_transaction(root, type);
-               if (ret == -EBUSY)
+               if (ret == -EBUSY) {
                        wait_current_trans(root);
+                       if (unlikely(type == TRANS_ATTACH))
+                               ret = -ENOENT;
+               }
        } while (ret == -EBUSY);
 
        if (ret < 0) {
@@ -383,9 +398,10 @@ again:
        h->block_rsv = NULL;
        h->orig_rsv = NULL;
        h->aborted = 0;
-       h->qgroup_reserved = qgroup_reserved;
+       h->qgroup_reserved = 0;
        h->delayed_ref_elem.seq = 0;
        h->type = type;
+       h->allocating_chunk = false;
        INIT_LIST_HEAD(&h->qgroup_ref_list);
        INIT_LIST_HEAD(&h->new_bgs);
 
@@ -401,6 +417,7 @@ again:
                h->block_rsv = &root->fs_info->trans_block_rsv;
                h->bytes_reserved = num_bytes;
        }
+       h->qgroup_reserved = qgroup_reserved;
 
 got_it:
        btrfs_record_root_in_trans(h, root);
@@ -452,11 +469,43 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
        return start_transaction(root, 0, TRANS_USERSPACE, 0);
 }
 
+/*
+ * btrfs_attach_transaction() - catch the running transaction
+ *
+ * It is used when we want to commit the current transaction, but
+ * don't want to start a new one.
+ *
+ * Note: if this function returns -ENOENT, it just means there is no
+ * running transaction. But it is possible that an inactive transaction
+ * is still in memory, not fully on disk. If you want to be sure there is no
+ * inactive transaction in the fs when -ENOENT is returned, you should
+ * invoke
+ *     btrfs_attach_transaction_barrier()
+ */
 struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
 {
        return start_transaction(root, 0, TRANS_ATTACH, 0);
 }
 
+/*
+ * btrfs_attach_transaction_barrier() - catch the running transaction
+ *
+ * It is similar to the above function, the difference is that this one
+ * will wait for all the inactive transactions until they are fully
+ * complete.
+ */
+struct btrfs_trans_handle *
+btrfs_attach_transaction_barrier(struct btrfs_root *root)
+{
+       struct btrfs_trans_handle *trans;
+
+       trans = start_transaction(root, 0, TRANS_ATTACH, 0);
+       if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT)
+               btrfs_wait_for_commit(root, 0);
+
+       return trans;
+}
+
 /* wait for a transaction commit to be fully complete */
 static noinline void wait_for_commit(struct btrfs_root *root,
                                    struct btrfs_transaction *commit)
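
The two comments above distinguish btrfs_attach_transaction(), which simply reports -ENOENT when no transaction is running, from btrfs_attach_transaction_barrier(), which additionally waits out a commit that may still be in flight when -ENOENT is returned. A toy userspace model of just that distinction (struct toy_fs and its fields are invented for the sketch, not kernel structures):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy model of the attach vs. attach_barrier semantics described above. */
struct toy_fs {
	bool has_running_transaction;	/* a transaction writers could join */
	bool commit_in_flight;		/* an older transaction still hitting disk */
};

static void toy_wait_for_commit(struct toy_fs *fs)
{
	fs->commit_in_flight = false;	/* stand-in for btrfs_wait_for_commit() */
}

static int toy_attach(struct toy_fs *fs)
{
	return fs->has_running_transaction ? 0 : -ENOENT;
}

static int toy_attach_barrier(struct toy_fs *fs)
{
	int ret = toy_attach(fs);

	/* -ENOENT only says "nothing running"; a commit may still be in flight */
	if (ret == -ENOENT)
		toy_wait_for_commit(fs);
	return ret;
}

int main(void)
{
	struct toy_fs fs = { .has_running_transaction = false,
			     .commit_in_flight = true };

	printf("attach: %d, commit in flight: %d\n",
	       toy_attach(&fs), fs.commit_in_flight);
	printf("attach_barrier: %d, commit in flight: %d\n",
	       toy_attach_barrier(&fs), fs.commit_in_flight);
	return 0;
}
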
@@ -645,12 +694,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                btrfs_run_delayed_iputs(root);
 
        if (trans->aborted ||
-           root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+           test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
                err = -EIO;
-       }
        assert_qgroups_uptodate(trans);
 
-       memset(trans, 0, sizeof(*trans));
        kmem_cache_free(btrfs_trans_handle_cachep, trans);
        return err;
 }
@@ -961,10 +1008,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
 }
 
 /*
- * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
- * otherwise every leaf in the btree is read and defragged.
+ * defrag a given btree.
+ * Every leaf in the btree is read and defragged.
  */
-int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
+int btrfs_defrag_root(struct btrfs_root *root)
 {
        struct btrfs_fs_info *info = root->fs_info;
        struct btrfs_trans_handle *trans;
@@ -978,7 +1025,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
                if (IS_ERR(trans))
                        return PTR_ERR(trans);
 
-               ret = btrfs_defrag_leaves(trans, root, cacheonly);
+               ret = btrfs_defrag_leaves(trans, root);
 
                btrfs_end_transaction(trans, root);
                btrfs_btree_balance_dirty(info->tree_root);
@@ -986,6 +1033,12 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 
                if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
                        break;
+
+               if (btrfs_defrag_cancelled(root->fs_info)) {
+                       printk(KERN_DEBUG "btrfs: defrag_root cancelled\n");
+                       ret = -EAGAIN;
+                       break;
+               }
        }
        root->defrag_running = 0;
        return ret;
@@ -1307,13 +1360,13 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
 struct btrfs_async_commit {
        struct btrfs_trans_handle *newtrans;
        struct btrfs_root *root;
-       struct delayed_work work;
+       struct work_struct work;
 };
 
 static void do_async_commit(struct work_struct *work)
 {
        struct btrfs_async_commit *ac =
-               container_of(work, struct btrfs_async_commit, work.work);
+               container_of(work, struct btrfs_async_commit, work);
 
        /*
         * We've got freeze protection passed with the transaction.
@@ -1341,7 +1394,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
        if (!ac)
                return -ENOMEM;
 
-       INIT_DELAYED_WORK(&ac->work, do_async_commit);
+       INIT_WORK(&ac->work, do_async_commit);
        ac->root = root;
        ac->newtrans = btrfs_join_transaction(root);
        if (IS_ERR(ac->newtrans)) {
@@ -1365,7 +1418,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
                        &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
                        1, _THIS_IP_);
 
-       schedule_delayed_work(&ac->work, 0);
+       schedule_work(&ac->work);
 
        /* wait for transaction to start and unblock */
        if (wait_for_unblock)
@@ -1428,7 +1481,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
        }
 
        if (flush_on_commit || snap_pending) {
-               btrfs_start_delalloc_inodes(root, 1);
+               ret = btrfs_start_delalloc_inodes(root, 1);
+               if (ret)
+                       return ret;
                btrfs_wait_ordered_extents(root, 1);
        }
 
@@ -1450,9 +1505,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
         * it here and know for sure that nothing new will be added
         * to the list
         */
-       btrfs_run_ordered_operations(root, 1);
+       ret = btrfs_run_ordered_operations(trans, root, 1);
 
-       return 0;
+       return ret;
 }
 
 /*
@@ -1473,27 +1528,35 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        int should_grow = 0;
        unsigned long now = get_seconds();
 
-       ret = btrfs_run_ordered_operations(root, 0);
+       ret = btrfs_run_ordered_operations(trans, root, 0);
        if (ret) {
                btrfs_abort_transaction(trans, root, ret);
-               goto cleanup_transaction;
+               btrfs_end_transaction(trans, root);
+               return ret;
        }
 
        /* Stop the commit early if ->aborted is set */
        if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
                ret = cur_trans->aborted;
-               goto cleanup_transaction;
+               btrfs_end_transaction(trans, root);
+               return ret;
        }
 
        /* make a pass through all the delayed refs we have so far
         * any runnings procs may add more while we are here
         */
        ret = btrfs_run_delayed_refs(trans, root, 0);
-       if (ret)
-               goto cleanup_transaction;
+       if (ret) {
+               btrfs_end_transaction(trans, root);
+               return ret;
+       }
 
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
+       if (trans->qgroup_reserved) {
+               btrfs_qgroup_free(root, trans->qgroup_reserved);
+               trans->qgroup_reserved = 0;
+       }
 
        cur_trans = trans->transaction;
 
@@ -1507,8 +1570,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                btrfs_create_pending_block_groups(trans, root);
 
        ret = btrfs_run_delayed_refs(trans, root, 0);
-       if (ret)
-               goto cleanup_transaction;
+       if (ret) {
+               btrfs_end_transaction(trans, root);
+               return ret;
+       }
 
        spin_lock(&cur_trans->commit_lock);
        if (cur_trans->in_commit) {
@@ -1772,6 +1837,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 cleanup_transaction:
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
+       if (trans->qgroup_reserved) {
+               btrfs_qgroup_free(root, trans->qgroup_reserved);
+               trans->qgroup_reserved = 0;
+       }
        btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");
 //     WARN_ON(1);
        if (current->journal_info == trans)
index 0e8aa1e6c2870274bc4d623e020bb2a668b238f8..5afd7b1dceacf27a92e2b026aa7d76432ee9f18a 100644 (file)
@@ -43,6 +43,7 @@ struct btrfs_transaction {
        wait_queue_head_t writer_wait;
        wait_queue_head_t commit_wait;
        struct list_head pending_snapshots;
+       struct list_head ordered_operations;
        struct btrfs_delayed_ref_root delayed_refs;
        int aborted;
 };
@@ -68,6 +69,7 @@ struct btrfs_trans_handle {
        struct btrfs_block_rsv *orig_rsv;
        short aborted;
        short adding_csums;
+       bool allocating_chunk;
        enum btrfs_trans_type type;
        /*
         * this root is only needed to validate that the root passed to
@@ -110,13 +112,15 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush(
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
+                                       struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root);
 
 int btrfs_add_dead_root(struct btrfs_root *root);
-int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
+int btrfs_defrag_root(struct btrfs_root *root);
 int btrfs_clean_old_snapshots(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root);
index 3b580ee8ab1ddc01eb06028c8790c815562c977a..94e05c1f118a7045b4bd40e2b03546347371b461 100644 (file)
 #include "transaction.h"
 #include "locking.h"
 
-/* defrag all the leaves in a given btree.  If cache_only == 1, don't read
- * things from disk, otherwise read all the leaves and try to get key order to
+/*
+ * Defrag all the leaves in a given btree.
+ * Read all the leaves and try to get key order to
  * better reflect disk order
  */
 
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
-                       struct btrfs_root *root, int cache_only)
+                       struct btrfs_root *root)
 {
        struct btrfs_path *path = NULL;
        struct btrfs_key key;
@@ -41,9 +42,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
        u64 last_ret = 0;
        u64 min_trans = 0;
 
-       if (cache_only)
-               goto out;
-
        if (root->fs_info->extent_root == root) {
                /*
                 * there's recursion here right now in the tree locking,
@@ -86,11 +84,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
        }
 
        path->keep_locks = 1;
-       if (cache_only)
-               min_trans = root->defrag_trans_start;
 
-       ret = btrfs_search_forward(root, &key, NULL, path,
-                                  cache_only, min_trans);
+       ret = btrfs_search_forward(root, &key, NULL, path, min_trans);
        if (ret < 0)
                goto out;
        if (ret > 0) {
@@ -109,11 +104,11 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
                goto out;
        }
        path->slots[1] = btrfs_header_nritems(path->nodes[1]);
-       next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
+       next_key_ret = btrfs_find_next_key(root, path, &key, 1,
                                           min_trans);
        ret = btrfs_realloc_node(trans, root,
                                 path->nodes[1], 0,
-                                cache_only, &last_ret,
+                                &last_ret,
                                 &root->defrag_progress);
        if (ret) {
                WARN_ON(ret == -EAGAIN);
index 9027bb1e74660758328a3d133fe58b59ddc460e6..1a79087c45751a25526c09c37bcabc741550c7f6 100644 (file)
@@ -278,8 +278,7 @@ static int process_one_buffer(struct btrfs_root *log,
                              struct walk_control *wc, u64 gen)
 {
        if (wc->pin)
-               btrfs_pin_extent_for_log_replay(wc->trans,
-                                               log->fs_info->extent_root,
+               btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
                                                eb->start, eb->len);
 
        if (btrfs_buffer_uptodate(eb, gen, 0)) {
@@ -2281,6 +2280,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        unsigned long log_transid = 0;
 
        mutex_lock(&root->log_mutex);
+       log_transid = root->log_transid;
        index1 = root->log_transid % 2;
        if (atomic_read(&root->log_commit[index1])) {
                wait_log_commit(trans, root, root->log_transid);
@@ -2308,11 +2308,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        /* bail out if we need to do a full commit */
        if (root->fs_info->last_trans_log_full_commit == trans->transid) {
                ret = -EAGAIN;
+               btrfs_free_logged_extents(log, log_transid);
                mutex_unlock(&root->log_mutex);
                goto out;
        }
 
-       log_transid = root->log_transid;
        if (log_transid % 2 == 0)
                mark = EXTENT_DIRTY;
        else
@@ -2324,6 +2324,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
        if (ret) {
                btrfs_abort_transaction(trans, root, ret);
+               btrfs_free_logged_extents(log, log_transid);
                mutex_unlock(&root->log_mutex);
                goto out;
        }
@@ -2363,6 +2364,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                }
                root->fs_info->last_trans_log_full_commit = trans->transid;
                btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+               btrfs_free_logged_extents(log, log_transid);
                mutex_unlock(&log_root_tree->log_mutex);
                ret = -EAGAIN;
                goto out;
@@ -2373,6 +2375,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
                wait_log_commit(trans, log_root_tree,
                                log_root_tree->log_transid);
+               btrfs_free_logged_extents(log, log_transid);
                mutex_unlock(&log_root_tree->log_mutex);
                ret = 0;
                goto out;
@@ -2392,6 +2395,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         */
        if (root->fs_info->last_trans_log_full_commit == trans->transid) {
                btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+               btrfs_free_logged_extents(log, log_transid);
                mutex_unlock(&log_root_tree->log_mutex);
                ret = -EAGAIN;
                goto out_wake_log_root;
@@ -2402,10 +2406,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                                EXTENT_DIRTY | EXTENT_NEW);
        if (ret) {
                btrfs_abort_transaction(trans, root, ret);
+               btrfs_free_logged_extents(log, log_transid);
                mutex_unlock(&log_root_tree->log_mutex);
                goto out_wake_log_root;
        }
        btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+       btrfs_wait_logged_extents(log, log_transid);
 
        btrfs_set_super_log_root(root->fs_info->super_for_commit,
                                log_root_tree->node->start);
@@ -2475,6 +2481,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
                                  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
        }
 
+       /*
+        * We may have short-circuited the log tree with the full commit logic
+        * and left ordered extents on our list, so clear these out to keep us
+        * from leaking inodes and memory.
+        */
+       btrfs_free_logged_extents(log, 0);
+       btrfs_free_logged_extents(log, 1);
+
        free_extent_buffer(log->node);
        kfree(log);
 }
@@ -2724,7 +2738,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
        path->keep_locks = 1;
 
        ret = btrfs_search_forward(root, &min_key, &max_key,
-                                  path, 0, trans->transid);
+                                  path, trans->transid);
 
        /*
         * we didn't find anything from this transaction, see if there
@@ -3271,14 +3285,18 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
        struct btrfs_root *log = root->log_root;
        struct btrfs_file_extent_item *fi;
        struct extent_buffer *leaf;
+       struct btrfs_ordered_extent *ordered;
        struct list_head ordered_sums;
        struct btrfs_map_token token;
        struct btrfs_key key;
-       u64 csum_offset = em->mod_start - em->start;
-       u64 csum_len = em->mod_len;
+       u64 mod_start = em->mod_start;
+       u64 mod_len = em->mod_len;
+       u64 csum_offset;
+       u64 csum_len;
        u64 extent_offset = em->start - em->orig_start;
        u64 block_len;
        int ret;
+       int index = log->log_transid % 2;
        bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
        INIT_LIST_HEAD(&ordered_sums);
@@ -3362,6 +3380,92 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
                csum_len = block_len;
        }
 
+       /*
+        * First check and see if our csums are on our outstanding ordered
+        * extents.
+        */
+again:
+       spin_lock_irq(&log->log_extents_lock[index]);
+       list_for_each_entry(ordered, &log->logged_list[index], log_list) {
+               struct btrfs_ordered_sum *sum;
+
+               if (!mod_len)
+                       break;
+
+               if (ordered->inode != inode)
+                       continue;
+
+               if (ordered->file_offset + ordered->len <= mod_start ||
+                   mod_start + mod_len <= ordered->file_offset)
+                       continue;
+
+               /*
+                * We are going to copy all the csums on this ordered extent, so
+                * go ahead and adjust mod_start and mod_len in case this
+                * ordered extent has already been logged.
+                */
+               if (ordered->file_offset > mod_start) {
+                       if (ordered->file_offset + ordered->len >=
+                           mod_start + mod_len)
+                               mod_len = ordered->file_offset - mod_start;
+                       /*
+                        * If we have this case
+                        *
+                        * |--------- logged extent ---------|
+                        *       |----- ordered extent ----|
+                        *
+                        * Just don't mess with mod_start and mod_len, we'll
+                        * just end up logging more csums than we need and it
+                        * will be ok.
+                        */
+               } else {
+                       if (ordered->file_offset + ordered->len <
+                           mod_start + mod_len) {
+                               mod_len = (mod_start + mod_len) -
+                                       (ordered->file_offset + ordered->len);
+                               mod_start = ordered->file_offset +
+                                       ordered->len;
+                       } else {
+                               mod_len = 0;
+                       }
+               }
+
+               /*
+                * To keep us from looping for the above case of an ordered
+                * extent that falls inside of the logged extent.
+                */
+               if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
+                                    &ordered->flags))
+                       continue;
+               atomic_inc(&ordered->refs);
+               spin_unlock_irq(&log->log_extents_lock[index]);
+               /*
+                * we've dropped the lock, we must either break or
+                * start over after this.
+                */
+
+               wait_event(ordered->wait, ordered->csum_bytes_left == 0);
+
+               list_for_each_entry(sum, &ordered->list, list) {
+                       ret = btrfs_csum_file_blocks(trans, log, sum);
+                       if (ret) {
+                               btrfs_put_ordered_extent(ordered);
+                               goto unlocked;
+                       }
+               }
+               btrfs_put_ordered_extent(ordered);
+               goto again;
+
+       }
+       spin_unlock_irq(&log->log_extents_lock[index]);
+unlocked:
+
+       if (!mod_len || ret)
+               return ret;
+
+       csum_offset = mod_start - em->start;
+       csum_len = mod_len;
+
        /* block start is already adjusted for the file extent offset. */
        ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
                                       em->block_start + csum_offset,
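
The mod_start/mod_len adjustment above has three cases, matching the diagram in the comment: the ordered extent covers the tail of the logged range, covers its head, or sits entirely inside it (in which case the range is deliberately left alone and a few extra csums get logged). A small self-contained C sketch of just that interval arithmetic, using plain integers instead of struct btrfs_ordered_extent:

#include <stdio.h>

/*
 * Shrink the logged range [*mod_start, *mod_start + *mod_len) to exclude the
 * part covered by an ordered extent [off, off + len), mirroring the three
 * cases handled in log_one_extent() above.
 */
static void adjust_mod_range(unsigned long long *mod_start,
			     unsigned long long *mod_len,
			     unsigned long long off, unsigned long long len)
{
	unsigned long long mstart = *mod_start, mlen = *mod_len;

	if (off + len <= mstart || mstart + mlen <= off)
		return;					/* no overlap */

	if (off > mstart) {
		if (off + len >= mstart + mlen)
			*mod_len = off - mstart;	/* ordered covers the tail */
		/*
		 * else: the ordered extent sits inside the logged range; leave
		 * mod_start/mod_len alone and log a few extra csums.
		 */
	} else {
		if (off + len < mstart + mlen) {
			*mod_start = off + len;		/* ordered covers the head */
			*mod_len = (mstart + mlen) - (off + len);
		} else {
			*mod_len = 0;			/* fully covered */
		}
	}
}

int main(void)
{
	unsigned long long start = 100, len = 50;

	adjust_mod_range(&start, &len, 120, 100);	/* tail overlap */
	printf("start=%llu len=%llu\n", start, len);	/* prints start=100 len=20 */
	return 0;
}
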
@@ -3393,6 +3497,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
        struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
        u64 test_gen;
        int ret = 0;
+       int num = 0;
 
        INIT_LIST_HEAD(&extents);
 
@@ -3401,16 +3506,31 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 
        list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
                list_del_init(&em->list);
+
+               /*
+                * 32768 is just an arbitrary cap: walking the modified extents
+                * gets really CPU intensive once there are a lot of them, and
+                * past that point it is faster to simply fall back to a full
+                * transaction commit.
+                */
+               if (++num > 32768) {
+                       list_del_init(&tree->modified_extents);
+                       ret = -EFBIG;
+                       goto process;
+               }
+
                if (em->generation <= test_gen)
                        continue;
                /* Need a ref to keep it from getting evicted from cache */
                atomic_inc(&em->refs);
                set_bit(EXTENT_FLAG_LOGGING, &em->flags);
                list_add_tail(&em->list, &extents);
+               num++;
        }
 
        list_sort(NULL, &extents, extent_cmp);
 
+process:
        while (!list_empty(&extents)) {
                em = list_entry(extents.next, struct extent_map, list);
 
@@ -3513,6 +3633,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
        mutex_lock(&BTRFS_I(inode)->log_mutex);
 
+       btrfs_get_logged_extents(log, inode);
+
        /*
         * a brute force approach to making sure we get the most uptodate
         * copies of everything.
@@ -3558,7 +3680,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
        while (1) {
                ins_nr = 0;
                ret = btrfs_search_forward(root, &min_key, &max_key,
-                                          path, 0, trans->transid);
+                                          path, trans->transid);
                if (ret != 0)
                        break;
 again:
@@ -3656,6 +3778,8 @@ log_extents:
        BTRFS_I(inode)->logged_trans = trans->transid;
        BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
 out_unlock:
+       if (err)
+               btrfs_free_logged_extents(log, log->log_transid);
        mutex_unlock(&BTRFS_I(inode)->log_mutex);
 
        btrfs_free_path(path);
@@ -3822,7 +3946,6 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 end_trans:
        dput(old_parent);
        if (ret < 0) {
-               WARN_ON(ret != -ENOSPC);
                root->fs_info->last_trans_log_full_commit = trans->transid;
                ret = 1;
        }
index 5cbb7f4b16720fc3c3442ffb09752983195836a5..72b1cf1b2b5ee5b586612cf8bf952c805670e66e 100644 (file)
@@ -792,26 +792,76 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
        return ret;
 }
 
+/*
+ * Look for a btrfs signature on a device. This may be called outside of the
+ * mount path and we are not allowed to call set_blocksize during the scan.
+ * The superblock is read via the pagecache.
+ */
 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
 {
        struct btrfs_super_block *disk_super;
        struct block_device *bdev;
-       struct buffer_head *bh;
-       int ret;
+       struct page *page;
+       void *p;
+       int ret = -EINVAL;
        u64 devid;
        u64 transid;
        u64 total_devices;
+       u64 bytenr;
+       pgoff_t index;
 
+       /*
+        * we would like to check all the supers, but that would make
+        * a btrfs mount succeed after a mkfs from a different FS.
+        * So, we need to add a special mount option to scan for
+        * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+        */
+       bytenr = btrfs_sb_offset(0);
        flags |= FMODE_EXCL;
        mutex_lock(&uuid_mutex);
-       ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh);
-       if (ret)
+
+       bdev = blkdev_get_by_path(path, flags, holder);
+
+       if (IS_ERR(bdev)) {
+               ret = PTR_ERR(bdev);
+               printk(KERN_INFO "btrfs: open %s failed\n", path);
                goto error;
-       disk_super = (struct btrfs_super_block *)bh->b_data;
+       }
+
+       /* make sure our super fits in the device */
+       if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
+               goto error_bdev_put;
+
+       /* make sure our super fits in the page */
+       if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
+               goto error_bdev_put;
+
+       /* make sure our super doesn't straddle pages on disk */
+       index = bytenr >> PAGE_CACHE_SHIFT;
+       if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
+               goto error_bdev_put;
+
+       /* pull in the page with our super */
+       page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+                                  index, GFP_NOFS);
+
+       if (IS_ERR_OR_NULL(page))
+               goto error_bdev_put;
+
+       p = kmap(page);
+
+       /* align our pointer to the offset of the super block */
+       disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
+
+       if (btrfs_super_bytenr(disk_super) != bytenr ||
+           disk_super->magic != cpu_to_le64(BTRFS_MAGIC))
+               goto error_unmap;
+
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        transid = btrfs_super_generation(disk_super);
        total_devices = btrfs_super_num_devices(disk_super);
+
        if (disk_super->label[0]) {
                if (disk_super->label[BTRFS_LABEL_SIZE - 1])
                        disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
@@ -819,12 +869,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
        } else {
                printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
        }
+
        printk(KERN_CONT "devid %llu transid %llu %s\n",
               (unsigned long long)devid, (unsigned long long)transid, path);
+
        ret = device_list_add(path, disk_super, devid, fs_devices_ret);
        if (!ret && fs_devices_ret)
                (*fs_devices_ret)->total_devices = total_devices;
-       brelse(bh);
+
+error_unmap:
+       kunmap(page);
+       page_cache_release(page);
+
+error_bdev_put:
        blkdev_put(bdev, flags);
 error:
        mutex_unlock(&uuid_mutex);
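
The three checks above guarantee that the superblock can be read through a single pagecache page: it must lie within the device, fit in one PAGE_CACHE_SIZE page, and not straddle a page boundary. A standalone sketch of the same arithmetic (4096 stands in for PAGE_CACHE_SIZE here, and the sample offset is btrfs_sb_offset(0), i.e. 64KiB):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE_ASSUMED 4096ULL	/* stands in for PAGE_CACHE_SIZE */

/* true if [bytenr, bytenr + super_size) is readable from one pagecache page */
static bool super_fits_in_one_page(unsigned long long bytenr,
				   unsigned long long super_size,
				   unsigned long long device_size)
{
	unsigned long long index = bytenr / PAGE_SIZE_ASSUMED;

	if (bytenr + PAGE_SIZE_ASSUMED >= device_size)
		return false;		/* super would run past the device */
	if (super_size > PAGE_SIZE_ASSUMED)
		return false;		/* super cannot fit in one page */
	if ((bytenr + super_size - 1) / PAGE_SIZE_ASSUMED != index)
		return false;		/* super straddles a page boundary */
	return true;
}

int main(void)
{
	/* primary super at 64KiB, 4KiB of superblock, 1GiB device */
	printf("%d\n", super_fits_in_one_page(65536, 4096, 1ULL << 30));
	return 0;
}
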
@@ -1372,14 +1429,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        u64 devid;
        u64 num_devices;
        u8 *dev_uuid;
+       unsigned seq;
        int ret = 0;
        bool clear_super = false;
 
        mutex_lock(&uuid_mutex);
 
-       all_avail = root->fs_info->avail_data_alloc_bits |
-               root->fs_info->avail_system_alloc_bits |
-               root->fs_info->avail_metadata_alloc_bits;
+       do {
+               seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+               all_avail = root->fs_info->avail_data_alloc_bits |
+                           root->fs_info->avail_system_alloc_bits |
+                           root->fs_info->avail_metadata_alloc_bits;
+       } while (read_seqretry(&root->fs_info->profiles_lock, seq));
 
        num_devices = root->fs_info->fs_devices->num_devices;
        btrfs_dev_replace_lock(&root->fs_info->dev_replace);
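
profiles_lock is a seqlock: the reader re-samples the three avail_*_alloc_bits fields until read_seqretry() reports that no writer interleaved, so btrfs_rm_device() (and btrfs_balance() below) gets a consistent snapshot without blocking writers. A rough single-threaded userspace imitation of that retry protocol using C11 atomics; a truly concurrent version would also need proper barriers around the plain data accesses, which the kernel's seqlock provides:

#include <stdatomic.h>
#include <stdio.h>

/* A toy seqcount: writers make it odd while updating, readers retry on change. */
static atomic_uint seq;
static unsigned long long avail_data, avail_sys, avail_meta;

static void write_profiles(unsigned long long d, unsigned long long s,
			   unsigned long long m)
{
	atomic_fetch_add_explicit(&seq, 1, memory_order_release);	/* odd: write in progress */
	avail_data = d;
	avail_sys = s;
	avail_meta = m;
	atomic_fetch_add_explicit(&seq, 1, memory_order_release);	/* even: write done */
}

static unsigned long long read_all_avail(void)
{
	unsigned int s;
	unsigned long long all;

	do {
		s = atomic_load_explicit(&seq, memory_order_acquire);
		all = avail_data | avail_sys | avail_meta;
	} while ((s & 1) ||
		 atomic_load_explicit(&seq, memory_order_acquire) != s);

	return all;
}

int main(void)
{
	write_profiles(0x1, 0x2, 0x4);
	printf("all_avail=0x%llx\n", read_all_avail());
	return 0;
}
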
@@ -2616,7 +2678,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
        chunk_used = btrfs_block_group_used(&cache->item);
 
        if (bargs->usage == 0)
-               user_thresh = 0;
+               user_thresh = 1;
        else if (bargs->usage > 100)
                user_thresh = cache->key.offset;
        else
@@ -2985,6 +3047,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
        int mixed = 0;
        int ret;
        u64 num_devices;
+       unsigned seq;
 
        if (btrfs_fs_closing(fs_info) ||
            atomic_read(&fs_info->balance_pause_req) ||
@@ -3068,22 +3131,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
        /* allow to reduce meta or sys integrity only if force set */
        allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
                        BTRFS_BLOCK_GROUP_RAID10;
-       if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-            (fs_info->avail_system_alloc_bits & allowed) &&
-            !(bctl->sys.target & allowed)) ||
-           ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-            (fs_info->avail_metadata_alloc_bits & allowed) &&
-            !(bctl->meta.target & allowed))) {
-               if (bctl->flags & BTRFS_BALANCE_FORCE) {
-                       printk(KERN_INFO "btrfs: force reducing metadata "
-                              "integrity\n");
-               } else {
-                       printk(KERN_ERR "btrfs: balance will reduce metadata "
-                              "integrity, use force if you want this\n");
-                       ret = -EINVAL;
-                       goto out;
+       do {
+               seq = read_seqbegin(&fs_info->profiles_lock);
+
+               if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+                    (fs_info->avail_system_alloc_bits & allowed) &&
+                    !(bctl->sys.target & allowed)) ||
+                   ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+                    (fs_info->avail_metadata_alloc_bits & allowed) &&
+                    !(bctl->meta.target & allowed))) {
+                       if (bctl->flags & BTRFS_BALANCE_FORCE) {
+                               printk(KERN_INFO "btrfs: force reducing metadata "
+                                      "integrity\n");
+                       } else {
+                               printk(KERN_ERR "btrfs: balance will reduce metadata "
+                                      "integrity, use force if you want this\n");
+                               ret = -EINVAL;
+                               goto out;
+                       }
                }
-       }
+       } while (read_seqretry(&fs_info->profiles_lock, seq));
 
        if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
                int num_tolerated_disk_barrier_failures;
@@ -3127,6 +3194,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
        mutex_lock(&fs_info->balance_mutex);
        atomic_dec(&fs_info->balance_running);
 
+       if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+               fs_info->num_tolerated_disk_barrier_failures =
+                       btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+       }
+
        if (bargs) {
                memset(bargs, 0, sizeof(*bargs));
                update_ioctl_balance_args(fs_info, 0, bargs);
@@ -3137,11 +3209,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
                __cancel_balance(fs_info);
        }
 
-       if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
-               fs_info->num_tolerated_disk_barrier_failures =
-                       btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
-       }
-
        wake_up(&fs_info->balance_wait_q);
 
        return ret;
@@ -3504,13 +3571,48 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
 }
 
 struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
-       { 2, 1, 0, 4, 2, 2 /* raid10 */ },
-       { 1, 1, 2, 2, 2, 2 /* raid1 */ },
-       { 1, 2, 1, 1, 1, 2 /* dup */ },
-       { 1, 1, 0, 2, 1, 1 /* raid0 */ },
-       { 1, 1, 1, 1, 1, 1 /* single */ },
+       [BTRFS_RAID_RAID10] = {
+               .sub_stripes    = 2,
+               .dev_stripes    = 1,
+               .devs_max       = 0,    /* 0 == as many as possible */
+               .devs_min       = 4,
+               .devs_increment = 2,
+               .ncopies        = 2,
+       },
+       [BTRFS_RAID_RAID1] = {
+               .sub_stripes    = 1,
+               .dev_stripes    = 1,
+               .devs_max       = 2,
+               .devs_min       = 2,
+               .devs_increment = 2,
+               .ncopies        = 2,
+       },
+       [BTRFS_RAID_DUP] = {
+               .sub_stripes    = 1,
+               .dev_stripes    = 2,
+               .devs_max       = 1,
+               .devs_min       = 1,
+               .devs_increment = 1,
+               .ncopies        = 2,
+       },
+       [BTRFS_RAID_RAID0] = {
+               .sub_stripes    = 1,
+               .dev_stripes    = 1,
+               .devs_max       = 0,
+               .devs_min       = 2,
+               .devs_increment = 1,
+               .ncopies        = 1,
+       },
+       [BTRFS_RAID_SINGLE] = {
+               .sub_stripes    = 1,
+               .dev_stripes    = 1,
+               .devs_max       = 1,
+               .devs_min       = 1,
+               .devs_increment = 1,
+               .ncopies        = 1,
+       },
 };
-
 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                               struct btrfs_root *extent_root,
                               struct map_lookup **map_ret,
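
Rewriting btrfs_raid_array with designated initializers makes each parameter self-describing and lets the table be indexed by the BTRFS_RAID_* enum values instead of relying on ordering. A standalone sketch of the same pattern (the enum and field comments below just mirror the table above; they are not the kernel definitions):

#include <stdio.h>

enum raid_index {
	RAID_RAID10, RAID_RAID1, RAID_DUP, RAID_RAID0, RAID_SINGLE, NR_RAID_TYPES
};

struct raid_attr {
	int sub_stripes;	/* stripes mirrored as a group (RAID10) */
	int dev_stripes;	/* stripes placed on the same device (DUP) */
	int devs_max;		/* 0 == as many devices as possible */
	int devs_min;		/* minimum devices required */
	int devs_increment;	/* devices are used in multiples of this */
	int ncopies;		/* copies kept of each block */
};

static const struct raid_attr raid_array[NR_RAID_TYPES] = {
	[RAID_RAID10] = { .sub_stripes = 2, .dev_stripes = 1, .devs_max = 0,
			  .devs_min = 4, .devs_increment = 2, .ncopies = 2 },
	[RAID_RAID1]  = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 2,
			  .devs_min = 2, .devs_increment = 2, .ncopies = 2 },
	[RAID_DUP]    = { .sub_stripes = 1, .dev_stripes = 2, .devs_max = 1,
			  .devs_min = 1, .devs_increment = 1, .ncopies = 2 },
	[RAID_RAID0]  = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 0,
			  .devs_min = 2, .devs_increment = 1, .ncopies = 1 },
	[RAID_SINGLE] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 1,
			  .devs_min = 1, .devs_increment = 1, .ncopies = 1 },
};

int main(void)
{
	printf("raid1: needs >= %d devices, keeps %d copies of each block\n",
	       raid_array[RAID_RAID1].devs_min, raid_array[RAID_RAID1].ncopies);
	return 0;
}
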
@@ -3631,12 +3733,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
                        continue;
 
+               if (ndevs == fs_devices->rw_devices) {
+                       WARN(1, "%s: found more than %llu devices\n",
+                            __func__, fs_devices->rw_devices);
+                       break;
+               }
                devices_info[ndevs].dev_offset = dev_offset;
                devices_info[ndevs].max_avail = max_avail;
                devices_info[ndevs].total_avail = total_avail;
                devices_info[ndevs].dev = device;
                ++ndevs;
-               WARN_ON(ndevs > fs_devices->rw_devices);
        }
 
        /*
@@ -3718,15 +3824,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        write_lock(&em_tree->lock);
        ret = add_extent_mapping(em_tree, em);
        write_unlock(&em_tree->lock);
-       free_extent_map(em);
-       if (ret)
-               goto error;
-
-       ret = btrfs_make_block_group(trans, extent_root, 0, type,
-                                    BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-                                    start, num_bytes);
-       if (ret)
+       if (ret) {
+               free_extent_map(em);
                goto error;
+       }
 
        for (i = 0; i < map->num_stripes; ++i) {
                struct btrfs_device *device;
@@ -3739,15 +3840,42 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                                info->chunk_root->root_key.objectid,
                                BTRFS_FIRST_CHUNK_TREE_OBJECTID,
                                start, dev_offset, stripe_size);
-               if (ret) {
-                       btrfs_abort_transaction(trans, extent_root, ret);
-                       goto error;
-               }
+               if (ret)
+                       goto error_dev_extent;
        }
 
+       ret = btrfs_make_block_group(trans, extent_root, 0, type,
+                                    BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+                                    start, num_bytes);
+       if (ret) {
+               i = map->num_stripes - 1;
+               goto error_dev_extent;
+       }
+
+       free_extent_map(em);
        kfree(devices_info);
        return 0;
 
+error_dev_extent:
+       for (; i >= 0; i--) {
+               struct btrfs_device *device;
+               int err;
+
+               device = map->stripes[i].dev;
+               err = btrfs_free_dev_extent(trans, device, start);
+               if (err) {
+                       btrfs_abort_transaction(trans, extent_root, err);
+                       break;
+               }
+       }
+       write_lock(&em_tree->lock);
+       remove_extent_mapping(em_tree, em);
+       write_unlock(&em_tree->lock);
+
+       /* One for our allocation */
+       free_extent_map(em);
+       /* One for the tree reference */
+       free_extent_map(em);
 error:
        kfree(map);
        kfree(devices_info);
@@ -3887,10 +4015,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
        if (ret)
                return ret;
 
-       alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
-                               fs_info->avail_metadata_alloc_bits;
-       alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
+       alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
        ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
                                  &stripe_size, chunk_offset, alloc_profile);
        if (ret)
@@ -3898,10 +4023,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 
        sys_chunk_offset = chunk_offset + chunk_size;
 
-       alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
-                               fs_info->avail_system_alloc_bits;
-       alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
+       alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
        ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
                                  &sys_chunk_size, &sys_stripe_size,
                                  sys_chunk_offset, alloc_profile);
index d3c3939ac7512e405995793e47b4ca2c3ce640dc..12bb84166a5f11d616771f295a62fccfdb0b2645 100644 (file)
@@ -21,8 +21,8 @@
 
 #include <linux/bio.h>
 #include <linux/sort.h>
+#include <linux/btrfs.h>
 #include "async-thread.h"
-#include "ioctl.h"
 
 #define BTRFS_STRIPE_LEN       (64 * 1024)
 
diff --git a/include/linux/btrfs.h b/include/linux/btrfs.h
new file mode 100644 (file)
index 0000000..22d7991
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef _LINUX_BTRFS_H
+#define _LINUX_BTRFS_H
+
+#include <uapi/linux/btrfs.h>
+
+#endif /* _LINUX_BTRFS_H */
index 4e67194fd2c3c669a3be0f0a6965da816b36457c..5c8a1d25e21c15060f3740fe2e83c5c291b26d82 100644 (file)
@@ -68,6 +68,7 @@ header-y += blkpg.h
 header-y += blktrace_api.h
 header-y += bpqether.h
 header-y += bsg.h
+header-y += btrfs.h
 header-y += can.h
 header-y += capability.h
 header-y += capi.h
similarity index 96%
rename from fs/btrfs/ioctl.h
rename to include/uapi/linux/btrfs.h
index dabca9cc8c2ebe72a7c440d2147e985bcc6b3c2c..fa3a5f9338fc589dc8c3d9bd7f32c723711e0c33 100644 (file)
@@ -16,8 +16,9 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#ifndef __IOCTL_
-#define __IOCTL_
+#ifndef _UAPI_LINUX_BTRFS_H
+#define _UAPI_LINUX_BTRFS_H
+#include <linux/types.h>
 #include <linux/ioctl.h>
 
 #define BTRFS_IOCTL_MAGIC 0x94
@@ -406,6 +407,13 @@ struct btrfs_ioctl_received_subvol_args {
        __u64   reserved[16];           /* in */
 };
 
+/*
+ * Caller doesn't want file data in the send stream, even if the
+ * search of clone sources doesn't find an extent. UPDATE_EXTENT
+ * commands will be sent instead of WRITE commands.
+ */
+#define BTRFS_SEND_FLAG_NO_FILE_DATA     0x1
+
 struct btrfs_ioctl_send_args {
        __s64 send_fd;                  /* in */
        __u64 clone_sources_count;      /* in */
@@ -494,9 +502,13 @@ struct btrfs_ioctl_send_args {
                               struct btrfs_ioctl_qgroup_create_args)
 #define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \
                               struct btrfs_ioctl_qgroup_limit_args)
+#define BTRFS_IOC_GET_FSLABEL _IOR(BTRFS_IOCTL_MAGIC, 49, \
+                                  char[BTRFS_LABEL_SIZE])
+#define BTRFS_IOC_SET_FSLABEL _IOW(BTRFS_IOCTL_MAGIC, 50, \
+                                  char[BTRFS_LABEL_SIZE])
 #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
                                      struct btrfs_ioctl_get_dev_stats)
 #define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
                                    struct btrfs_ioctl_dev_replace_args)
 
-#endif
+#endif /* _UAPI_LINUX_BTRFS_H */
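
With fs/btrfs/ioctl.h exported as <uapi/linux/btrfs.h> (and wrapped by the new <linux/btrfs.h> for kernel code), userspace can include the header directly. A small usage sketch reading the filesystem label through the BTRFS_IOC_GET_FSLABEL ioctl defined above, assuming the exported header is installed; the fallback BTRFS_LABEL_SIZE define and the /mnt path are only for illustration:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>	/* the header exported by this change */

#ifndef BTRFS_LABEL_SIZE
#define BTRFS_LABEL_SIZE 256	/* fallback for illustration only */
#endif

int main(int argc, char **argv)
{
	char label[BTRFS_LABEL_SIZE] = "";
	const char *mnt = argc > 1 ? argv[1] : "/mnt";	/* example mount point */
	int fd = open(mnt, O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, BTRFS_IOC_GET_FSLABEL, label) < 0) {
		perror("BTRFS_IOC_GET_FSLABEL");
		close(fd);
		return 1;
	}
	printf("label: %s\n", label);
	close(fd);
	return 0;
}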