Pileus Git - ~andy/linux/commitdiff
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux...
author: Linus Torvalds <torvalds@linux-foundation.org>
Wed, 10 Oct 2012 01:49:20 +0000 (10:49 +0900)
committer: Linus Torvalds <torvalds@linux-foundation.org>
Wed, 10 Oct 2012 01:49:20 +0000 (10:49 +0900)
Pull btrfs update from Chris Mason:
 "This is a large pull, with the bulk of the updates coming from:

   - Hole punching

   - send/receive fixes

   - fsync performance

   - Disk format extension allowing more hardlinks inside a single
     directory (btrfs-progs patch required to enable the compat bit for
     this one)

  I'm cooking more unrelated RAID code, but I wanted to make sure this
  original batch makes it in.  The largest updates here are relatively
  old and have been in testing for some time."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (121 commits)
  btrfs: init ref_index to zero in add_inode_ref
  Btrfs: remove repeated eb->pages check in, disk-io.c/csum_dirty_buffer
  Btrfs: fix page leakage
  Btrfs: do not warn_on when we cannot alloc a page for an extent buffer
  Btrfs: don't bug on enomem in readpage
  Btrfs: cleanup pages properly when ENOMEM in compression
  Btrfs: make filesystem read-only when submitting barrier fails
  Btrfs: detect corrupted filesystem after write I/O errors
  Btrfs: make compress and nodatacow mount options mutually exclusive
  btrfs: fix message printing
  Btrfs: don't bother committing delayed inode updates when fsyncing
  btrfs: move inline function code to header file
  Btrfs: remove unnecessary IS_ERR in bio_readpage_error()
  btrfs: remove unused function btrfs_insert_some_items()
  Btrfs: don't commit instead of overcommitting
  Btrfs: confirmation of value is added before trace_btrfs_get_extent() is called
  Btrfs: be smarter about dropping things from the tree log
  Btrfs: don't lookup csums for prealloc extents
  Btrfs: cache extent state when writing out dirty metadata pages
  Btrfs: do not hold the file extent leaf locked when adding extent item
  ...

39 files changed:
fs/btrfs/backref.c
fs/btrfs/backref.h
fs/btrfs/btrfs_inode.h
fs/btrfs/check-integrity.c
fs/btrfs/compression.c
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-inode.c
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/extent_map.c
fs/btrfs/extent_map.h
fs/btrfs/file-item.c
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/hash.h
fs/btrfs/inode-item.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/qgroup.c
fs/btrfs/relocation.c
fs/btrfs/root-tree.c
fs/btrfs/scrub.c
fs/btrfs/send.c
fs/btrfs/send.h
fs/btrfs/super.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h
fs/btrfs/tree-log.c
fs/btrfs/ulist.c
fs/btrfs/ulist.h
fs/btrfs/volumes.c
fs/btrfs/zlib.c
include/trace/events/btrfs.h

index ff6475f409d64aaade5499f9d0cee64c2989a80c..f3187938e081c7dcbf842d424f5de6ed06d3f93f 100644 (file)
@@ -16,6 +16,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/vmalloc.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "backref.h"
@@ -231,7 +232,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
                        }
                        if (!ret) {
                                ret = ulist_add(parents, eb->start,
-                                               (unsigned long)eie, GFP_NOFS);
+                                               (uintptr_t)eie, GFP_NOFS);
                                if (ret < 0)
                                        break;
                                if (!extent_item_pos) {
@@ -363,8 +364,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
                ULIST_ITER_INIT(&uiter);
                node = ulist_next(parents, &uiter);
                ref->parent = node ? node->val : 0;
-               ref->inode_list =
-                       node ? (struct extent_inode_elem *)node->aux : 0;
+               ref->inode_list = node ?
+                       (struct extent_inode_elem *)(uintptr_t)node->aux : 0;
 
                /* additional parents require new refs being added here */
                while ((node = ulist_next(parents, &uiter))) {
@@ -375,8 +376,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
                        }
                        memcpy(new_ref, ref, sizeof(*ref));
                        new_ref->parent = node->val;
-                       new_ref->inode_list =
-                                       (struct extent_inode_elem *)node->aux;
+                       new_ref->inode_list = (struct extent_inode_elem *)
+                                                       (uintptr_t)node->aux;
                        list_add(&new_ref->list, &ref->list);
                }
                ulist_reinit(parents);
@@ -914,8 +915,8 @@ again:
                                free_extent_buffer(eb);
                        }
                        ret = ulist_add_merge(refs, ref->parent,
-                                             (unsigned long)ref->inode_list,
-                                             (unsigned long *)&eie, GFP_NOFS);
+                                             (uintptr_t)ref->inode_list,
+                                             (u64 *)&eie, GFP_NOFS);
                        if (!ret && extent_item_pos) {
                                /*
                                 * we've recorded that parent, so we must extend
@@ -959,7 +960,7 @@ static void free_leaf_list(struct ulist *blocks)
        while ((node = ulist_next(blocks, &uiter))) {
                if (!node->aux)
                        continue;
-               eie = (struct extent_inode_elem *)node->aux;
+               eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
                for (; eie; eie = eie_next) {
                        eie_next = eie->next;
                        kfree(eie);
@@ -1108,26 +1109,80 @@ static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
                                found_key);
 }
 
-/*
- * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
- * of the path are separated by '/' and the path is guaranteed to be
- * 0-terminated. the path is only given within the current file system.
- * Therefore, it never starts with a '/'. the caller is responsible to provide
- * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
- * the start point of the resulting string is returned. this pointer is within
- * dest, normally.
- * in case the path buffer would overflow, the pointer is decremented further
- * as if output was written to the buffer, though no more output is actually
- * generated. that way, the caller can determine how much space would be
- * required for the path to fit into the buffer. in that case, the returned
- * value will be smaller than dest. callers must check this!
- */
-char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
-                        struct btrfs_inode_ref *iref,
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+                         u64 start_off, struct btrfs_path *path,
+                         struct btrfs_inode_extref **ret_extref,
+                         u64 *found_off)
+{
+       int ret, slot;
+       struct btrfs_key key;
+       struct btrfs_key found_key;
+       struct btrfs_inode_extref *extref;
+       struct extent_buffer *leaf;
+       unsigned long ptr;
+
+       key.objectid = inode_objectid;
+       btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+       key.offset = start_off;
+
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               return ret;
+
+       while (1) {
+               leaf = path->nodes[0];
+               slot = path->slots[0];
+               if (slot >= btrfs_header_nritems(leaf)) {
+                       /*
+                        * If the item at offset is not found,
+                        * btrfs_search_slot will point us to the slot
+                        * where it should be inserted. In our case
+                        * that will be the slot directly before the
+                        * next INODE_REF_KEY_V2 item. In the case
+                        * that we're pointing to the last slot in a
+                        * leaf, we must move one leaf over.
+                        */
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret) {
+                               if (ret >= 1)
+                                       ret = -ENOENT;
+                               break;
+                       }
+                       continue;
+               }
+
+               btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+               /*
+                * Check that we're still looking at an extended ref key for
+                * this particular objectid. If we have different
+                * objectid or type then there are no more to be found
+                * in the tree and we can exit.
+                */
+               ret = -ENOENT;
+               if (found_key.objectid != inode_objectid)
+                       break;
+               if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
+                       break;
+
+               ret = 0;
+               ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+               extref = (struct btrfs_inode_extref *)ptr;
+               *ret_extref = extref;
+               if (found_off)
+                       *found_off = found_key.offset;
+               break;
+       }
+
+       return ret;
+}
+
+static char *ref_to_path(struct btrfs_root *fs_root,
+                        struct btrfs_path *path,
+                        u32 name_len, unsigned long name_off,
                         struct extent_buffer *eb_in, u64 parent,
                         char *dest, u32 size)
 {
-       u32 len;
        int slot;
        u64 next_inum;
        int ret;
@@ -1135,17 +1190,17 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
        struct extent_buffer *eb = eb_in;
        struct btrfs_key found_key;
        int leave_spinning = path->leave_spinning;
+       struct btrfs_inode_ref *iref;
 
        if (bytes_left >= 0)
                dest[bytes_left] = '\0';
 
        path->leave_spinning = 1;
        while (1) {
-               len = btrfs_inode_ref_name_len(eb, iref);
-               bytes_left -= len;
+               bytes_left -= name_len;
                if (bytes_left >= 0)
                        read_extent_buffer(eb, dest + bytes_left,
-                                               (unsigned long)(iref + 1), len);
+                                          name_off, name_len);
                if (eb != eb_in) {
                        btrfs_tree_read_unlock_blocking(eb);
                        free_extent_buffer(eb);
@@ -1155,6 +1210,7 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
                        ret = -ENOENT;
                if (ret)
                        break;
+
                next_inum = found_key.offset;
 
                /* regular exit ahead */
@@ -1170,8 +1226,11 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
                        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
                }
                btrfs_release_path(path);
-
                iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+               name_len = btrfs_inode_ref_name_len(eb, iref);
+               name_off = (unsigned long)(iref + 1);
+
                parent = next_inum;
                --bytes_left;
                if (bytes_left >= 0)
@@ -1187,13 +1246,40 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
        return dest + bytes_left;
 }
 
+/*
+ * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
+ * of the path are separated by '/' and the path is guaranteed to be
+ * 0-terminated. the path is only given within the current file system.
+ * Therefore, it never starts with a '/'. the caller is responsible to provide
+ * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
+ * the start point of the resulting string is returned. this pointer is within
+ * dest, normally.
+ * in case the path buffer would overflow, the pointer is decremented further
+ * as if output was written to the buffer, though no more output is actually
+ * generated. that way, the caller can determine how much space would be
+ * required for the path to fit into the buffer. in that case, the returned
+ * value will be smaller than dest. callers must check this!
+ */
+char *btrfs_iref_to_path(struct btrfs_root *fs_root,
+                        struct btrfs_path *path,
+                        struct btrfs_inode_ref *iref,
+                        struct extent_buffer *eb_in, u64 parent,
+                        char *dest, u32 size)
+{
+       return ref_to_path(fs_root, path,
+                          btrfs_inode_ref_name_len(eb_in, iref),
+                          (unsigned long)(iref + 1),
+                          eb_in, parent, dest, size);
+}
+
 /*
  * this makes the path point to (logical EXTENT_ITEM *)
  * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
  * tree blocks and <0 on error.
  */
 int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
-                       struct btrfs_path *path, struct btrfs_key *found_key)
+                       struct btrfs_path *path, struct btrfs_key *found_key,
+                       u64 *flags_ret)
 {
        int ret;
        u64 flags;
@@ -1237,10 +1323,17 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
                 (unsigned long long)found_key->objectid,
                 (unsigned long long)found_key->offset,
                 (unsigned long long)flags, item_size);
-       if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
-               return BTRFS_EXTENT_FLAG_TREE_BLOCK;
-       if (flags & BTRFS_EXTENT_FLAG_DATA)
-               return BTRFS_EXTENT_FLAG_DATA;
+
+       WARN_ON(!flags_ret);
+       if (flags_ret) {
+               if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+                       *flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
+               else if (flags & BTRFS_EXTENT_FLAG_DATA)
+                       *flags_ret = BTRFS_EXTENT_FLAG_DATA;
+               else
+                       BUG_ON(1);
+               return 0;
+       }
 
        return -EIO;
 }
@@ -1404,12 +1497,13 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
                ULIST_ITER_INIT(&root_uiter);
                while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
                        pr_debug("root %llu references leaf %llu, data list "
-                                "%#lx\n", root_node->val, ref_node->val,
-                                ref_node->aux);
-                       ret = iterate_leaf_refs(
-                               (struct extent_inode_elem *)ref_node->aux,
-                               root_node->val, extent_item_objectid,
-                               iterate, ctx);
+                                "%#llx\n", root_node->val, ref_node->val,
+                                (long long)ref_node->aux);
+                       ret = iterate_leaf_refs((struct extent_inode_elem *)
+                                               (uintptr_t)ref_node->aux,
+                                               root_node->val,
+                                               extent_item_objectid,
+                                               iterate, ctx);
                }
                ulist_free(roots);
                roots = NULL;
@@ -1432,15 +1526,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 {
        int ret;
        u64 extent_item_pos;
+       u64 flags = 0;
        struct btrfs_key found_key;
        int search_commit_root = path->search_commit_root;
 
-       ret = extent_from_logical(fs_info, logical, path,
-                                       &found_key);
+       ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
        btrfs_release_path(path);
        if (ret < 0)
                return ret;
-       if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+       if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
                return -EINVAL;
 
        extent_item_pos = logical - found_key.objectid;
@@ -1451,9 +1545,12 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
        return ret;
 }
 
-static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
-                               struct btrfs_path *path,
-                               iterate_irefs_t *iterate, void *ctx)
+typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
+                             struct extent_buffer *eb, void *ctx);
+
+static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
+                             struct btrfs_path *path,
+                             iterate_irefs_t *iterate, void *ctx)
 {
        int ret = 0;
        int slot;
@@ -1470,7 +1567,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
        while (!ret) {
                path->leave_spinning = 1;
                ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
-                                       &found_key);
+                                    &found_key);
                if (ret < 0)
                        break;
                if (ret) {
@@ -1498,7 +1595,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
                                 "tree %llu\n", cur,
                                 (unsigned long long)found_key.objectid,
                                 (unsigned long long)fs_root->objectid);
-                       ret = iterate(parent, iref, eb, ctx);
+                       ret = iterate(parent, name_len,
+                                     (unsigned long)(iref + 1), eb, ctx);
                        if (ret)
                                break;
                        len = sizeof(*iref) + name_len;
@@ -1513,12 +1611,98 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
        return ret;
 }
 
+static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
+                                struct btrfs_path *path,
+                                iterate_irefs_t *iterate, void *ctx)
+{
+       int ret;
+       int slot;
+       u64 offset = 0;
+       u64 parent;
+       int found = 0;
+       struct extent_buffer *eb;
+       struct btrfs_inode_extref *extref;
+       struct extent_buffer *leaf;
+       u32 item_size;
+       u32 cur_offset;
+       unsigned long ptr;
+
+       while (1) {
+               ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
+                                           &offset);
+               if (ret < 0)
+                       break;
+               if (ret) {
+                       ret = found ? 0 : -ENOENT;
+                       break;
+               }
+               ++found;
+
+               slot = path->slots[0];
+               eb = path->nodes[0];
+               /* make sure we can use eb after releasing the path */
+               atomic_inc(&eb->refs);
+
+               btrfs_tree_read_lock(eb);
+               btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+               btrfs_release_path(path);
+
+               leaf = path->nodes[0];
+               item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+               ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+               cur_offset = 0;
+
+               while (cur_offset < item_size) {
+                       u32 name_len;
+
+                       extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
+                       parent = btrfs_inode_extref_parent(eb, extref);
+                       name_len = btrfs_inode_extref_name_len(eb, extref);
+                       ret = iterate(parent, name_len,
+                                     (unsigned long)&extref->name, eb, ctx);
+                       if (ret)
+                               break;
+
+                       cur_offset += btrfs_inode_extref_name_len(leaf, extref);
+                       cur_offset += sizeof(*extref);
+               }
+               btrfs_tree_read_unlock_blocking(eb);
+               free_extent_buffer(eb);
+
+               offset++;
+       }
+
+       btrfs_release_path(path);
+
+       return ret;
+}
+
+static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
+                        struct btrfs_path *path, iterate_irefs_t *iterate,
+                        void *ctx)
+{
+       int ret;
+       int found_refs = 0;
+
+       ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
+       if (!ret)
+               ++found_refs;
+       else if (ret != -ENOENT)
+               return ret;
+
+       ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
+       if (ret == -ENOENT && found_refs)
+               return 0;
+
+       return ret;
+}
+
 /*
  * returns 0 if the path could be dumped (probably truncated)
  * returns <0 in case of an error
  */
-static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
-                               struct extent_buffer *eb, void *ctx)
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
+                        struct extent_buffer *eb, void *ctx)
 {
        struct inode_fs_paths *ipath = ctx;
        char *fspath;
@@ -1531,20 +1715,17 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
                                        ipath->fspath->bytes_left - s_ptr : 0;
 
        fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
-       fspath = btrfs_iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
-                               inum, fspath_min, bytes_left);
+       fspath = ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
+                            name_off, eb, inum, fspath_min,
+                            bytes_left);
        if (IS_ERR(fspath))
                return PTR_ERR(fspath);
 
        if (fspath > fspath_min) {
-               pr_debug("path resolved: %s\n", fspath);
                ipath->fspath->val[i] = (u64)(unsigned long)fspath;
                ++ipath->fspath->elem_cnt;
                ipath->fspath->bytes_left = fspath - fspath_min;
        } else {
-               pr_debug("missed path, not enough space. missing bytes: %lu, "
-                        "constructed so far: %s\n",
-                        (unsigned long)(fspath_min - fspath), fspath_min);
                ++ipath->fspath->elem_missed;
                ipath->fspath->bytes_missing += fspath_min - fspath;
                ipath->fspath->bytes_left = 0;
@@ -1566,7 +1747,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
 {
        return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
-                               inode_to_path, ipath);
+                            inode_to_path, ipath);
 }
 
 struct btrfs_data_container *init_data_container(u32 total_bytes)
@@ -1575,7 +1756,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
        size_t alloc_bytes;
 
        alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
-       data = kmalloc(alloc_bytes, GFP_NOFS);
+       data = vmalloc(alloc_bytes);
        if (!data)
                return ERR_PTR(-ENOMEM);
 
@@ -1626,6 +1807,6 @@ void free_ipath(struct inode_fs_paths *ipath)
 {
        if (!ipath)
                return;
-       kfree(ipath->fspath);
+       vfree(ipath->fspath);
        kfree(ipath);
 }
index 032f4dc7eab82f9ba5e17497e0062d5556451698..e75533043a5ffbab21ff133877c352b743ef6592 100644 (file)
@@ -33,14 +33,13 @@ struct inode_fs_paths {
 
 typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
                void *ctx);
-typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
-                               struct extent_buffer *eb, void *ctx);
 
 int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
                        struct btrfs_path *path);
 
 int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
-                       struct btrfs_path *path, struct btrfs_key *found_key);
+                       struct btrfs_path *path, struct btrfs_key *found_key,
+                       u64 *flags);
 
 int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
                                struct btrfs_extent_item *ei, u32 item_size,
@@ -69,4 +68,9 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
                                        struct btrfs_path *path);
 void free_ipath(struct inode_fs_paths *ipath);
 
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+                         u64 start_off, struct btrfs_path *path,
+                         struct btrfs_inode_extref **ret_extref,
+                         u64 *found_off);
+
 #endif
index 5b2ad6bc4fe7f18ca8038052d26db5afb2b6c6c6..ed8ca7ca5eff2d2ce0529f42d32fb394b0331e89 100644 (file)
@@ -38,6 +38,7 @@
 #define BTRFS_INODE_DELALLOC_META_RESERVED     4
 #define BTRFS_INODE_HAS_ORPHAN_ITEM            5
 #define BTRFS_INODE_HAS_ASYNC_EXTENT           6
+#define BTRFS_INODE_NEEDS_FULL_SYNC            7
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -143,6 +144,9 @@ struct btrfs_inode {
        /* flags field from the on disk inode */
        u32 flags;
 
+       /* a local copy of root's last_log_commit */
+       unsigned long last_log_commit;
+
        /*
         * Counters to keep track of the number of extent item's we may use due
         * to delalloc and such.  outstanding_extents is the number of extent
@@ -202,15 +206,10 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)
 
 static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 {
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       int ret = 0;
-
-       mutex_lock(&root->log_mutex);
        if (BTRFS_I(inode)->logged_trans == generation &&
-           BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
-               ret = 1;
-       mutex_unlock(&root->log_mutex);
-       return ret;
+           BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit)
+               return 1;
+       return 0;
 }
 
 #endif
index 9197e2e33407d26f67a91196ae95328840e3f6ce..5a3e45db642a6b1c64998ba523be143d382d396e 100644 (file)
@@ -37,8 +37,9 @@
  *        the file system was mounted, (i.e., they have been
  *        referenced by the super block) or they have been
  *        written since then and the write completion callback
- *        was called and a FLUSH request to the device where
- *        these blocks are located was received and completed.
+ *        was called and no write error was indicated and a
+ *        FLUSH request to the device where these blocks are
+ *        located was received and completed.
  *    2b. All referenced blocks need to have a generation
  *        number which is equal to the parent's number.
  *
@@ -2601,6 +2602,17 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
                               (unsigned long long)l->block_ref_to->dev_bytenr,
                               l->block_ref_to->mirror_num);
                        ret = -1;
+               } else if (l->block_ref_to->iodone_w_error) {
+                       printk(KERN_INFO "btrfs: attempt to write superblock"
+                              " which references block %c @%llu (%s/%llu/%d)"
+                              " which has write error!\n",
+                              btrfsic_get_block_type(state, l->block_ref_to),
+                              (unsigned long long)
+                              l->block_ref_to->logical_bytenr,
+                              l->block_ref_to->dev_state->name,
+                              (unsigned long long)l->block_ref_to->dev_bytenr,
+                              l->block_ref_to->mirror_num);
+                       ret = -1;
                } else if (l->parent_generation !=
                           l->block_ref_to->generation &&
                           BTRFSIC_GENERATION_UNKNOWN !=
index 43d1c5a3a030888544d8dd8f7b36239386552d08..c6467aa88bee24fb3f4fe401306aed57442a0a59 100644 (file)
@@ -577,6 +577,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        u64 em_start;
        struct extent_map *em;
        int ret = -ENOMEM;
+       int faili = 0;
        u32 *sums;
 
        tree = &BTRFS_I(inode)->io_tree;
@@ -626,9 +627,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        for (pg_index = 0; pg_index < nr_pages; pg_index++) {
                cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
                                                              __GFP_HIGHMEM);
-               if (!cb->compressed_pages[pg_index])
+               if (!cb->compressed_pages[pg_index]) {
+                       faili = pg_index - 1;
+                       ret = -ENOMEM;
                        goto fail2;
+               }
        }
+       faili = nr_pages - 1;
        cb->nr_pages = nr_pages;
 
        add_ra_bio_pages(inode, em_start + em_len, cb);
@@ -713,8 +718,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        return 0;
 
 fail2:
-       for (pg_index = 0; pg_index < nr_pages; pg_index++)
-               free_page((unsigned long)cb->compressed_pages[pg_index]);
+       while (faili >= 0) {
+               __free_page(cb->compressed_pages[faili]);
+               faili--;
+       }
 
        kfree(cb->compressed_pages);
 fail1:
index 6d183f60d63a0521e8c4461e4d5fd705163d17d1..b334362110003165a72b63433b192f2b481c3b01 100644 (file)
@@ -4401,149 +4401,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
        }
 }
 
-/*
- * Given a key and some data, insert items into the tree.
- * This does all the path init required, making room in the tree if needed.
- * Returns the number of keys that were inserted.
- */
-int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
-                           struct btrfs_root *root,
-                           struct btrfs_path *path,
-                           struct btrfs_key *cpu_key, u32 *data_size,
-                           int nr)
-{
-       struct extent_buffer *leaf;
-       struct btrfs_item *item;
-       int ret = 0;
-       int slot;
-       int i;
-       u32 nritems;
-       u32 total_data = 0;
-       u32 total_size = 0;
-       unsigned int data_end;
-       struct btrfs_disk_key disk_key;
-       struct btrfs_key found_key;
-       struct btrfs_map_token token;
-
-       btrfs_init_map_token(&token);
-
-       for (i = 0; i < nr; i++) {
-               if (total_size + data_size[i] + sizeof(struct btrfs_item) >
-                   BTRFS_LEAF_DATA_SIZE(root)) {
-                       break;
-                       nr = i;
-               }
-               total_data += data_size[i];
-               total_size += data_size[i] + sizeof(struct btrfs_item);
-       }
-       BUG_ON(nr == 0);
-
-       ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
-       if (ret == 0)
-               return -EEXIST;
-       if (ret < 0)
-               goto out;
-
-       leaf = path->nodes[0];
-
-       nritems = btrfs_header_nritems(leaf);
-       data_end = leaf_data_end(root, leaf);
-
-       if (btrfs_leaf_free_space(root, leaf) < total_size) {
-               for (i = nr; i >= 0; i--) {
-                       total_data -= data_size[i];
-                       total_size -= data_size[i] + sizeof(struct btrfs_item);
-                       if (total_size < btrfs_leaf_free_space(root, leaf))
-                               break;
-               }
-               nr = i;
-       }
-
-       slot = path->slots[0];
-       BUG_ON(slot < 0);
-
-       if (slot != nritems) {
-               unsigned int old_data = btrfs_item_end_nr(leaf, slot);
-
-               item = btrfs_item_nr(leaf, slot);
-               btrfs_item_key_to_cpu(leaf, &found_key, slot);
-
-               /* figure out how many keys we can insert in here */
-               total_data = data_size[0];
-               for (i = 1; i < nr; i++) {
-                       if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
-                               break;
-                       total_data += data_size[i];
-               }
-               nr = i;
-
-               if (old_data < data_end) {
-                       btrfs_print_leaf(root, leaf);
-                       printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
-                              slot, old_data, data_end);
-                       BUG_ON(1);
-               }
-               /*
-                * item0..itemN ... dataN.offset..dataN.size .. data0.size
-                */
-               /* first correct the data pointers */
-               for (i = slot; i < nritems; i++) {
-                       u32 ioff;
-
-                       item = btrfs_item_nr(leaf, i);
-                       ioff = btrfs_token_item_offset(leaf, item, &token);
-                       btrfs_set_token_item_offset(leaf, item,
-                                                   ioff - total_data, &token);
-               }
-               /* shift the items */
-               memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
-                             btrfs_item_nr_offset(slot),
-                             (nritems - slot) * sizeof(struct btrfs_item));
-
-               /* shift the data */
-               memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
-                             data_end - total_data, btrfs_leaf_data(leaf) +
-                             data_end, old_data - data_end);
-               data_end = old_data;
-       } else {
-               /*
-                * this sucks but it has to be done, if we are inserting at
-                * the end of the leaf only insert 1 of the items, since we
-                * have no way of knowing whats on the next leaf and we'd have
-                * to drop our current locks to figure it out
-                */
-               nr = 1;
-       }
-
-       /* setup the item for the new data */
-       for (i = 0; i < nr; i++) {
-               btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
-               btrfs_set_item_key(leaf, &disk_key, slot + i);
-               item = btrfs_item_nr(leaf, slot + i);
-               btrfs_set_token_item_offset(leaf, item,
-                                           data_end - data_size[i], &token);
-               data_end -= data_size[i];
-               btrfs_set_token_item_size(leaf, item, data_size[i], &token);
-       }
-       btrfs_set_header_nritems(leaf, nritems + nr);
-       btrfs_mark_buffer_dirty(leaf);
-
-       ret = 0;
-       if (slot == 0) {
-               btrfs_cpu_key_to_disk(&disk_key, cpu_key);
-               fixup_low_keys(trans, root, path, &disk_key, 1);
-       }
-
-       if (btrfs_leaf_free_space(root, leaf) < 0) {
-               btrfs_print_leaf(root, leaf);
-               BUG();
-       }
-out:
-       if (!ret)
-               ret = nr;
-       return ret;
-}
-
 /*
  * this is a helper for btrfs_insert_empty_items, the main goal here is
  * to save stack depth by doing the bulk of the work in a function
@@ -5073,6 +4930,7 @@ static void tree_move_down(struct btrfs_root *root,
                           struct btrfs_path *path,
                           int *level, int root_level)
 {
+       BUG_ON(*level == 0);
        path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
                                        path->slots[*level]);
        path->slots[*level - 1] = 0;
@@ -5089,7 +4947,7 @@ static int tree_move_next_or_upnext(struct btrfs_root *root,
 
        path->slots[*level]++;
 
-       while (path->slots[*level] == nritems) {
+       while (path->slots[*level] >= nritems) {
                if (*level == root_level)
                        return -1;
 
@@ -5433,9 +5291,11 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
                                        goto out;
                                advance_right = ADVANCE;
                        } else {
+                               WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
                                ret = tree_compare_item(left_root, left_path,
                                                right_path, tmp_buf);
                                if (ret) {
+                                       WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
                                        ret = changed_cb(left_root, right_root,
                                                left_path, right_path,
                                                &left_key,
index 9821b672f5a254fb02c4b17c2f3db3798c538c45..926c9ffc66d93324d155481c4ecba13d27fa3fec 100644 (file)
@@ -154,6 +154,13 @@ struct btrfs_ordered_sum;
  */
 #define BTRFS_NAME_LEN 255
 
+/*
+ * Theoretical limit is larger, but we keep this down to a sane
+ * value. That should limit greatly the possibility of collisions on
+ * inode ref items.
+ */
+#define BTRFS_LINK_MAX 65535U
+
 /* 32 bytes in various csum fields */
 #define BTRFS_CSUM_SIZE 32
 
@@ -489,6 +496,8 @@ struct btrfs_super_block {
  */
 #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA    (1ULL << 5)
 
+#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF   (1ULL << 6)
+
 #define BTRFS_FEATURE_COMPAT_SUPP              0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP           0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP                    \
@@ -496,7 +505,8 @@ struct btrfs_super_block {
         BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |        \
         BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |          \
         BTRFS_FEATURE_INCOMPAT_BIG_METADATA |          \
-        BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
+        BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |          \
+        BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -643,6 +653,14 @@ struct btrfs_inode_ref {
        /* name goes here */
 } __attribute__ ((__packed__));
 
+struct btrfs_inode_extref {     /* packed layout with little-endian fixed-width fields */
+       __le64 parent_objectid;  /* presumably the objectid of the referencing directory -- TODO confirm */
+       __le64 index;
+       __le16 name_len;         /* presumably the byte length of name[] -- TODO confirm */
+       __u8   name[0];          /* zero-length trailing array; the name bytes follow the fixed fields */
+       /* name goes here */
+} __attribute__ ((__packed__));
+
 struct btrfs_timespec {
        __le64 sec;
        __le32 nsec;
@@ -1028,12 +1046,22 @@ struct btrfs_space_info {
        wait_queue_head_t wait;
 };
 
+#define        BTRFS_BLOCK_RSV_GLOBAL          1
+#define        BTRFS_BLOCK_RSV_DELALLOC        2
+#define        BTRFS_BLOCK_RSV_TRANS           3
+#define        BTRFS_BLOCK_RSV_CHUNK           4
+#define        BTRFS_BLOCK_RSV_DELOPS          5
+#define        BTRFS_BLOCK_RSV_EMPTY           6
+#define        BTRFS_BLOCK_RSV_TEMP            7
+
 struct btrfs_block_rsv {
        u64 size;
        u64 reserved;
        struct btrfs_space_info *space_info;
        spinlock_t lock;
-       unsigned int full;
+       unsigned short full;
+       unsigned short type;     /* compared against the BTRFS_BLOCK_RSV_* constants by callers */
+       unsigned short failfast;
 };
 
 /*
@@ -1127,6 +1155,9 @@ struct btrfs_block_group_cache {
         * Today it will only have one thing on it, but that may change
         */
        struct list_head cluster_list;
+
+       /* For delayed block group creation */
+       struct list_head new_bg_list;
 };
 
 /* delayed seq elem */
@@ -1240,7 +1271,6 @@ struct btrfs_fs_info {
        struct mutex reloc_mutex;
 
        struct list_head trans_list;
-       struct list_head hashers;
        struct list_head dead_roots;
        struct list_head caching_block_groups;
 
@@ -1366,9 +1396,6 @@ struct btrfs_fs_info {
        struct rb_root defrag_inodes;
        atomic_t defrag_running;
 
-       spinlock_t ref_cache_lock;
-       u64 total_ref_cache_size;
-
        /*
         * these three are in extended format (availability of single
         * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
@@ -1441,6 +1468,8 @@ struct btrfs_fs_info {
 
        /* next backup root to be overwritten */
        int backup_root_index;
+
+       int num_tolerated_disk_barrier_failures;
 };
 
 /*
@@ -1481,9 +1510,9 @@ struct btrfs_root {
        wait_queue_head_t log_commit_wait[2];
        atomic_t log_writers;
        atomic_t log_commit[2];
+       atomic_t log_batch;
        unsigned long log_transid;
        unsigned long last_log_commit;
-       unsigned long log_batch;
        pid_t log_start_pid;
        bool log_multiple_pids;
 
@@ -1592,6 +1621,7 @@ struct btrfs_ioctl_defrag_range_args {
  */
 #define BTRFS_INODE_ITEM_KEY           1
 #define BTRFS_INODE_REF_KEY            12
+#define BTRFS_INODE_EXTREF_KEY         13
 #define BTRFS_XATTR_ITEM_KEY           24
 #define BTRFS_ORPHAN_ITEM_KEY          48
 /* reserve 2-15 close to the inode for later flexibility */
@@ -1978,6 +2008,13 @@ BTRFS_SETGET_STACK_FUNCS(block_group_flags,
 BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
 BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
 
+/* struct btrfs_inode_extref */
+BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
+                  parent_objectid, 64);
+BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
+                  name_len, 16);
+BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
+
 /* struct btrfs_inode_item */
 BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
 BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
@@ -2858,6 +2895,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
                           u64 size);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 group_start);
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
@@ -2874,8 +2913,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+                                             unsigned short type);
 void btrfs_free_block_rsv(struct btrfs_root *root,
                          struct btrfs_block_rsv *rsv);
 int btrfs_block_rsv_add(struct btrfs_root *root,
@@ -3172,12 +3212,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           const char *name, int name_len,
                           u64 inode_objectid, u64 ref_objectid, u64 *index);
-struct btrfs_inode_ref *
-btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
-                       struct btrfs_root *root,
-                       struct btrfs_path *path,
-                       const char *name, int name_len,
-                       u64 inode_objectid, u64 ref_objectid, int mod);
+int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root,
+                             struct btrfs_path *path,
+                             const char *name, int name_len,
+                             u64 inode_objectid, u64 ref_objectid, int mod,
+                             u64 *ret_index);
 int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path, u64 objectid);
@@ -3185,6 +3225,19 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
                       *root, struct btrfs_path *path,
                       struct btrfs_key *location, int mod);
 
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root,
+                         struct btrfs_path *path,
+                         const char *name, int name_len,
+                         u64 inode_objectid, u64 ref_objectid, int ins_len,
+                         int cow);
+
+int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
+                                  u64 ref_objectid, const char *name,
+                                  int name_len,
+                                  struct btrfs_inode_extref **extref_ret);
+
 /* file-item.c */
 int btrfs_del_csums(struct btrfs_trans_handle *trans,
                    struct btrfs_root *root, u64 bytenr, u64 len);
@@ -3249,6 +3302,8 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct inode *dir, u64 objectid,
                        const char *name, int name_len);
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+                       int front);
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct inode *inode, u64 new_size,
@@ -3308,16 +3363,27 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
 int btrfs_defrag_file(struct inode *inode, struct file *file,
                      struct btrfs_ioctl_defrag_range_args *range,
                      u64 newer_than, unsigned long max_pages);
+void btrfs_get_block_group_info(struct list_head *groups_list,
+                               struct btrfs_ioctl_space_info *space);
+
 /* file.c */
 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
                           struct inode *inode);
 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
-                           int skip_pinned);
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+                            int skip_pinned);
+int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace,
+                              u64 start, u64 end, int skip_pinned,
+                              int modified);
 extern const struct file_operations btrfs_file_operations;
-int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
-                      u64 start, u64 end, u64 *hint_byte, int drop_cache);
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+                        struct btrfs_root *root, struct inode *inode,
+                        struct btrfs_path *path, u64 start, u64 end,
+                        u64 *drop_end, int drop_cache);
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root, struct inode *inode, u64 start,
+                      u64 end, int drop_cache);
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
                              struct inode *inode, u64 start, u64 end);
 int btrfs_release_file(struct inode *inode, struct file *file);
@@ -3378,6 +3444,11 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
        }
 }
 
+/*
+ * Call btrfs_abort_transaction as early as possible when an error condition is
+ * detected, that way the exact line number is reported.
+ */
+
 #define btrfs_abort_transaction(trans, root, errno)            \
 do {                                                           \
        __btrfs_abort_transaction(trans, root, __func__,        \
index 52c85e2b95d0f7efa9cbd105bab093eca90e1b9c..478f66bdc57b958445365baf739122e4eec263af 100644 (file)
@@ -29,7 +29,7 @@ static struct kmem_cache *delayed_node_cache;
 
 int __init btrfs_delayed_inode_init(void)
 {
-       delayed_node_cache = kmem_cache_create("delayed_node",
+       delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
                                        sizeof(struct btrfs_delayed_node),
                                        0,
                                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
@@ -650,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata(
         * we're accounted for.
         */
        if (!src_rsv || (!trans->bytes_reserved &&
-           src_rsv != &root->fs_info->delalloc_block_rsv)) {
+                        src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
                ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
                /*
                 * Since we're under a transaction reserve_metadata_bytes could
@@ -668,7 +668,7 @@ static int btrfs_delayed_inode_reserve_metadata(
                                                      num_bytes, 1);
                }
                return ret;
-       } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
+       } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
                spin_lock(&BTRFS_I(inode)->lock);
                if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
                                       &BTRFS_I(inode)->runtime_flags)) {
index 22e98e04c2eabbc0b4baeb2618033657a9344fb9..7cda51995c1e589eaf36fe048518bbbe0bd21109 100644 (file)
 #include "check-integrity.h"
 #include "rcu-string.h"
 
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#endif
+
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
@@ -217,26 +221,16 @@ static struct extent_map *btree_get_extent(struct inode *inode,
        write_lock(&em_tree->lock);
        ret = add_extent_mapping(em_tree, em);
        if (ret == -EEXIST) {
-               u64 failed_start = em->start;
-               u64 failed_len = em->len;
-
                free_extent_map(em);
                em = lookup_extent_mapping(em_tree, start, len);
-               if (em) {
-                       ret = 0;
-               } else {
-                       em = lookup_extent_mapping(em_tree, failed_start,
-                                                  failed_len);
-                       ret = -EIO;
-               }
+               if (!em)
+                       em = ERR_PTR(-EIO);
        } else if (ret) {
                free_extent_map(em);
-               em = NULL;
+               em = ERR_PTR(ret);
        }
        write_unlock(&em_tree->lock);
 
-       if (ret)
-               em = ERR_PTR(ret);
 out:
        return em;
 }
@@ -439,10 +433,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
                WARN_ON(1);
                return 0;
        }
-       if (eb->pages[0] != page) {
-               WARN_ON(1);
-               return 0;
-       }
        if (!PageUptodate(page)) {
                WARN_ON(1);
                return 0;
@@ -869,10 +859,22 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
        return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
 }
 
+static int check_async_write(struct inode *inode, unsigned long bio_flags)
+{      /* result is consumed by btree_submit_bio_hook(); inode is not read here */
+       if (bio_flags & EXTENT_BIO_TREE_LOG)
+               return 0;       /* 0: the hook checksums and maps the write bio directly */
+#ifdef CONFIG_X86
+       if (cpu_has_xmm4_2)
+               return 0;       /* presumably SSE4.2 makes inline checksumming cheap -- TODO confirm */
+#endif
+       return 1;       /* 1: the hook skips its inline branch and continues to its later write path */
+}
+
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                                 int mirror_num, unsigned long bio_flags,
                                 u64 bio_offset)
 {
+       int async = check_async_write(inode, bio_flags);
        int ret;
 
        if (!(rw & REQ_WRITE)) {
@@ -887,6 +889,12 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                        return ret;
                return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
                                     mirror_num, 0);
+       } else if (!async) {
+               ret = btree_csum_one_bio(bio);
+               if (ret)
+                       return ret;
+               return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+                                    mirror_num, 0);
        }
 
        /*
@@ -1168,8 +1176,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        atomic_set(&root->log_commit[0], 0);
        atomic_set(&root->log_commit[1], 0);
        atomic_set(&root->log_writers, 0);
+       atomic_set(&root->log_batch, 0);
        atomic_set(&root->orphan_inodes, 0);
-       root->log_batch = 0;
        root->log_transid = 0;
        root->last_log_commit = 0;
        extent_io_tree_init(&root->dirty_log_pages,
@@ -1667,9 +1675,10 @@ static int transaction_kthread(void *arg)
                spin_unlock(&root->fs_info->trans_lock);
 
                /* If the file system is aborted, this will always fail. */
-               trans = btrfs_join_transaction(root);
+               trans = btrfs_attach_transaction(root);
                if (IS_ERR(trans)) {
-                       cannot_commit = true;
+                       if (PTR_ERR(trans) != -ENOENT)
+                               cannot_commit = true;
                        goto sleep;
                }
                if (transid == trans->transid) {
@@ -1994,13 +2003,11 @@ int open_ctree(struct super_block *sb,
        INIT_LIST_HEAD(&fs_info->trans_list);
        INIT_LIST_HEAD(&fs_info->dead_roots);
        INIT_LIST_HEAD(&fs_info->delayed_iputs);
-       INIT_LIST_HEAD(&fs_info->hashers);
        INIT_LIST_HEAD(&fs_info->delalloc_inodes);
        INIT_LIST_HEAD(&fs_info->ordered_operations);
        INIT_LIST_HEAD(&fs_info->caching_block_groups);
        spin_lock_init(&fs_info->delalloc_lock);
        spin_lock_init(&fs_info->trans_lock);
-       spin_lock_init(&fs_info->ref_cache_lock);
        spin_lock_init(&fs_info->fs_roots_radix_lock);
        spin_lock_init(&fs_info->delayed_iput_lock);
        spin_lock_init(&fs_info->defrag_inodes_lock);
@@ -2014,12 +2021,15 @@ int open_ctree(struct super_block *sb,
        INIT_LIST_HEAD(&fs_info->space_info);
        INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
        btrfs_mapping_init(&fs_info->mapping_tree);
-       btrfs_init_block_rsv(&fs_info->global_block_rsv);
-       btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
-       btrfs_init_block_rsv(&fs_info->trans_block_rsv);
-       btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
-       btrfs_init_block_rsv(&fs_info->empty_block_rsv);
-       btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
+       btrfs_init_block_rsv(&fs_info->global_block_rsv,
+                            BTRFS_BLOCK_RSV_GLOBAL);
+       btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
+                            BTRFS_BLOCK_RSV_DELALLOC);
+       btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
+       btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+       btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
+       btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
+                            BTRFS_BLOCK_RSV_DELOPS);
        atomic_set(&fs_info->nr_async_submits, 0);
        atomic_set(&fs_info->async_delalloc_pages, 0);
        atomic_set(&fs_info->async_submit_draining, 0);
@@ -2491,6 +2501,8 @@ retry_root_backup:
                printk(KERN_ERR "Failed to read block groups: %d\n", ret);
                goto fail_block_groups;
        }
+       fs_info->num_tolerated_disk_barrier_failures =
+               btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
 
        fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
                                               "btrfs-cleaner");
@@ -2874,12 +2886,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
                        printk_in_rcu("btrfs: disabling barriers on dev %s\n",
                                      rcu_str_deref(device->name));
                        device->nobarriers = 1;
-               }
-               if (!bio_flagged(bio, BIO_UPTODATE)) {
+               } else if (!bio_flagged(bio, BIO_UPTODATE)) {
                        ret = -EIO;
-                       if (!bio_flagged(bio, BIO_EOPNOTSUPP))
-                               btrfs_dev_stat_inc_and_print(device,
-                                       BTRFS_DEV_STAT_FLUSH_ERRS);
+                       btrfs_dev_stat_inc_and_print(device,
+                               BTRFS_DEV_STAT_FLUSH_ERRS);
                }
 
                /* drop the reference from the wait == 0 run */
@@ -2918,14 +2928,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 {
        struct list_head *head;
        struct btrfs_device *dev;
-       int errors = 0;
+       int errors_send = 0;
+       int errors_wait = 0;
        int ret;
 
        /* send down all the barriers */
        head = &info->fs_devices->devices;
        list_for_each_entry_rcu(dev, head, dev_list) {
                if (!dev->bdev) {
-                       errors++;
+                       errors_send++;
                        continue;
                }
                if (!dev->in_fs_metadata || !dev->writeable)
@@ -2933,13 +2944,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 
                ret = write_dev_flush(dev, 0);
                if (ret)
-                       errors++;
+                       errors_send++;
        }
 
        /* wait for all the barriers */
        list_for_each_entry_rcu(dev, head, dev_list) {
                if (!dev->bdev) {
-                       errors++;
+                       errors_wait++;
                        continue;
                }
                if (!dev->in_fs_metadata || !dev->writeable)
@@ -2947,13 +2958,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 
                ret = write_dev_flush(dev, 1);
                if (ret)
-                       errors++;
+                       errors_wait++;
        }
-       if (errors)
+       if (errors_send > info->num_tolerated_disk_barrier_failures ||
+           errors_wait > info->num_tolerated_disk_barrier_failures)
                return -EIO;
        return 0;
 }
 
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+       struct btrfs_fs_info *fs_info)
+{      /* result is stored in fs_info->num_tolerated_disk_barrier_failures and bounds the error counts in barrier_all_devices() */
+       struct btrfs_ioctl_space_info space;
+       struct btrfs_space_info *sinfo;
+       u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
+                      BTRFS_BLOCK_GROUP_SYSTEM,
+                      BTRFS_BLOCK_GROUP_METADATA,
+                      BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+       int num_types = 4;      /* must match the number of entries in types[] */
+       int i;
+       int c;
+       int num_tolerated_disk_barrier_failures =
+               (int)fs_info->fs_devices->num_devices;  /* starting value; the loop below only ever lowers it */
+
+       for (i = 0; i < num_types; i++) {
+               struct btrfs_space_info *tmp;
+
+               sinfo = NULL;
+               rcu_read_lock();
+               list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
+                       if (tmp->flags == types[i]) {   /* exact equality on flags, not a bit test */
+                               sinfo = tmp;
+                               break;
+                       }
+               }
+               rcu_read_unlock();
+
+               if (!sinfo)
+                       continue;       /* no space_info with exactly these flags */
+
+               down_read(&sinfo->groups_sem);
+               for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+                       if (!list_empty(&sinfo->block_groups[c])) {
+                               u64 flags;
+
+                               btrfs_get_block_group_info(
+                                       &sinfo->block_groups[c], &space);
+                               if (space.total_bytes == 0 ||
+                                   space.used_bytes == 0)
+                                       continue;       /* next c; groups_sem stays held until up_read() below */
+                               flags = space.flags;
+                               /*
+                                * return
+                                * 0: if dup, single or RAID0 is configured for
+                                *    any of metadata, system or data, else
+                                * 1: if RAID5 is configured, or if RAID1 or
+                                *    RAID10 is configured and only two mirrors
+                                *    are used, else
+                                * 2: if RAID6 is configured, else
+                                * num_mirrors - 1: if RAID1 or RAID10 is
+                                *                  configured and more than
+                                *                  2 mirrors are used.
+                                */
+                               if (num_tolerated_disk_barrier_failures > 0 &&
+                                   ((flags & (BTRFS_BLOCK_GROUP_DUP |
+                                              BTRFS_BLOCK_GROUP_RAID0)) ||
+                                    ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
+                                     == 0)))
+                                       num_tolerated_disk_barrier_failures = 0;        /* DUP, RAID0, or no profile bit set */
+                               else if (num_tolerated_disk_barrier_failures > 1
+                                        &&
+                                        (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+                                                  BTRFS_BLOCK_GROUP_RAID10)))
+                                       num_tolerated_disk_barrier_failures = 1;        /* NOTE(review): only the 0 and 1 outcomes have branches; the RAID5/RAID6 and >2-mirror cases listed above do not */
+                       }
+               }
+               up_read(&sinfo->groups_sem);
+       }
+
+       return num_tolerated_disk_barrier_failures;
+}
+
 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
        struct list_head *head;
@@ -2976,8 +3061,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
        head = &root->fs_info->fs_devices->devices;
 
-       if (do_barriers)
-               barrier_all_devices(root->fs_info);
+       if (do_barriers) {
+               ret = barrier_all_devices(root->fs_info);
+               if (ret) {
+                       mutex_unlock(
+                               &root->fs_info->fs_devices->device_list_mutex);
+                       btrfs_error(root->fs_info, ret,
+                                   "errors while submitting device barriers.");
+                       return ret;
+               }
+       }
 
        list_for_each_entry_rcu(dev, head, dev_list) {
                if (!dev->bdev) {
@@ -3211,10 +3304,6 @@ int close_ctree(struct btrfs_root *root)
                printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
                       (unsigned long long)fs_info->delalloc_bytes);
        }
-       if (fs_info->total_ref_cache_size) {
-               printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
-                      (unsigned long long)fs_info->total_ref_cache_size);
-       }
 
        free_extent_buffer(fs_info->extent_root->node);
        free_extent_buffer(fs_info->extent_root->commit_root);
@@ -3360,52 +3449,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
        return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 }
 
-int btree_lock_page_hook(struct page *page, void *data,
-                               void (*flush_fn)(void *))
-{
-       struct inode *inode = page->mapping->host;
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct extent_buffer *eb;
-
-       /*
-        * We culled this eb but the page is still hanging out on the mapping,
-        * carry on.
-        */
-       if (!PagePrivate(page))
-               goto out;
-
-       eb = (struct extent_buffer *)page->private;
-       if (!eb) {
-               WARN_ON(1);
-               goto out;
-       }
-       if (page != eb->pages[0])
-               goto out;
-
-       if (!btrfs_try_tree_write_lock(eb)) {
-               flush_fn(data);
-               btrfs_tree_lock(eb);
-       }
-       btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-
-       if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
-               spin_lock(&root->fs_info->delalloc_lock);
-               if (root->fs_info->dirty_metadata_bytes >= eb->len)
-                       root->fs_info->dirty_metadata_bytes -= eb->len;
-               else
-                       WARN_ON(1);
-               spin_unlock(&root->fs_info->delalloc_lock);
-       }
-
-       btrfs_tree_unlock(eb);
-out:
-       if (!trylock_page(page)) {
-               flush_fn(data);
-               lock_page(page);
-       }
-       return 0;
-}
-
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
                              int read_only)
 {
@@ -3608,7 +3651,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
 
        while (1) {
                ret = find_first_extent_bit(dirty_pages, start, &start, &end,
-                                           mark);
+                                           mark, NULL);
                if (ret)
                        break;
 
@@ -3663,7 +3706,7 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
 again:
        while (1) {
                ret = find_first_extent_bit(unpin, 0, &start, &end,
-                                           EXTENT_DIRTY);
+                                           EXTENT_DIRTY, NULL);
                if (ret)
                        break;
 
@@ -3800,7 +3843,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
 }
 
 static struct extent_io_ops btree_extent_io_ops = {
-       .write_cache_pages_lock_hook = btree_lock_page_hook,
        .readpage_end_io_hook = btree_readpage_end_io_hook,
        .readpage_io_failed_hook = btree_io_failed_hook,
        .submit_bio_hook = btree_submit_bio_hook,
index c5b00a735fefac258b7914223139d902efacbf99..2025a9132c16119c5795b2614b7d114ada047643 100644 (file)
@@ -95,6 +95,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
                                     u64 objectid);
 int btree_lock_page_hook(struct page *page, void *data,
                                void (*flush_fn)(void *));
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+       struct btrfs_fs_info *fs_info);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_init_lockdep(void);
index ba58024d40d3eaf486c50d96cd116c3b828b9c75..3d3e2c17d8d12234a4a5fdcf2fc1af0eb717b6dd 100644 (file)
@@ -94,8 +94,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     u64 flags, struct btrfs_disk_key *key,
                                     int level, struct btrfs_key *ins);
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *extent_root, u64 alloc_bytes,
-                         u64 flags, int force);
+                         struct btrfs_root *extent_root, u64 flags,
+                         int force);
 static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -312,7 +312,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
        while (start < end) {
                ret = find_first_extent_bit(info->pinned_extents, start,
                                            &extent_start, &extent_end,
-                                           EXTENT_DIRTY | EXTENT_UPTODATE);
+                                           EXTENT_DIRTY | EXTENT_UPTODATE,
+                                           NULL);
                if (ret)
                        break;
 
@@ -2361,10 +2362,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                }
 
 next:
-               do_chunk_alloc(trans, fs_info->extent_root,
-                              2 * 1024 * 1024,
-                              btrfs_get_alloc_profile(root, 0),
-                              CHUNK_ALLOC_NO_FORCE);
                cond_resched();
                spin_lock(&delayed_refs->lock);
        }
@@ -2478,10 +2475,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
        if (root == root->fs_info->extent_root)
                root = root->fs_info->tree_root;
 
-       do_chunk_alloc(trans, root->fs_info->extent_root,
-                      2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
-                      CHUNK_ALLOC_NO_FORCE);
-
        btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
 
        delayed_refs = &trans->transaction->delayed_refs;
@@ -2551,6 +2544,12 @@ again:
        }
 
        if (run_all) {
+               if (!list_empty(&trans->new_bgs)) {
+                       spin_unlock(&delayed_refs->lock);
+                       btrfs_create_pending_block_groups(trans, root);
+                       spin_lock(&delayed_refs->lock);
+               }
+
                node = rb_first(&delayed_refs->root);
                if (!node)
                        goto out;
@@ -3406,7 +3405,6 @@ alloc:
                                return PTR_ERR(trans);
 
                        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                                            bytes + 2 * 1024 * 1024,
                                             alloc_target,
                                             CHUNK_ALLOC_NO_FORCE);
                        btrfs_end_transaction(trans, root);
@@ -3488,8 +3486,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
 }
 
 static int should_alloc_chunk(struct btrfs_root *root,
-                             struct btrfs_space_info *sinfo, u64 alloc_bytes,
-                             int force)
+                             struct btrfs_space_info *sinfo, int force)
 {
        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
        u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
@@ -3504,7 +3501,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
         * and purposes it's used space.  Don't worry about locking the
         * global_rsv, it doesn't change except when the transaction commits.
         */
-       num_allocated += global_rsv->size;
+       if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
+               num_allocated += global_rsv->size;
 
        /*
         * in limited mode, we want to have some free space up to
@@ -3518,15 +3516,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
                if (num_bytes - num_allocated < thresh)
                        return 1;
        }
-       thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
 
-       /* 256MB or 2% of the FS */
-       thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
-       /* system chunks need a much small threshold */
-       if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
-               thresh = 32 * 1024 * 1024;
-
-       if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
+       if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
                return 0;
        return 1;
 }
@@ -3576,8 +3567,7 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
 }
 
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *extent_root, u64 alloc_bytes,
-                         u64 flags, int force)
+                         struct btrfs_root *extent_root, u64 flags, int force)
 {
        struct btrfs_space_info *space_info;
        struct btrfs_fs_info *fs_info = extent_root->fs_info;
@@ -3601,7 +3591,7 @@ again:
                return 0;
        }
 
-       if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
+       if (!should_alloc_chunk(extent_root, space_info, force)) {
                spin_unlock(&space_info->lock);
                return 0;
        } else if (space_info->chunk_alloc) {
@@ -3669,6 +3659,46 @@ out:
        return ret;
 }
 
+static int can_overcommit(struct btrfs_root *root,
+                         struct btrfs_space_info *space_info, u64 bytes,
+                         int flush)
+{
+       u64 profile = btrfs_get_alloc_profile(root, 0);
+       u64 avail;
+       u64 used;
+
+       used = space_info->bytes_used + space_info->bytes_reserved +
+               space_info->bytes_pinned + space_info->bytes_readonly +
+               space_info->bytes_may_use;
+
+       spin_lock(&root->fs_info->free_chunk_lock);
+       avail = root->fs_info->free_chunk_space;
+       spin_unlock(&root->fs_info->free_chunk_lock);
+
+       /*
+        * If we have dup, raid1 or raid10 then only half of the free
+        * space is actually useable.
+        */
+       if (profile & (BTRFS_BLOCK_GROUP_DUP |
+                      BTRFS_BLOCK_GROUP_RAID1 |
+                      BTRFS_BLOCK_GROUP_RAID10))
+               avail >>= 1;
+
+       /*
+        * If we can flush, don't let us overcommit too much, say
+        * 1/8th of the space.  If we aren't flushing, let it overcommit
+        * up to 1/2 of the space.
+        */
+       if (flush)
+               avail >>= 3;
+       else
+               avail >>= 1;
+
+       if (used + bytes < space_info->total_bytes + avail)
+               return 1;
+       return 0;
+}
+
 /*
  * shrink metadata reservation for delalloc
  */
@@ -3693,7 +3723,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
        if (delalloc_bytes == 0) {
                if (trans)
                        return;
-               btrfs_wait_ordered_extents(root, 0, 0);
+               btrfs_wait_ordered_extents(root, 0);
                return;
        }
 
@@ -3703,11 +3733,15 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
                writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
                                               WB_REASON_FS_FREE_SPACE);
 
+               /*
+                * We need to wait for the async pages to actually start before
+                * we do anything.
+                */
+               wait_event(root->fs_info->async_submit_wait,
+                          !atomic_read(&root->fs_info->async_delalloc_pages));
+
                spin_lock(&space_info->lock);
-               if (space_info->bytes_used + space_info->bytes_reserved +
-                   space_info->bytes_pinned + space_info->bytes_readonly +
-                   space_info->bytes_may_use + orig <=
-                   space_info->total_bytes) {
+               if (can_overcommit(root, space_info, orig, !trans)) {
                        spin_unlock(&space_info->lock);
                        break;
                }
@@ -3715,7 +3749,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 
                loops++;
                if (wait_ordered && !trans) {
-                       btrfs_wait_ordered_extents(root, 0, 0);
+                       btrfs_wait_ordered_extents(root, 0);
                } else {
                        time_left = schedule_timeout_killable(1);
                        if (time_left)
@@ -3784,11 +3818,12 @@ commit:
 }
 
 enum flush_state {
-       FLUSH_DELALLOC          =       1,
-       FLUSH_DELALLOC_WAIT     =       2,
-       FLUSH_DELAYED_ITEMS_NR  =       3,
-       FLUSH_DELAYED_ITEMS     =       4,
-       COMMIT_TRANS            =       5,
+       FLUSH_DELAYED_ITEMS_NR  =       1,
+       FLUSH_DELAYED_ITEMS     =       2,
+       FLUSH_DELALLOC          =       3,
+       FLUSH_DELALLOC_WAIT     =       4,
+       ALLOC_CHUNK             =       5,
+       COMMIT_TRANS            =       6,
 };
 
 static int flush_space(struct btrfs_root *root,
@@ -3800,11 +3835,6 @@ static int flush_space(struct btrfs_root *root,
        int ret = 0;
 
        switch (state) {
-       case FLUSH_DELALLOC:
-       case FLUSH_DELALLOC_WAIT:
-               shrink_delalloc(root, num_bytes, orig_bytes,
-                               state == FLUSH_DELALLOC_WAIT);
-               break;
        case FLUSH_DELAYED_ITEMS_NR:
        case FLUSH_DELAYED_ITEMS:
                if (state == FLUSH_DELAYED_ITEMS_NR) {
@@ -3825,6 +3855,24 @@ static int flush_space(struct btrfs_root *root,
                ret = btrfs_run_delayed_items_nr(trans, root, nr);
                btrfs_end_transaction(trans, root);
                break;
+       case FLUSH_DELALLOC:
+       case FLUSH_DELALLOC_WAIT:
+               shrink_delalloc(root, num_bytes, orig_bytes,
+                               state == FLUSH_DELALLOC_WAIT);
+               break;
+       case ALLOC_CHUNK:
+               trans = btrfs_join_transaction(root);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       break;
+               }
+               ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+                                    btrfs_get_alloc_profile(root, 0),
+                                    CHUNK_ALLOC_NO_FORCE);
+               btrfs_end_transaction(trans, root);
+               if (ret == -ENOSPC)
+                       ret = 0;
+               break;
        case COMMIT_TRANS:
                ret = may_commit_transaction(root, space_info, orig_bytes, 0);
                break;
@@ -3856,10 +3904,9 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
        struct btrfs_space_info *space_info = block_rsv->space_info;
        u64 used;
        u64 num_bytes = orig_bytes;
-       int flush_state = FLUSH_DELALLOC;
+       int flush_state = FLUSH_DELAYED_ITEMS_NR;
        int ret = 0;
        bool flushing = false;
-       bool committed = false;
 
 again:
        ret = 0;
@@ -3922,57 +3969,12 @@ again:
                        (orig_bytes * 2);
        }
 
-       if (ret) {
-               u64 profile = btrfs_get_alloc_profile(root, 0);
-               u64 avail;
-
-               /*
-                * If we have a lot of space that's pinned, don't bother doing
-                * the overcommit dance yet and just commit the transaction.
-                */
-               avail = (space_info->total_bytes - space_info->bytes_used) * 8;
-               do_div(avail, 10);
-               if (space_info->bytes_pinned >= avail && flush && !committed) {
-                       space_info->flush = 1;
-                       flushing = true;
-                       spin_unlock(&space_info->lock);
-                       ret = may_commit_transaction(root, space_info,
-                                                    orig_bytes, 1);
-                       if (ret)
-                               goto out;
-                       committed = true;
-                       goto again;
-               }
-
-               spin_lock(&root->fs_info->free_chunk_lock);
-               avail = root->fs_info->free_chunk_space;
-
-               /*
-                * If we have dup, raid1 or raid10 then only half of the free
-                * space is actually useable.
-                */
-               if (profile & (BTRFS_BLOCK_GROUP_DUP |
-                              BTRFS_BLOCK_GROUP_RAID1 |
-                              BTRFS_BLOCK_GROUP_RAID10))
-                       avail >>= 1;
-
-               /*
-                * If we aren't flushing don't let us overcommit too much, say
-                * 1/8th of the space.  If we can flush, let it overcommit up to
-                * 1/2 of the space.
-                */
-               if (flush)
-                       avail >>= 3;
-               else
-                       avail >>= 1;
-                spin_unlock(&root->fs_info->free_chunk_lock);
-
-               if (used + num_bytes < space_info->total_bytes + avail) {
-                       space_info->bytes_may_use += orig_bytes;
-                       trace_btrfs_space_reservation(root->fs_info,
-                               "space_info", space_info->flags, orig_bytes, 1);
-                       ret = 0;
-               }
+       if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
+               space_info->bytes_may_use += orig_bytes;
+               trace_btrfs_space_reservation(root->fs_info, "space_info",
+                                             space_info->flags, orig_bytes,
+                                             1);
+               ret = 0;
        }
 
        /*
@@ -4114,13 +4116,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
        return 0;
 }
 
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
 {
        memset(rsv, 0, sizeof(*rsv));
        spin_lock_init(&rsv->lock);
+       rsv->type = type;
 }
 
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+                                             unsigned short type)
 {
        struct btrfs_block_rsv *block_rsv;
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4129,7 +4133,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
        if (!block_rsv)
                return NULL;
 
-       btrfs_init_block_rsv(block_rsv);
+       btrfs_init_block_rsv(block_rsv, type);
        block_rsv->space_info = __find_space_info(fs_info,
                                                  BTRFS_BLOCK_GROUP_METADATA);
        return block_rsv;
@@ -4138,6 +4142,8 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
 void btrfs_free_block_rsv(struct btrfs_root *root,
                          struct btrfs_block_rsv *rsv)
 {
+       if (!rsv)
+               return;
        btrfs_block_rsv_release(root, rsv, (u64)-1);
        kfree(rsv);
 }
@@ -4416,10 +4422,10 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
        struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
        struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
        /*
-        * two for root back/forward refs, two for directory entries
-        * and one for root of the snapshot.
+        * two for root back/forward refs, two for directory entries,
+        * one for root of the snapshot and one for parent inode.
         */
-       u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
+       u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
        dst_rsv->space_info = src_rsv->space_info;
        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
@@ -5018,7 +5024,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
        while (1) {
                ret = find_first_extent_bit(unpin, 0, &start, &end,
-                                           EXTENT_DIRTY);
+                                           EXTENT_DIRTY, NULL);
                if (ret)
                        break;
 
@@ -5096,8 +5102,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                        ret = remove_extent_backref(trans, extent_root, path,
                                                    NULL, refs_to_drop,
                                                    is_data);
-                       if (ret)
-                               goto abort;
+                       if (ret) {
+                               btrfs_abort_transaction(trans, extent_root, ret);
+                               goto out;
+                       }
                        btrfs_release_path(path);
                        path->leave_spinning = 1;
 
@@ -5115,8 +5123,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                        btrfs_print_leaf(extent_root,
                                                         path->nodes[0]);
                        }
-                       if (ret < 0)
-                               goto abort;
+                       if (ret < 0) {
+                               btrfs_abort_transaction(trans, extent_root, ret);
+                               goto out;
+                       }
                        extent_slot = path->slots[0];
                }
        } else if (ret == -ENOENT) {
@@ -5130,7 +5140,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                       (unsigned long long)owner_objectid,
                       (unsigned long long)owner_offset);
        } else {
-               goto abort;
+               btrfs_abort_transaction(trans, extent_root, ret);
+               goto out;
        }
 
        leaf = path->nodes[0];
@@ -5140,8 +5151,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                BUG_ON(found_extent || extent_slot != path->slots[0]);
                ret = convert_extent_item_v0(trans, extent_root, path,
                                             owner_objectid, 0);
-               if (ret < 0)
-                       goto abort;
+               if (ret < 0) {
+                       btrfs_abort_transaction(trans, extent_root, ret);
+                       goto out;
+               }
 
                btrfs_release_path(path);
                path->leave_spinning = 1;
@@ -5158,8 +5171,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                               (unsigned long long)bytenr);
                        btrfs_print_leaf(extent_root, path->nodes[0]);
                }
-               if (ret < 0)
-                       goto abort;
+               if (ret < 0) {
+                       btrfs_abort_transaction(trans, extent_root, ret);
+                       goto out;
+               }
+
                extent_slot = path->slots[0];
                leaf = path->nodes[0];
                item_size = btrfs_item_size_nr(leaf, extent_slot);
@@ -5196,8 +5212,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                        ret = remove_extent_backref(trans, extent_root, path,
                                                    iref, refs_to_drop,
                                                    is_data);
-                       if (ret)
-                               goto abort;
+                       if (ret) {
+                               btrfs_abort_transaction(trans, extent_root, ret);
+                               goto out;
+                       }
                }
        } else {
                if (found_extent) {
@@ -5214,27 +5232,29 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 
                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
                                      num_to_del);
-               if (ret)
-                       goto abort;
+               if (ret) {
+                       btrfs_abort_transaction(trans, extent_root, ret);
+                       goto out;
+               }
                btrfs_release_path(path);
 
                if (is_data) {
                        ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
-                       if (ret)
-                               goto abort;
+                       if (ret) {
+                               btrfs_abort_transaction(trans, extent_root, ret);
+                               goto out;
+                       }
                }
 
                ret = update_block_group(trans, root, bytenr, num_bytes, 0);
-               if (ret)
-                       goto abort;
+               if (ret) {
+                       btrfs_abort_transaction(trans, extent_root, ret);
+                       goto out;
+               }
        }
 out:
        btrfs_free_path(path);
        return ret;
-
-abort:
-       btrfs_abort_transaction(trans, extent_root, ret);
-       goto out;
 }
 
 /*
@@ -5497,8 +5517,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        struct btrfs_block_group_cache *used_block_group;
        u64 search_start = 0;
        int empty_cluster = 2 * 1024 * 1024;
-       int allowed_chunk_alloc = 0;
-       int done_chunk_alloc = 0;
        struct btrfs_space_info *space_info;
        int loop = 0;
        int index = 0;
@@ -5530,9 +5548,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        if (btrfs_mixed_space_info(space_info))
                use_cluster = false;
 
-       if (orig_root->ref_cows || empty_size)
-               allowed_chunk_alloc = 1;
-
        if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
                last_ptr = &root->fs_info->meta_alloc_cluster;
                if (!btrfs_test_opt(root, SSD))
@@ -5806,10 +5821,6 @@ checks:
 
                trace_btrfs_reserve_extent(orig_root, block_group,
                                           search_start, num_bytes);
-               if (offset < search_start)
-                       btrfs_add_free_space(used_block_group, offset,
-                                            search_start - offset);
-               BUG_ON(offset > search_start);
                if (used_block_group != block_group)
                        btrfs_put_block_group(used_block_group);
                btrfs_put_block_group(block_group);
@@ -5842,34 +5853,17 @@ loop:
                index = 0;
                loop++;
                if (loop == LOOP_ALLOC_CHUNK) {
-                      if (allowed_chunk_alloc) {
-                               ret = do_chunk_alloc(trans, root, num_bytes +
-                                                    2 * 1024 * 1024, data,
-                                                    CHUNK_ALLOC_LIMITED);
-                               /*
-                                * Do not bail out on ENOSPC since we
-                                * can do more things.
-                                */
-                               if (ret < 0 && ret != -ENOSPC) {
-                                       btrfs_abort_transaction(trans,
-                                                               root, ret);
-                                       goto out;
-                               }
-                               allowed_chunk_alloc = 0;
-                               if (ret == 1)
-                                       done_chunk_alloc = 1;
-                       } else if (!done_chunk_alloc &&
-                                  space_info->force_alloc ==
-                                  CHUNK_ALLOC_NO_FORCE) {
-                               space_info->force_alloc = CHUNK_ALLOC_LIMITED;
+                       ret = do_chunk_alloc(trans, root, data,
+                                            CHUNK_ALLOC_FORCE);
+                       /*
+                        * Do not bail out on ENOSPC since we
+                        * can do more things.
+                        */
+                       if (ret < 0 && ret != -ENOSPC) {
+                               btrfs_abort_transaction(trans,
+                                                       root, ret);
+                               goto out;
                        }
-
-                      /*
-                       * We didn't allocate a chunk, go ahead and drop the
-                       * empty size and loop again.
-                       */
-                      if (!done_chunk_alloc)
-                              loop = LOOP_NO_EMPTY_SIZE;
                }
 
                if (loop == LOOP_NO_EMPTY_SIZE) {
@@ -5944,20 +5938,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 
        data = btrfs_get_alloc_profile(root, data);
 again:
-       /*
-        * the only place that sets empty_size is btrfs_realloc_node, which
-        * is not called recursively on allocations
-        */
-       if (empty_size || root->ref_cows) {
-               ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                                    num_bytes + 2 * 1024 * 1024, data,
-                                    CHUNK_ALLOC_NO_FORCE);
-               if (ret < 0 && ret != -ENOSPC) {
-                       btrfs_abort_transaction(trans, root, ret);
-                       return ret;
-               }
-       }
-
        WARN_ON(num_bytes < root->sectorsize);
        ret = find_free_extent(trans, root, num_bytes, empty_size,
                               hint_byte, ins, data);
@@ -5967,12 +5947,6 @@ again:
                        num_bytes = num_bytes >> 1;
                        num_bytes = num_bytes & ~(root->sectorsize - 1);
                        num_bytes = max(num_bytes, min_alloc_size);
-                       ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                                      num_bytes, data, CHUNK_ALLOC_FORCE);
-                       if (ret < 0 && ret != -ENOSPC) {
-                               btrfs_abort_transaction(trans, root, ret);
-                               return ret;
-                       }
                        if (num_bytes == min_alloc_size)
                                final_tried = true;
                        goto again;
@@ -6314,7 +6288,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
        ret = block_rsv_use_bytes(block_rsv, blocksize);
        if (!ret)
                return block_rsv;
-       if (ret) {
+       if (ret && !block_rsv->failfast) {
                static DEFINE_RATELIMIT_STATE(_rs,
                                DEFAULT_RATELIMIT_INTERVAL,
                                /*DEFAULT_RATELIMIT_BURST*/ 2);
@@ -7279,7 +7253,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 
        alloc_flags = update_block_group_flags(root, cache->flags);
        if (alloc_flags != cache->flags) {
-               ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+               ret = do_chunk_alloc(trans, root, alloc_flags,
                                     CHUNK_ALLOC_FORCE);
                if (ret < 0)
                        goto out;
@@ -7289,7 +7263,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
        if (!ret)
                goto out;
        alloc_flags = get_alloc_profile(root, cache->space_info->flags);
-       ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+       ret = do_chunk_alloc(trans, root, alloc_flags,
                             CHUNK_ALLOC_FORCE);
        if (ret < 0)
                goto out;
@@ -7303,7 +7277,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root, u64 type)
 {
        u64 alloc_flags = get_alloc_profile(root, type);
-       return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+       return do_chunk_alloc(trans, root, alloc_flags,
                              CHUNK_ALLOC_FORCE);
 }
 
@@ -7810,6 +7784,34 @@ error:
        return ret;
 }
 
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root)
+{
+       struct btrfs_block_group_cache *block_group, *tmp;
+       struct btrfs_root *extent_root = root->fs_info->extent_root;
+       struct btrfs_block_group_item item;
+       struct btrfs_key key;
+       int ret = 0;
+
+       list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
+                                new_bg_list) {
+               list_del_init(&block_group->new_bg_list);
+
+               if (ret)
+                       continue;
+
+               spin_lock(&block_group->lock);
+               memcpy(&item, &block_group->item, sizeof(item));
+               memcpy(&key, &block_group->key, sizeof(key));
+               spin_unlock(&block_group->lock);
+
+               ret = btrfs_insert_item(trans, extent_root, &key, &item,
+                                       sizeof(item));
+               if (ret)
+                       btrfs_abort_transaction(trans, extent_root, ret);
+       }
+}
+
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, u64 bytes_used,
                           u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -7843,6 +7845,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        spin_lock_init(&cache->lock);
        INIT_LIST_HEAD(&cache->list);
        INIT_LIST_HEAD(&cache->cluster_list);
+       INIT_LIST_HEAD(&cache->new_bg_list);
 
        btrfs_init_free_space_ctl(cache);
 
@@ -7874,12 +7877,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        ret = btrfs_add_block_group_cache(root->fs_info, cache);
        BUG_ON(ret); /* Logic error */
 
-       ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
-                               sizeof(cache->item));
-       if (ret) {
-               btrfs_abort_transaction(trans, extent_root, ret);
-               return ret;
-       }
+       list_add_tail(&cache->new_bg_list, &trans->new_bgs);
 
        set_avail_alloc_bits(extent_root->fs_info, type);
 
index b08ea4717e9d70ef7967fe33c1658b669eee16ac..8036d3a848530daed167453ebfb68eb8a83950f3 100644 (file)
@@ -45,6 +45,7 @@ struct extent_page_data {
        struct bio *bio;
        struct extent_io_tree *tree;
        get_extent_t *get_extent;
+       unsigned long bio_flags;
 
        /* tells writepage not to lock the state bits for this range
         * it still does the unlocking
@@ -64,13 +65,13 @@ tree_fs_info(struct extent_io_tree *tree)
 
 int __init extent_io_init(void)
 {
-       extent_state_cache = kmem_cache_create("extent_state",
+       extent_state_cache = kmem_cache_create("btrfs_extent_state",
                        sizeof(struct extent_state), 0,
                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
        if (!extent_state_cache)
                return -ENOMEM;
 
-       extent_buffer_cache = kmem_cache_create("extent_buffers",
+       extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
                        sizeof(struct extent_buffer), 0,
                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
        if (!extent_buffer_cache)
@@ -942,6 +943,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
  * @end:       the end offset in bytes (inclusive)
  * @bits:      the bits to set in this range
  * @clear_bits:        the bits to clear in this range
+ * @cached_state:      state that we're going to cache
  * @mask:      the allocation mask
  *
  * This will go through and set bits for the given range.  If any states exist
@@ -951,7 +953,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
  * boundary bits like LOCK.
  */
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                      int bits, int clear_bits, gfp_t mask)
+                      int bits, int clear_bits,
+                      struct extent_state **cached_state, gfp_t mask)
 {
        struct extent_state *state;
        struct extent_state *prealloc = NULL;
@@ -968,6 +971,15 @@ again:
        }
 
        spin_lock(&tree->lock);
+       if (cached_state && *cached_state) {
+               state = *cached_state;
+               if (state->start <= start && state->end > start &&
+                   state->tree) {
+                       node = &state->rb_node;
+                       goto hit_next;
+               }
+       }
+
        /*
         * this search will find all the extents that end after
         * our range starts.
@@ -998,6 +1010,7 @@ hit_next:
         */
        if (state->start == start && state->end <= end) {
                set_state_bits(tree, state, &bits);
+               cache_state(state, cached_state);
                state = clear_state_bit(tree, state, &clear_bits, 0);
                if (last_end == (u64)-1)
                        goto out;
@@ -1038,6 +1051,7 @@ hit_next:
                        goto out;
                if (state->end <= end) {
                        set_state_bits(tree, state, &bits);
+                       cache_state(state, cached_state);
                        state = clear_state_bit(tree, state, &clear_bits, 0);
                        if (last_end == (u64)-1)
                                goto out;
@@ -1076,6 +1090,7 @@ hit_next:
                                   &bits);
                if (err)
                        extent_io_tree_panic(tree, err);
+               cache_state(prealloc, cached_state);
                prealloc = NULL;
                start = this_end + 1;
                goto search_again;
@@ -1098,6 +1113,7 @@ hit_next:
                        extent_io_tree_panic(tree, err);
 
                set_state_bits(tree, prealloc, &bits);
+               cache_state(prealloc, cached_state);
                clear_state_bit(tree, prealloc, &clear_bits, 0);
                prealloc = NULL;
                goto out;
@@ -1150,6 +1166,14 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
                              NULL, cached_state, mask);
 }
 
+int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
+                     struct extent_state **cached_state, gfp_t mask)
+{
+       return set_extent_bit(tree, start, end,
+                             EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
+                             NULL, cached_state, mask);
+}
+
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
                       gfp_t mask)
 {
@@ -1294,18 +1318,42 @@ out:
  * If nothing was found, 1 is returned. If found something, return 0.
  */
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-                         u64 *start_ret, u64 *end_ret, int bits)
+                         u64 *start_ret, u64 *end_ret, int bits,
+                         struct extent_state **cached_state)
 {
        struct extent_state *state;
+       struct rb_node *n;
        int ret = 1;
 
        spin_lock(&tree->lock);
+       if (cached_state && *cached_state) {
+               state = *cached_state;
+               if (state->end == start - 1 && state->tree) {
+                       n = rb_next(&state->rb_node);
+                       while (n) {
+                               state = rb_entry(n, struct extent_state,
+                                                rb_node);
+                               if (state->state & bits)
+                                       goto got_it;
+                               n = rb_next(n);
+                       }
+                       free_extent_state(*cached_state);
+                       *cached_state = NULL;
+                       goto out;
+               }
+               free_extent_state(*cached_state);
+               *cached_state = NULL;
+       }
+
        state = find_first_extent_bit_state(tree, start, bits);
+got_it:
        if (state) {
+               cache_state(state, cached_state);
                *start_ret = state->start;
                *end_ret = state->end;
                ret = 0;
        }
+out:
        spin_unlock(&tree->lock);
        return ret;
 }
@@ -2068,7 +2116,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
                }
                read_unlock(&em_tree->lock);
 
-               if (!em || IS_ERR(em)) {
+               if (!em) {
                        kfree(failrec);
                        return -EIO;
                }
@@ -2304,8 +2352,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                struct extent_state *cached = NULL;
                struct extent_state *state;
 
-               pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
-                        "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
+               pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
+                        "mirror=%ld\n", (u64)bio->bi_sector, err,
                         (long int)bio->bi_bdev);
                tree = &BTRFS_I(page->mapping->host)->io_tree;
 
@@ -2709,12 +2757,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                                         end_bio_extent_readpage, mirror_num,
                                         *bio_flags,
                                         this_bio_flag);
-                       BUG_ON(ret == -ENOMEM);
-                       nr++;
-                       *bio_flags = this_bio_flag;
+                       if (!ret) {
+                               nr++;
+                               *bio_flags = this_bio_flag;
+                       }
                }
-               if (ret)
+               if (ret) {
                        SetPageError(page);
+                       unlock_extent(tree, cur, cur + iosize - 1);
+               }
                cur = cur + iosize;
                pg_offset += iosize;
        }
@@ -3161,12 +3212,16 @@ static int write_one_eb(struct extent_buffer *eb,
        struct block_device *bdev = fs_info->fs_devices->latest_bdev;
        u64 offset = eb->start;
        unsigned long i, num_pages;
+       unsigned long bio_flags = 0;
        int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
        int ret = 0;
 
        clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
        num_pages = num_extent_pages(eb->start, eb->len);
        atomic_set(&eb->io_pages, num_pages);
+       if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
+               bio_flags = EXTENT_BIO_TREE_LOG;
+
        for (i = 0; i < num_pages; i++) {
                struct page *p = extent_buffer_page(eb, i);
 
@@ -3175,7 +3230,8 @@ static int write_one_eb(struct extent_buffer *eb,
                ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
                                         PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
                                         -1, end_bio_extent_buffer_writepage,
-                                        0, 0, 0);
+                                        0, epd->bio_flags, bio_flags);
+               epd->bio_flags = bio_flags;
                if (ret) {
                        set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
                        SetPageError(p);
@@ -3210,6 +3266,7 @@ int btree_write_cache_pages(struct address_space *mapping,
                .tree = tree,
                .extent_locked = 0,
                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+               .bio_flags = 0,
        };
        int ret = 0;
        int done = 0;
@@ -3254,19 +3311,34 @@ retry:
                                break;
                        }
 
+                       spin_lock(&mapping->private_lock);
+                       if (!PagePrivate(page)) {
+                               spin_unlock(&mapping->private_lock);
+                               continue;
+                       }
+
                        eb = (struct extent_buffer *)page->private;
+
+                       /*
+                        * Shouldn't happen and normally this would be a BUG_ON
+                        * but no sense in crashing the user's box for something
+                        * we can survive anyway.
+                        */
                        if (!eb) {
+                               spin_unlock(&mapping->private_lock);
                                WARN_ON(1);
                                continue;
                        }
 
-                       if (eb == prev_eb)
+                       if (eb == prev_eb) {
+                               spin_unlock(&mapping->private_lock);
                                continue;
+                       }
 
-                       if (!atomic_inc_not_zero(&eb->refs)) {
-                               WARN_ON(1);
+                       ret = atomic_inc_not_zero(&eb->refs);
+                       spin_unlock(&mapping->private_lock);
+                       if (!ret)
                                continue;
-                       }
 
                        prev_eb = eb;
                        ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
@@ -3457,7 +3529,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
                if (epd->sync_io)
                        rw = WRITE_SYNC;
 
-               ret = submit_one_bio(rw, epd->bio, 0, 0);
+               ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
                BUG_ON(ret < 0); /* -ENOMEM */
                epd->bio = NULL;
        }
@@ -3480,6 +3552,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
                .get_extent = get_extent,
                .extent_locked = 0,
                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+               .bio_flags = 0,
        };
 
        ret = __extent_writepage(page, wbc, &epd);
@@ -3504,6 +3577,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
                .get_extent = get_extent,
                .extent_locked = 1,
                .sync_io = mode == WB_SYNC_ALL,
+               .bio_flags = 0,
        };
        struct writeback_control wbc_writepages = {
                .sync_mode      = mode,
@@ -3543,6 +3617,7 @@ int extent_writepages(struct extent_io_tree *tree,
                .get_extent = get_extent,
                .extent_locked = 0,
                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+               .bio_flags = 0,
        };
 
        ret = extent_write_cache_pages(tree, mapping, wbc,
@@ -3920,18 +3995,6 @@ out:
        return ret;
 }
 
-inline struct page *extent_buffer_page(struct extent_buffer *eb,
-                                             unsigned long i)
-{
-       return eb->pages[i];
-}
-
-inline unsigned long num_extent_pages(u64 start, u64 len)
-{
-       return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
-               (start >> PAGE_CACHE_SHIFT);
-}
-
 static void __free_extent_buffer(struct extent_buffer *eb)
 {
 #if LEAK_DEBUG
@@ -4047,7 +4110,7 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
 
        return eb;
 err:
-       for (i--; i > 0; i--)
+       for (i--; i >= 0; i--)
                __free_page(eb->pages[i]);
        __free_extent_buffer(eb);
        return NULL;
@@ -4192,10 +4255,8 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 
        for (i = 0; i < num_pages; i++, index++) {
                p = find_or_create_page(mapping, index, GFP_NOFS);
-               if (!p) {
-                       WARN_ON(1);
+               if (!p)
                        goto free_eb;
-               }
 
                spin_lock(&mapping->private_lock);
                if (PagePrivate(p)) {
@@ -4338,7 +4399,6 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
 
                /* Should be safe to release our pages at this point */
                btrfs_release_extent_buffer_page(eb, 0);
-
                call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
                return 1;
        }
index 25900af5b15d43e6bdfe0cef7c865ac2aa81bd36..711d12b80028b701033d7a326b28ed18b43e37ff 100644 (file)
@@ -27,6 +27,7 @@
  * type for this bio
  */
 #define EXTENT_BIO_COMPRESSED 1
+#define EXTENT_BIO_TREE_LOG 2
 #define EXTENT_BIO_FLAG_SHIFT 16
 
 /* these are bit numbers for test/set bit */
@@ -232,11 +233,15 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
                       gfp_t mask);
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                      int bits, int clear_bits, gfp_t mask);
+                      int bits, int clear_bits,
+                      struct extent_state **cached_state, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
                        struct extent_state **cached_state, gfp_t mask);
+int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
+                     struct extent_state **cached_state, gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-                         u64 *start_ret, u64 *end_ret, int bits);
+                         u64 *start_ret, u64 *end_ret, int bits,
+                         struct extent_state **cached_state);
 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
                                                 u64 start, int bits);
 int extent_invalidatepage(struct extent_io_tree *tree,
@@ -277,8 +282,18 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
 int read_extent_buffer_pages(struct extent_io_tree *tree,
                             struct extent_buffer *eb, u64 start, int wait,
                             get_extent_t *get_extent, int mirror_num);
-unsigned long num_extent_pages(u64 start, u64 len);
-struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
+
+static inline unsigned long num_extent_pages(u64 start, u64 len)
+{
+       return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+               (start >> PAGE_CACHE_SHIFT);
+}
+
+static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+                                             unsigned long i)
+{
+       return eb->pages[i];
+}
 
 static inline void extent_buffer_get(struct extent_buffer *eb)
 {
index 7c97b330145981d241cec9f13a75b3fa1869f8db..b8cbc8d5c7f7cb39ed770fb2b256427f25bb9c69 100644 (file)
@@ -11,7 +11,7 @@ static struct kmem_cache *extent_map_cache;
 
 int __init extent_map_init(void)
 {
-       extent_map_cache = kmem_cache_create("extent_map",
+       extent_map_cache = kmem_cache_create("btrfs_extent_map",
                        sizeof(struct extent_map), 0,
                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
        if (!extent_map_cache)
@@ -35,6 +35,7 @@ void extent_map_exit(void)
 void extent_map_tree_init(struct extent_map_tree *tree)
 {
        tree->map = RB_ROOT;
+       INIT_LIST_HEAD(&tree->modified_extents);
        rwlock_init(&tree->lock);
 }
 
@@ -54,7 +55,9 @@ struct extent_map *alloc_extent_map(void)
        em->in_tree = 0;
        em->flags = 0;
        em->compress_type = BTRFS_COMPRESS_NONE;
+       em->generation = 0;
        atomic_set(&em->refs, 1);
+       INIT_LIST_HEAD(&em->list);
        return em;
 }
 
@@ -72,6 +75,7 @@ void free_extent_map(struct extent_map *em)
        WARN_ON(atomic_read(&em->refs) == 0);
        if (atomic_dec_and_test(&em->refs)) {
                WARN_ON(em->in_tree);
+               WARN_ON(!list_empty(&em->list));
                kmem_cache_free(extent_map_cache, em);
        }
 }
@@ -198,6 +202,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
                        em->block_len += merge->block_len;
                        em->block_start = merge->block_start;
                        merge->in_tree = 0;
+                       if (merge->generation > em->generation) {
+                               em->mod_start = em->start;
+                               em->mod_len = em->len;
+                               em->generation = merge->generation;
+                               list_move(&em->list, &tree->modified_extents);
+                       }
+
+                       list_del_init(&merge->list);
                        rb_erase(&merge->rb_node, &tree->map);
                        free_extent_map(merge);
                }
@@ -211,14 +223,34 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
                em->block_len += merge->len;
                rb_erase(&merge->rb_node, &tree->map);
                merge->in_tree = 0;
+               if (merge->generation > em->generation) {
+                       em->mod_len = em->len;
+                       em->generation = merge->generation;
+                       list_move(&em->list, &tree->modified_extents);
+               }
+               list_del_init(&merge->list);
                free_extent_map(merge);
        }
 }
 
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+/**
+ * unpin_extent_cache - unpin an extent from the cache
+ * @tree:      tree to unpin the extent in
+ * @start:     logical offset in the file
+ * @len:       length of the extent
+ * @gen:       generation that this extent has been modified in
+ *
+ * Called after an extent has been written to disk properly.  Set the generation
+ * to the generation that actually added the file item to the inode so we know
+ * we need to sync this extent when we call fsync().  If the extent has the
+ * prealloc flag set, that flag is cleared here.
+ */
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
+                      u64 gen)
 {
        int ret = 0;
        struct extent_map *em;
+       bool prealloc = false;
 
        write_lock(&tree->lock);
        em = lookup_extent_mapping(tree, start, len);
@@ -228,10 +260,24 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
        if (!em)
                goto out;
 
+       list_move(&em->list, &tree->modified_extents);
+       em->generation = gen;
        clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+       em->mod_start = em->start;
+       em->mod_len = em->len;
+
+       if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+               prealloc = true;
+               clear_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+       }
 
        try_merge_map(tree, em);
 
+       if (prealloc) {
+               em->mod_start = em->start;
+               em->mod_len = em->len;
+       }
+
        free_extent_map(em);
 out:
        write_unlock(&tree->lock);
@@ -269,6 +315,9 @@ int add_extent_mapping(struct extent_map_tree *tree,
        }
        atomic_inc(&em->refs);
 
+       em->mod_start = em->start;
+       em->mod_len = em->len;
+
        try_merge_map(tree, em);
 out:
        return ret;
@@ -358,6 +407,8 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 
        WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
        rb_erase(&em->rb_node, &tree->map);
+       if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+               list_del_init(&em->list);
        em->in_tree = 0;
        return ret;
 }
index 1195f09761fedd2e5c109b74d3000c2973bc75f4..679225555f7b597b91012e5a47434a72e1774c91 100644 (file)
@@ -13,6 +13,7 @@
 #define EXTENT_FLAG_COMPRESSED 1
 #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
 #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
+#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
 
 struct extent_map {
        struct rb_node rb_node;
@@ -20,18 +21,23 @@ struct extent_map {
        /* all of these are in bytes */
        u64 start;
        u64 len;
+       u64 mod_start;
+       u64 mod_len;
        u64 orig_start;
        u64 block_start;
        u64 block_len;
+       u64 generation;
        unsigned long flags;
        struct block_device *bdev;
        atomic_t refs;
        unsigned int in_tree;
        unsigned int compress_type;
+       struct list_head list;
 };
 
 struct extent_map_tree {
        struct rb_root map;
+       struct list_head modified_extents;
        rwlock_t lock;
 };
 
@@ -60,7 +66,7 @@ struct extent_map *alloc_extent_map(void);
 void free_extent_map(struct extent_map *em);
 int __init extent_map_init(void);
 void extent_map_exit(void);
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
 struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
                                         u64 start, u64 len);
 #endif
index 857d93cd01dc579eb46838624349442fef07cf00..1ad08e4e4a15fa25ea7f65397cbc1e154fa1f4b9 100644 (file)
 #include "transaction.h"
 #include "print-tree.h"
 
-#define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
+#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
                                   sizeof(struct btrfs_item) * 2) / \
                                  size) - 1))
 
-#define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE))
+#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
+                                      PAGE_CACHE_SIZE))
 
 #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
                                   sizeof(struct btrfs_ordered_sum)) / \
index f6b40e86121b007bf06954268254bcaae0fe7bf5..9ab1bed88116bd8f412f095aaef231f5bb749baf 100644 (file)
@@ -39,6 +39,7 @@
 #include "tree-log.h"
 #include "locking.h"
 #include "compat.h"
+#include "volumes.h"
 
 /*
  * when auto defrag is enabled we
@@ -458,14 +459,15 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
  * this drops all the extents in the cache that intersect the range
  * [start, end].  Existing extents are split as required.
  */
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
-                           int skip_pinned)
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+                            int skip_pinned)
 {
        struct extent_map *em;
        struct extent_map *split = NULL;
        struct extent_map *split2 = NULL;
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        u64 len = end - start + 1;
+       u64 gen;
        int ret;
        int testend = 1;
        unsigned long flags;
@@ -477,11 +479,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                testend = 0;
        }
        while (1) {
+               int no_splits = 0;
+
                if (!split)
                        split = alloc_extent_map();
                if (!split2)
                        split2 = alloc_extent_map();
-               BUG_ON(!split || !split2); /* -ENOMEM */
+               if (!split || !split2)
+                       no_splits = 1;
 
                write_lock(&em_tree->lock);
                em = lookup_extent_mapping(em_tree, start, len);
@@ -490,6 +495,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                        break;
                }
                flags = em->flags;
+               gen = em->generation;
                if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
                        if (testend && em->start + em->len >= start + len) {
                                free_extent_map(em);
@@ -506,6 +512,8 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
                clear_bit(EXTENT_FLAG_PINNED, &em->flags);
                remove_extent_mapping(em_tree, em);
+               if (no_splits)
+                       goto next;
 
                if (em->block_start < EXTENT_MAP_LAST_BYTE &&
                    em->start < start) {
@@ -518,12 +526,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                                split->block_len = em->block_len;
                        else
                                split->block_len = split->len;
-
+                       split->generation = gen;
                        split->bdev = em->bdev;
                        split->flags = flags;
                        split->compress_type = em->compress_type;
                        ret = add_extent_mapping(em_tree, split);
                        BUG_ON(ret); /* Logic error */
+                       list_move(&split->list, &em_tree->modified_extents);
                        free_extent_map(split);
                        split = split2;
                        split2 = NULL;
@@ -537,6 +546,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                        split->bdev = em->bdev;
                        split->flags = flags;
                        split->compress_type = em->compress_type;
+                       split->generation = gen;
 
                        if (compressed) {
                                split->block_len = em->block_len;
@@ -550,9 +560,11 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 
                        ret = add_extent_mapping(em_tree, split);
                        BUG_ON(ret); /* Logic error */
+                       list_move(&split->list, &em_tree->modified_extents);
                        free_extent_map(split);
                        split = NULL;
                }
+next:
                write_unlock(&em_tree->lock);
 
                /* once for us */
@@ -564,7 +576,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                free_extent_map(split);
        if (split2)
                free_extent_map(split2);
-       return 0;
 }
 
 /*
@@ -576,13 +587,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
  * it is either truncated or split.  Anything entirely inside the range
  * is deleted from the tree.
  */
-int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
-                      u64 start, u64 end, u64 *hint_byte, int drop_cache)
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+                        struct btrfs_root *root, struct inode *inode,
+                        struct btrfs_path *path, u64 start, u64 end,
+                        u64 *drop_end, int drop_cache)
 {
-       struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *fi;
-       struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_key new_key;
        u64 ino = btrfs_ino(inode);
@@ -597,14 +608,12 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
        int recow;
        int ret;
        int modify_tree = -1;
+       int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
+       int found = 0;
 
        if (drop_cache)
                btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-
        if (start >= BTRFS_I(inode)->disk_i_size)
                modify_tree = 0;
 
@@ -666,6 +675,7 @@ next_slot:
                        goto next_slot;
                }
 
+               found = 1;
                search_start = max(key.offset, start);
                if (recow || !modify_tree) {
                        modify_tree = -1;
@@ -707,14 +717,13 @@ next_slot:
                                                        extent_end - start);
                        btrfs_mark_buffer_dirty(leaf);
 
-                       if (disk_bytenr > 0) {
+                       if (update_refs && disk_bytenr > 0) {
                                ret = btrfs_inc_extent_ref(trans, root,
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                new_key.objectid,
                                                start - extent_offset, 0);
                                BUG_ON(ret); /* -ENOMEM */
-                               *hint_byte = disk_bytenr;
                        }
                        key.offset = start;
                }
@@ -734,10 +743,8 @@ next_slot:
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        extent_end - end);
                        btrfs_mark_buffer_dirty(leaf);
-                       if (disk_bytenr > 0) {
+                       if (update_refs && disk_bytenr > 0)
                                inode_sub_bytes(inode, end - key.offset);
-                               *hint_byte = disk_bytenr;
-                       }
                        break;
                }
 
@@ -753,10 +760,8 @@ next_slot:
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        start - key.offset);
                        btrfs_mark_buffer_dirty(leaf);
-                       if (disk_bytenr > 0) {
+                       if (update_refs && disk_bytenr > 0)
                                inode_sub_bytes(inode, extent_end - start);
-                               *hint_byte = disk_bytenr;
-                       }
                        if (end == extent_end)
                                break;
 
@@ -777,12 +782,13 @@ next_slot:
                                del_nr++;
                        }
 
-                       if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+                       if (update_refs &&
+                           extent_type == BTRFS_FILE_EXTENT_INLINE) {
                                inode_sub_bytes(inode,
                                                extent_end - key.offset);
                                extent_end = ALIGN(extent_end,
                                                   root->sectorsize);
-                       } else if (disk_bytenr > 0) {
+                       } else if (update_refs && disk_bytenr > 0) {
                                ret = btrfs_free_extent(trans, root,
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
@@ -791,7 +797,6 @@ next_slot:
                                BUG_ON(ret); /* -ENOMEM */
                                inode_sub_bytes(inode,
                                                extent_end - key.offset);
-                               *hint_byte = disk_bytenr;
                        }
 
                        if (end == extent_end)
@@ -806,7 +811,7 @@ next_slot:
                                              del_nr);
                        if (ret) {
                                btrfs_abort_transaction(trans, root, ret);
-                               goto out;
+                               break;
                        }
 
                        del_nr = 0;
@@ -825,7 +830,24 @@ next_slot:
                        btrfs_abort_transaction(trans, root, ret);
        }
 
-out:
+       if (drop_end)
+               *drop_end = found ? min(end, extent_end) : end;
+       btrfs_release_path(path);
+       return ret;
+}
+
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root, struct inode *inode, u64 start,
+                      u64 end, int drop_cache)
+{
+       struct btrfs_path *path;
+       int ret;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+       ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
+                                  drop_cache);
        btrfs_free_path(path);
        return ret;
 }
@@ -892,8 +914,6 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
        int ret;
        u64 ino = btrfs_ino(inode);
 
-       btrfs_drop_extent_cache(inode, start, end - 1, 0);
-
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
@@ -935,12 +955,16 @@ again:
                        btrfs_set_item_key_safe(trans, root, path, &new_key);
                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);
+                       btrfs_set_file_extent_generation(leaf, fi,
+                                                        trans->transid);
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        extent_end - end);
                        btrfs_set_file_extent_offset(leaf, fi,
                                                     end - orig_offset);
                        fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
                                            struct btrfs_file_extent_item);
+                       btrfs_set_file_extent_generation(leaf, fi,
+                                                        trans->transid);
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        end - other_start);
                        btrfs_mark_buffer_dirty(leaf);
@@ -958,12 +982,16 @@ again:
                                            struct btrfs_file_extent_item);
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        start - key.offset);
+                       btrfs_set_file_extent_generation(leaf, fi,
+                                                        trans->transid);
                        path->slots[0]++;
                        new_key.offset = start;
                        btrfs_set_item_key_safe(trans, root, path, &new_key);
 
                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);
+                       btrfs_set_file_extent_generation(leaf, fi,
+                                                        trans->transid);
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        other_end - start);
                        btrfs_set_file_extent_offset(leaf, fi,
@@ -991,12 +1019,14 @@ again:
                leaf = path->nodes[0];
                fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
                                    struct btrfs_file_extent_item);
+               btrfs_set_file_extent_generation(leaf, fi, trans->transid);
                btrfs_set_file_extent_num_bytes(leaf, fi,
                                                split - key.offset);
 
                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
 
+               btrfs_set_file_extent_generation(leaf, fi, trans->transid);
                btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
                btrfs_set_file_extent_num_bytes(leaf, fi,
                                                extent_end - split);
@@ -1056,12 +1086,14 @@ again:
                           struct btrfs_file_extent_item);
                btrfs_set_file_extent_type(leaf, fi,
                                           BTRFS_FILE_EXTENT_REG);
+               btrfs_set_file_extent_generation(leaf, fi, trans->transid);
                btrfs_mark_buffer_dirty(leaf);
        } else {
                fi = btrfs_item_ptr(leaf, del_slot - 1,
                           struct btrfs_file_extent_item);
                btrfs_set_file_extent_type(leaf, fi,
                                           BTRFS_FILE_EXTENT_REG);
+               btrfs_set_file_extent_generation(leaf, fi, trans->transid);
                btrfs_set_file_extent_num_bytes(leaf, fi,
                                                extent_end - key.offset);
                btrfs_mark_buffer_dirty(leaf);
@@ -1173,8 +1205,8 @@ again:
 
                clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
                                  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
-                                 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
-                                 GFP_NOFS);
+                                 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+                                 0, 0, &cached_state, GFP_NOFS);
                unlock_extent_cached(&BTRFS_I(inode)->io_tree,
                                     start_pos, last_pos - 1, &cached_state,
                                     GFP_NOFS);
@@ -1514,16 +1546,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
        trace_btrfs_sync_file(file, datasync);
 
+       /*
+        * Write out the dirty pages in the range and wait for them to complete
+        * before taking ->i_mutex. This lets multiple tasks flush dirty pages
+        * concurrently, which improves performance.
+        */
+       ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+       if (ret)
+               return ret;
+
        mutex_lock(&inode->i_mutex);
 
        /*
-        * we wait first, since the writeback may change the inode, also wait
-        * ordered range does a filemape_write_and_wait_range which is why we
-        * don't do it above like other file systems.
+        * We flush the dirty pages again to avoid some dirty pages in the
+        * range being left.
         */
-       root->log_batch++;
+       atomic_inc(&root->log_batch);
        btrfs_wait_ordered_range(inode, start, end);
-       root->log_batch++;
+       atomic_inc(&root->log_batch);
 
        /*
         * check the transaction that last modified this inode
@@ -1544,6 +1584,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
            BTRFS_I(inode)->last_trans <=
            root->fs_info->last_trans_committed) {
                BTRFS_I(inode)->last_trans = 0;
+
+               /*
+                * We've had everything committed since the last time we were
+                * modified so clear this flag in case it was set for whatever
+                * reason, it's no longer relevant.
+                */
+               clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                         &BTRFS_I(inode)->runtime_flags);
                mutex_unlock(&inode->i_mutex);
                goto out;
        }
@@ -1615,6 +1663,324 @@ static int btrfs_file_mmap(struct file  *filp, struct vm_area_struct *vma)
        return 0;
 }
 
+static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
+                         int slot, u64 start, u64 end)
+{
+       struct btrfs_file_extent_item *fi;
+       struct btrfs_key key;
+
+       if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+               return 0;
+
+       btrfs_item_key_to_cpu(leaf, &key, slot);
+       if (key.objectid != btrfs_ino(inode) ||
+           key.type != BTRFS_EXTENT_DATA_KEY)
+               return 0;
+
+       fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+       if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+               return 0;
+
+       if (btrfs_file_extent_disk_bytenr(leaf, fi))
+               return 0;
+
+       if (key.offset == end)
+               return 1;
+       if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
+               return 1;
+       return 0;
+}
+
+static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
+                     struct btrfs_path *path, u64 offset, u64 end)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct extent_buffer *leaf;
+       struct btrfs_file_extent_item *fi;
+       struct extent_map *hole_em;
+       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+       struct btrfs_key key;
+       int ret;
+
+       key.objectid = btrfs_ino(inode);
+       key.type = BTRFS_EXTENT_DATA_KEY;
+       key.offset = offset;
+
+
+       ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+       if (ret < 0)
+               return ret;
+       BUG_ON(!ret);
+
+       leaf = path->nodes[0];
+       if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
+               u64 num_bytes;
+
+               path->slots[0]--;
+               fi = btrfs_item_ptr(leaf, path->slots[0],
+                                   struct btrfs_file_extent_item);
+               num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
+                       end - offset;
+               btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+               btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+               btrfs_set_file_extent_offset(leaf, fi, 0);
+               btrfs_mark_buffer_dirty(leaf);
+               goto out;
+       }
+
+       if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
+               u64 num_bytes;
+
+               path->slots[0]++;
+               key.offset = offset;
+               btrfs_set_item_key_safe(trans, root, path, &key);
+               fi = btrfs_item_ptr(leaf, path->slots[0],
+                                   struct btrfs_file_extent_item);
+               num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
+                       offset;
+               btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+               btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+               btrfs_set_file_extent_offset(leaf, fi, 0);
+               btrfs_mark_buffer_dirty(leaf);
+               goto out;
+       }
+       btrfs_release_path(path);
+
+       ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+                                      0, 0, end - offset, 0, end - offset,
+                                      0, 0, 0);
+       if (ret)
+               return ret;
+
+out:
+       btrfs_release_path(path);
+
+       hole_em = alloc_extent_map();
+       if (!hole_em) {
+               btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+               set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                       &BTRFS_I(inode)->runtime_flags);
+       } else {
+               hole_em->start = offset;
+               hole_em->len = end - offset;
+               hole_em->orig_start = offset;
+
+               hole_em->block_start = EXTENT_MAP_HOLE;
+               hole_em->block_len = 0;
+               hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+               hole_em->compress_type = BTRFS_COMPRESS_NONE;
+               hole_em->generation = trans->transid;
+
+               do {
+                       btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+                       write_lock(&em_tree->lock);
+                       ret = add_extent_mapping(em_tree, hole_em);
+                       if (!ret)
+                               list_move(&hole_em->list,
+                                         &em_tree->modified_extents);
+                       write_unlock(&em_tree->lock);
+               } while (ret == -EEXIST);
+               free_extent_map(hole_em);
+               if (ret)
+                       set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                               &BTRFS_I(inode)->runtime_flags);
+       }
+
+       return 0;
+}
+
+static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct extent_state *cached_state = NULL;
+       struct btrfs_path *path;
+       struct btrfs_block_rsv *rsv;
+       struct btrfs_trans_handle *trans;
+       u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+       u64 lockstart = (offset + mask) & ~mask;
+       u64 lockend = ((offset + len) & ~mask) - 1;
+       u64 cur_offset = lockstart;
+       u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+       u64 drop_end;
+       unsigned long nr;
+       int ret = 0;
+       int err = 0;
+       bool same_page = (offset >> PAGE_CACHE_SHIFT) ==
+               ((offset + len) >> PAGE_CACHE_SHIFT);
+
+       btrfs_wait_ordered_range(inode, offset, len);
+
+       mutex_lock(&inode->i_mutex);
+       if (offset >= inode->i_size) {
+               mutex_unlock(&inode->i_mutex);
+               return 0;
+       }
+
+       /*
+        * Only do this if we are in the same page and we aren't doing the
+        * entire page.
+        */
+       if (same_page && len < PAGE_CACHE_SIZE) {
+               ret = btrfs_truncate_page(inode, offset, len, 0);
+               mutex_unlock(&inode->i_mutex);
+               return ret;
+       }
+
+       /* zero back part of the first page */
+       ret = btrfs_truncate_page(inode, offset, 0, 0);
+       if (ret) {
+               mutex_unlock(&inode->i_mutex);
+               return ret;
+       }
+
+       /* zero the front end of the last page */
+       ret = btrfs_truncate_page(inode, offset + len, 0, 1);
+       if (ret) {
+               mutex_unlock(&inode->i_mutex);
+               return ret;
+       }
+
+       if (lockend < lockstart) {
+               mutex_unlock(&inode->i_mutex);
+               return 0;
+       }
+
+       while (1) {
+               struct btrfs_ordered_extent *ordered;
+
+               truncate_pagecache_range(inode, lockstart, lockend);
+
+               lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                0, &cached_state);
+               ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+               /*
+                * We need to make sure we have no ordered extents in this range
+                * and that nobody raced in and read a page in this range; if
+                * either happened we need to try again.
+                */
+               if ((!ordered ||
+                   (ordered->file_offset + ordered->len < lockstart ||
+                    ordered->file_offset > lockend)) &&
+                    !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                    lockend, EXTENT_UPTODATE, 0,
+                                    cached_state)) {
+                       if (ordered)
+                               btrfs_put_ordered_extent(ordered);
+                       break;
+               }
+               if (ordered)
+                       btrfs_put_ordered_extent(ordered);
+               unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+                                    lockend, &cached_state, GFP_NOFS);
+               btrfs_wait_ordered_range(inode, lockstart,
+                                        lockend - lockstart + 1);
+       }
+
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+       if (!rsv) {
+               ret = -ENOMEM;
+               goto out_free;
+       }
+       rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
+       rsv->failfast = 1;
+
+       /*
+        * 1 - update the inode
+        * 1 - removing the extents in the range
+        * 1 - adding the hole extent
+        */
+       trans = btrfs_start_transaction(root, 3);
+       if (IS_ERR(trans)) {
+               err = PTR_ERR(trans);
+               goto out_free;
+       }
+
+       ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
+                                     min_size);
+       BUG_ON(ret);
+       trans->block_rsv = rsv;
+
+       while (cur_offset < lockend) {
+               ret = __btrfs_drop_extents(trans, root, inode, path,
+                                          cur_offset, lockend + 1,
+                                          &drop_end, 1);
+               if (ret != -ENOSPC)
+                       break;
+
+               trans->block_rsv = &root->fs_info->trans_block_rsv;
+
+               ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+               if (ret) {
+                       err = ret;
+                       break;
+               }
+
+               cur_offset = drop_end;
+
+               ret = btrfs_update_inode(trans, root, inode);
+               if (ret) {
+                       err = ret;
+                       break;
+               }
+
+               nr = trans->blocks_used;
+               btrfs_end_transaction(trans, root);
+               btrfs_btree_balance_dirty(root, nr);
+
+               trans = btrfs_start_transaction(root, 3);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       trans = NULL;
+                       break;
+               }
+
+               ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+                                             rsv, min_size);
+               BUG_ON(ret);    /* shouldn't happen */
+               trans->block_rsv = rsv;
+       }
+
+       if (ret) {
+               err = ret;
+               goto out_trans;
+       }
+
+       trans->block_rsv = &root->fs_info->trans_block_rsv;
+       ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+       if (ret) {
+               err = ret;
+               goto out_trans;
+       }
+
+out_trans:
+       if (!trans)
+               goto out_free;
+
+       trans->block_rsv = &root->fs_info->trans_block_rsv;
+       ret = btrfs_update_inode(trans, root, inode);
+       nr = trans->blocks_used;
+       btrfs_end_transaction(trans, root);
+       btrfs_btree_balance_dirty(root, nr);
+out_free:
+       btrfs_free_path(path);
+       btrfs_free_block_rsv(root, rsv);
+out:
+       unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                            &cached_state, GFP_NOFS);
+       mutex_unlock(&inode->i_mutex);
+       if (ret && !err)
+               err = ret;
+       return err;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
                            loff_t offset, loff_t len)
 {
@@ -1633,15 +1999,18 @@ static long btrfs_fallocate(struct file *file, int mode,
        alloc_start = offset & ~mask;
        alloc_end =  (offset + len + mask) & ~mask;
 
-       /* We only support the FALLOC_FL_KEEP_SIZE mode */
-       if (mode & ~FALLOC_FL_KEEP_SIZE)
+       /* Make sure we aren't being given an unsupported mode */
+       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
                return -EOPNOTSUPP;
 
+       if (mode & FALLOC_FL_PUNCH_HOLE)
+               return btrfs_punch_hole(inode, offset, len);
+
        /*
         * Make sure we have enough space before we do the
         * allocation.
         */
-       ret = btrfs_check_data_free_space(inode, len);
+       ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1);
        if (ret)
                return ret;
 
@@ -1748,7 +2117,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 out:
        mutex_unlock(&inode->i_mutex);
        /* Let go of our reservation. */
-       btrfs_free_reserved_data_space(inode, len);
+       btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1);
        return ret;
 }
 
index 6b10acfc2f5cae609fca043caba43f0d57b60dd2..1027b854b90cec02b9d2328804bc12f23f9bc00b 100644 (file)
@@ -966,7 +966,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
                               block_group->key.offset)) {
                ret = find_first_extent_bit(unpin, start,
                                            &extent_start, &extent_end,
-                                           EXTENT_DIRTY);
+                                           EXTENT_DIRTY, NULL);
                if (ret) {
                        ret = 0;
                        break;
@@ -1454,9 +1454,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
                          max_t(u64, *offset, bitmap_info->offset));
        bits = bytes_to_bits(*bytes, ctl->unit);
 
-       for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
-            i < BITS_PER_BITMAP;
-            i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
+       for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
                next_zero = find_next_zero_bit(bitmap_info->bitmap,
                                               BITS_PER_BITMAP, i);
                if ((next_zero - i) >= bits) {
@@ -2307,9 +2305,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
 
 again:
        found_bits = 0;
-       for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
-            i < BITS_PER_BITMAP;
-            i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
+       for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
                next_zero = find_next_zero_bit(entry->bitmap,
                                               BITS_PER_BITMAP, i);
                if (next_zero - i >= min_bits) {
index db2ff9773b99d02300e84f4bec207f59c65e73ef..1d982812ab6761077673985dea55970f418cedf7 100644 (file)
@@ -24,4 +24,14 @@ static inline u64 btrfs_name_hash(const char *name, int len)
 {
        return crc32c((u32)~1, name, len);
 }
+
+/*
+ * Figure the key offset of an extended inode ref
+ */
+static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
+                                   int len)
+{
+       return (u64) crc32c(parent_objectid, name, len);
+}
+
 #endif
index a13cf1a96c73ca00f4baa453f864029c03525fe4..48b8fda93132b799b695efc4f1bd9e59cca1f064 100644 (file)
@@ -18,6 +18,7 @@
 
 #include "ctree.h"
 #include "disk-io.h"
+#include "hash.h"
 #include "transaction.h"
 #include "print-tree.h"
 
@@ -50,18 +51,57 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
        return 0;
 }
 
-struct btrfs_inode_ref *
+int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
+                                  const char *name, int name_len,
+                                  struct btrfs_inode_extref **extref_ret)
+{
+       struct extent_buffer *leaf;
+       struct btrfs_inode_extref *extref;
+       unsigned long ptr;
+       unsigned long name_ptr;
+       u32 item_size;
+       u32 cur_offset = 0;
+       int ref_name_len;
+
+       leaf = path->nodes[0];
+       item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+       ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+       /*
+        * Search all extended backrefs in this item. We're only
+        * looking through any collisions so most of the time this is
+        * just going to compare against one buffer. If all is well,
+        * we'll return success and the inode ref object.
+        */
+       while (cur_offset < item_size) {
+               extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
+               name_ptr = (unsigned long)(&extref->name);
+               ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+               if (ref_name_len == name_len &&
+                   btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
+                   (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) {
+                       if (extref_ret)
+                               *extref_ret = extref;
+                       return 1;
+               }
+
+               cur_offset += ref_name_len + sizeof(*extref);
+       }
+       return 0;
+}
+
+static struct btrfs_inode_ref *
 btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
-                       struct btrfs_root *root,
-                       struct btrfs_path *path,
-                       const char *name, int name_len,
-                       u64 inode_objectid, u64 ref_objectid, int mod)
+                      struct btrfs_root *root,
+                      struct btrfs_path *path,
+                      const char *name, int name_len,
+                      u64 inode_objectid, u64 ref_objectid, int ins_len,
+                      int cow)
 {
+       int ret;
        struct btrfs_key key;
        struct btrfs_inode_ref *ref;
-       int ins_len = mod < 0 ? -1 : 0;
-       int cow = mod != 0;
-       int ret;
 
        key.objectid = inode_objectid;
        key.type = BTRFS_INODE_REF_KEY;
@@ -77,10 +117,147 @@ btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
        return ref;
 }
 
-int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+/* Returns NULL if no extref found */
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root,
+                         struct btrfs_path *path,
+                         const char *name, int name_len,
+                         u64 inode_objectid, u64 ref_objectid, int ins_len,
+                         int cow)
+{
+       int ret;
+       struct btrfs_key key;
+       struct btrfs_inode_extref *extref;
+
+       key.objectid = inode_objectid;
+       key.type = BTRFS_INODE_EXTREF_KEY;
+       key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+       ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+       if (ret < 0)
+               return ERR_PTR(ret);
+       if (ret > 0)
+               return NULL;
+       if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref))
+               return NULL;
+       return extref;
+}
+
+int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root,
+                             struct btrfs_path *path,
+                             const char *name, int name_len,
+                             u64 inode_objectid, u64 ref_objectid, int mod,
+                             u64 *ret_index)
+{
+       struct btrfs_inode_ref *ref;
+       struct btrfs_inode_extref *extref;
+       int ins_len = mod < 0 ? -1 : 0;
+       int cow = mod != 0;
+
+       ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len,
+                                    inode_objectid, ref_objectid, ins_len,
+                                    cow);
+       if (IS_ERR(ref))
+               return PTR_ERR(ref);
+
+       if (ref != NULL) {
+               *ret_index = btrfs_inode_ref_index(path->nodes[0], ref);
+               return 0;
+       }
+
+       btrfs_release_path(path);
+
+       extref = btrfs_lookup_inode_extref(trans, root, path, name,
+                                          name_len, inode_objectid,
+                                          ref_objectid, ins_len, cow);
+       if (IS_ERR(extref))
+               return PTR_ERR(extref);
+
+       if (extref) {
+               *ret_index = btrfs_inode_extref_index(path->nodes[0], extref);
+               return 0;
+       }
+
+       return -ENOENT;
+}
+
+int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           const char *name, int name_len,
                           u64 inode_objectid, u64 ref_objectid, u64 *index)
+{
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       struct btrfs_inode_extref *extref;
+       struct extent_buffer *leaf;
+       int ret;
+       int del_len = name_len + sizeof(*extref);
+       unsigned long ptr;
+       unsigned long item_start;
+       u32 item_size;
+
+       key.objectid = inode_objectid;
+       btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+       key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       path->leave_spinning = 1;
+
+       ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+       if (ret > 0)
+               ret = -ENOENT;
+       if (ret < 0)
+               goto out;
+
+       /*
+        * Sanity check - did we find the right item for this name?
+        * This should always succeed, so an error here will make the FS
+        * readonly.
+        */
+       if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
+                                           name, name_len, &extref)) {
+               btrfs_std_error(root->fs_info, -ENOENT);
+               ret = -EROFS;
+               goto out;
+       }
+
+       leaf = path->nodes[0];
+       item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+       if (index)
+               *index = btrfs_inode_extref_index(leaf, extref);
+
+       if (del_len == item_size) {
+               /*
+                * Common case only one ref in the item, remove the
+                * whole item.
+                */
+               ret = btrfs_del_item(trans, root, path);
+               goto out;
+       }
+
+       ptr = (unsigned long)extref;
+       item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+       memmove_extent_buffer(leaf, ptr, ptr + del_len,
+                             item_size - (ptr + del_len - item_start));
+
+       btrfs_truncate_item(trans, root, path, item_size - del_len, 1);
+
+out:
+       btrfs_free_path(path);
+
+       return ret;
+}
+
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root,
+                       const char *name, int name_len,
+                       u64 inode_objectid, u64 ref_objectid, u64 *index)
 {
        struct btrfs_path *path;
        struct btrfs_key key;
@@ -91,6 +268,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
        u32 item_size;
        u32 sub_item_len;
        int ret;
+       int search_ext_refs = 0;
        int del_len = name_len + sizeof(*ref);
 
        key.objectid = inode_objectid;
@@ -106,12 +284,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0) {
                ret = -ENOENT;
+               search_ext_refs = 1;
                goto out;
        } else if (ret < 0) {
                goto out;
        }
        if (!find_name_in_backref(path, name, name_len, &ref)) {
                ret = -ENOENT;
+               search_ext_refs = 1;
                goto out;
        }
        leaf = path->nodes[0];
@@ -129,8 +309,78 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
        item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
        memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
                              item_size - (ptr + sub_item_len - item_start));
-       btrfs_truncate_item(trans, root, path,
-                                 item_size - sub_item_len, 1);
+       btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1);
+out:
+       btrfs_free_path(path);
+
+       if (search_ext_refs) {
+               /*
+                * No refs were found, or we could not find the
+                * name in our ref array. Find and remove the extended
+                * inode ref then.
+                */
+               return btrfs_del_inode_extref(trans, root, name, name_len,
+                                             inode_objectid, ref_objectid, index);
+       }
+
+       return ret;
+}
+
+/*
+ * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
+ *
+ * The caller must have checked against BTRFS_LINK_MAX already.
+ */
+static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    const char *name, int name_len,
+                                    u64 inode_objectid, u64 ref_objectid, u64 index)
+{
+       struct btrfs_inode_extref *extref;
+       int ret;
+       int ins_len = name_len + sizeof(*extref);
+       unsigned long ptr;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       struct extent_buffer *leaf;
+       struct btrfs_item *item;
+
+       key.objectid = inode_objectid;
+       key.type = BTRFS_INODE_EXTREF_KEY;
+       key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       path->leave_spinning = 1;
+       ret = btrfs_insert_empty_item(trans, root, path, &key,
+                                     ins_len);
+       if (ret == -EEXIST) {
+               if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+                                                  name, name_len, NULL))
+                       goto out;
+
+               btrfs_extend_item(trans, root, path, ins_len);
+               ret = 0;
+       }
+       if (ret < 0)
+               goto out;
+
+       leaf = path->nodes[0];
+       item = btrfs_item_nr(leaf, path->slots[0]);
+       ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
+       ptr += btrfs_item_size(leaf, item) - ins_len;
+       extref = (struct btrfs_inode_extref *)ptr;
+
+       btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
+       btrfs_set_inode_extref_index(path->nodes[0], extref, index);
+       btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid);
+
+       ptr = (unsigned long)&extref->name;
+       write_extent_buffer(path->nodes[0], name, ptr, name_len);
+       btrfs_mark_buffer_dirty(path->nodes[0]);
+
 out:
        btrfs_free_path(path);
        return ret;
@@ -191,6 +441,19 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 
 out:
        btrfs_free_path(path);
+
+       if (ret == -EMLINK) {
+               struct btrfs_super_block *disk_super = root->fs_info->super_copy;
+               /* We ran out of space in the ref array. Need to
+                * add an extended ref. */
+               if (btrfs_super_incompat_flags(disk_super)
+                   & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+                       ret = btrfs_insert_inode_extref(trans, root, name,
+                                                       name_len,
+                                                       inode_objectid,
+                                                       ref_objectid, index);
+       }
+
        return ret;
 }
 
index a6ed6944e50c42eb67a659c1eda6a5f17650d97e..85a1e5053fe63a9d8df6682da38198883e00bb22 100644 (file)
@@ -230,7 +230,6 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
        u64 inline_len = actual_end - start;
        u64 aligned_end = (end + root->sectorsize - 1) &
                        ~((u64)root->sectorsize - 1);
-       u64 hint_byte;
        u64 data_len = inline_len;
        int ret;
 
@@ -247,8 +246,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
                return 1;
        }
 
-       ret = btrfs_drop_extents(trans, inode, start, aligned_end,
-                                &hint_byte, 1);
+       ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
        if (ret)
                return ret;
 
@@ -664,7 +662,7 @@ retry:
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
                                           0, alloc_hint, &ins, 1);
-                       if (ret)
+                       if (ret && ret != -ENOSPC)
                                btrfs_abort_transaction(trans, root, ret);
                        btrfs_end_transaction(trans, root);
                }
@@ -1308,6 +1306,7 @@ out_check:
                        em->block_start = disk_bytenr;
                        em->bdev = root->fs_info->fs_devices->latest_bdev;
                        set_bit(EXTENT_FLAG_PINNED, &em->flags);
+                       set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
                        while (1) {
                                write_lock(&em_tree->lock);
                                ret = add_extent_mapping(em_tree, em);
@@ -1364,11 +1363,7 @@ out_check:
        }
 
 error:
-       if (nolock) {
-               err = btrfs_end_transaction_nolock(trans, root);
-       } else {
-               err = btrfs_end_transaction(trans, root);
-       }
+       err = btrfs_end_transaction(trans, root);
        if (!ret)
                ret = err;
 
@@ -1785,7 +1780,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key ins;
-       u64 hint;
        int ret;
 
        path = btrfs_alloc_path();
@@ -1803,8 +1797,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
         * the caller is expected to unpin it and allow it to be merged
         * with the others.
         */
-       ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
-                                &hint, 0);
+       ret = btrfs_drop_extents(trans, root, inode, file_pos,
+                                file_pos + num_bytes, 0);
        if (ret)
                goto out;
 
@@ -1828,10 +1822,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
        btrfs_set_file_extent_encryption(leaf, fi, encryption);
        btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
 
-       btrfs_unlock_up_safe(path, 1);
-       btrfs_set_lock_blocking(leaf);
-
        btrfs_mark_buffer_dirty(leaf);
+       btrfs_release_path(path);
 
        inode_add_bytes(inode, num_bytes);
 
@@ -1929,11 +1921,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                                                ordered_extent->len,
                                                compress_type, 0, 0,
                                                BTRFS_FILE_EXTENT_REG);
-               unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
-                                  ordered_extent->file_offset,
-                                  ordered_extent->len);
        }
-
+       unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+                          ordered_extent->file_offset, ordered_extent->len,
+                          trans->transid);
        if (ret < 0) {
                btrfs_abort_transaction(trans, root, ret);
                goto out_unlock;
@@ -1949,6 +1940,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                        btrfs_abort_transaction(trans, root, ret);
                        goto out_unlock;
                }
+       } else {
+               btrfs_set_inode_last_trans(trans, inode);
        }
        ret = 0;
 out_unlock:
@@ -1958,12 +1951,8 @@ out_unlock:
 out:
        if (root != root->fs_info->tree_root)
                btrfs_delalloc_release_metadata(inode, ordered_extent->len);
-       if (trans) {
-               if (nolock)
-                       btrfs_end_transaction_nolock(trans, root);
-               else
-                       btrfs_end_transaction(trans, root);
-       }
+       if (trans)
+               btrfs_end_transaction(trans, root);
 
        if (ret)
                clear_extent_uptodate(io_tree, ordered_extent->file_offset,
@@ -2119,7 +2108,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
        if (empty)
                return;
 
-       down_read(&root->fs_info->cleanup_work_sem);
        spin_lock(&fs_info->delayed_iput_lock);
        list_splice_init(&fs_info->delayed_iputs, &list);
        spin_unlock(&fs_info->delayed_iput_lock);
@@ -2130,7 +2118,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
                iput(delayed->inode);
                kfree(delayed);
        }
-       up_read(&root->fs_info->cleanup_work_sem);
 }
 
 enum btrfs_orphan_cleanup_state {
@@ -2198,7 +2185,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
        int ret;
 
        if (!root->orphan_block_rsv) {
-               block_rsv = btrfs_alloc_block_rsv(root);
+               block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
                if (!block_rsv)
                        return -ENOMEM;
        }
@@ -2225,7 +2212,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
                        insert = 1;
 #endif
                insert = 1;
-               atomic_dec(&root->orphan_inodes);
+               atomic_inc(&root->orphan_inodes);
        }
 
        if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
@@ -2590,6 +2577,18 @@ static void btrfs_read_locked_inode(struct inode *inode)
 
        inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
        BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
+       BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+
+       /*
+        * If we were modified in the current generation and evicted from memory
+        * and then re-read we need to do a full sync since we don't have any
+        * idea about which extents were modified before we were evicted from
+        * cache.
+        */
+       if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
+               set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                       &BTRFS_I(inode)->runtime_flags);
+
        inode->i_version = btrfs_inode_sequence(leaf, inode_item);
        inode->i_generation = BTRFS_I(inode)->generation;
        inode->i_rdev = 0;
@@ -2894,7 +2893,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_path *path;
-       struct btrfs_inode_ref *ref;
        struct btrfs_dir_item *di;
        struct inode *inode = dentry->d_inode;
        u64 index;
@@ -3008,17 +3006,17 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
        }
        btrfs_release_path(path);
 
-       ref = btrfs_lookup_inode_ref(trans, root, path,
-                               dentry->d_name.name, dentry->d_name.len,
-                               ino, dir_ino, 0);
-       if (IS_ERR(ref)) {
-               err = PTR_ERR(ref);
+       ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
+                                       dentry->d_name.len, ino, dir_ino, 0,
+                                       &index);
+       if (ret) {
+               err = ret;
                goto out;
        }
-       BUG_ON(!ref); /* Logic error */
+
        if (check_path_shared(root, path))
                goto out;
-       index = btrfs_inode_ref_index(path->nodes[0], ref);
+
        btrfs_release_path(path);
 
        /*
@@ -3061,7 +3059,7 @@ out:
 static void __unlink_end_trans(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
 {
-       if (trans->block_rsv == &root->fs_info->global_block_rsv) {
+       if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
                btrfs_block_rsv_release(root, trans->block_rsv,
                                        trans->bytes_reserved);
                trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3191,9 +3189,10 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
        struct btrfs_trans_handle *trans;
        unsigned long nr = 0;
 
-       if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
-           btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
+       if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
                return -ENOTEMPTY;
+       if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
+               return -EPERM;
 
        trans = __unlink_start_trans(dir, dentry);
        if (IS_ERR(trans))
@@ -3267,8 +3266,13 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        path->reada = -1;
 
+       /*
+        * We want to drop from the next block forward in case this new size is
+        * not block aligned since we will be keeping the last block of the
+        * extent just the way it is.
+        */
        if (root->ref_cows || root == root->fs_info->tree_root)
-               btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
+               btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0);
 
        /*
         * This function is also used to drop the items in the log tree before
@@ -3429,12 +3433,6 @@ delete:
 
                if (path->slots[0] == 0 ||
                    path->slots[0] != pending_del_slot) {
-                       if (root->ref_cows &&
-                           BTRFS_I(inode)->location.objectid !=
-                                               BTRFS_FREE_INO_OBJECTID) {
-                               err = -EAGAIN;
-                               goto out;
-                       }
                        if (pending_del_nr) {
                                ret = btrfs_del_items(trans, root, path,
                                                pending_del_slot,
@@ -3465,12 +3463,20 @@ error:
 }
 
 /*
- * taken from block_truncate_page, but does cow as it zeros out
- * any bytes left in the last page in the file.
+ * btrfs_truncate_page - read, zero a chunk and write a page
+ * @inode - inode that we're zeroing
+ * @from - the offset to start zeroing
+ * @len - the length to zero, 0 to zero the entire range respective to the
+ *     offset
+ * @front - zero up to the offset instead of from the offset on
+ *
+ * This will find the page for the "from" offset and cow the page and zero the
+ * part we want to zero.  This is used with truncate and hole punching.
  */
-static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+                       int front)
 {
-       struct inode *inode = mapping->host;
+       struct address_space *mapping = inode->i_mapping;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_ordered_extent *ordered;
@@ -3485,7 +3491,8 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
        u64 page_start;
        u64 page_end;
 
-       if ((offset & (blocksize - 1)) == 0)
+       if ((offset & (blocksize - 1)) == 0 &&
+           (!len || ((len & (blocksize - 1)) == 0)))
                goto out;
        ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
        if (ret)
@@ -3532,7 +3539,8 @@ again:
        }
 
        clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
-                         EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+                         EXTENT_DIRTY | EXTENT_DELALLOC |
+                         EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
                          0, 0, &cached_state, GFP_NOFS);
 
        ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -3545,8 +3553,13 @@ again:
 
        ret = 0;
        if (offset != PAGE_CACHE_SIZE) {
+               if (!len)
+                       len = PAGE_CACHE_SIZE - offset;
                kaddr = kmap(page);
-               memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+               if (front)
+                       memset(kaddr, 0, offset);
+               else
+                       memset(kaddr + offset, 0, len);
                flush_dcache_page(page);
                kunmap(page);
        }
@@ -3577,6 +3590,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
+       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        u64 mask = root->sectorsize - 1;
        u64 hole_start = (oldsize + mask) & ~mask;
        u64 block_end = (size + mask) & ~mask;
@@ -3613,7 +3627,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
                last_byte = min(extent_map_end(em), block_end);
                last_byte = (last_byte + mask) & ~mask;
                if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
-                       u64 hint_byte = 0;
+                       struct extent_map *hole_em;
                        hole_size = last_byte - cur_offset;
 
                        trans = btrfs_start_transaction(root, 3);
@@ -3622,9 +3636,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
                                break;
                        }
 
-                       err = btrfs_drop_extents(trans, inode, cur_offset,
-                                                cur_offset + hole_size,
-                                                &hint_byte, 1);
+                       err = btrfs_drop_extents(trans, root, inode,
+                                                cur_offset,
+                                                cur_offset + hole_size, 1);
                        if (err) {
                                btrfs_abort_transaction(trans, root, err);
                                btrfs_end_transaction(trans, root);
@@ -3641,9 +3655,39 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
                                break;
                        }
 
-                       btrfs_drop_extent_cache(inode, hole_start,
-                                       last_byte - 1, 0);
+                       btrfs_drop_extent_cache(inode, cur_offset,
+                                               cur_offset + hole_size - 1, 0);
+                       hole_em = alloc_extent_map();
+                       if (!hole_em) {
+                               set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                                       &BTRFS_I(inode)->runtime_flags);
+                               goto next;
+                       }
+                       hole_em->start = cur_offset;
+                       hole_em->len = hole_size;
+                       hole_em->orig_start = cur_offset;
 
+                       hole_em->block_start = EXTENT_MAP_HOLE;
+                       hole_em->block_len = 0;
+                       hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+                       hole_em->compress_type = BTRFS_COMPRESS_NONE;
+                       hole_em->generation = trans->transid;
+
+                       while (1) {
+                               write_lock(&em_tree->lock);
+                               err = add_extent_mapping(em_tree, hole_em);
+                               if (!err)
+                                       list_move(&hole_em->list,
+                                                 &em_tree->modified_extents);
+                               write_unlock(&em_tree->lock);
+                               if (err != -EEXIST)
+                                       break;
+                               btrfs_drop_extent_cache(inode, cur_offset,
+                                                       cur_offset +
+                                                       hole_size - 1, 0);
+                       }
+                       free_extent_map(hole_em);
+next:
                        btrfs_update_inode(trans, root, inode);
                        btrfs_end_transaction(trans, root);
                }
@@ -3768,26 +3812,22 @@ void btrfs_evict_inode(struct inode *inode)
                goto no_delete;
        }
 
-       rsv = btrfs_alloc_block_rsv(root);
+       rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
        if (!rsv) {
                btrfs_orphan_del(NULL, inode);
                goto no_delete;
        }
        rsv->size = min_size;
+       rsv->failfast = 1;
        global_rsv = &root->fs_info->global_block_rsv;
 
        btrfs_i_size_write(inode, 0);
 
        /*
-        * This is a bit simpler than btrfs_truncate since
-        *
-        * 1) We've already reserved our space for our orphan item in the
-        *    unlink.
-        * 2) We're going to delete the inode item, so we don't need to update
-        *    it at all.
-        *
-        * So we just need to reserve some slack space in case we add bytes when
-        * doing the truncate.
+        * This is a bit simpler than btrfs_truncate since we've already
+        * reserved our space for our orphan item in the unlink, so we just
+        * need to reserve some slack space in case we add bytes and update
+        * inode item when doing the truncate.
         */
        while (1) {
                ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
@@ -3808,7 +3848,7 @@ void btrfs_evict_inode(struct inode *inode)
                        goto no_delete;
                }
 
-               trans = btrfs_start_transaction(root, 0);
+               trans = btrfs_start_transaction_noflush(root, 1);
                if (IS_ERR(trans)) {
                        btrfs_orphan_del(NULL, inode);
                        btrfs_free_block_rsv(root, rsv);
@@ -3818,9 +3858,13 @@ void btrfs_evict_inode(struct inode *inode)
                trans->block_rsv = rsv;
 
                ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
-               if (ret != -EAGAIN)
+               if (ret != -ENOSPC)
                        break;
 
+               trans->block_rsv = &root->fs_info->trans_block_rsv;
+               ret = btrfs_update_inode(trans, root, inode);
+               BUG_ON(ret);
+
                nr = trans->blocks_used;
                btrfs_end_transaction(trans, root);
                trans = NULL;
@@ -4470,10 +4514,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
                        trans = btrfs_join_transaction(root);
                if (IS_ERR(trans))
                        return PTR_ERR(trans);
-               if (nolock)
-                       ret = btrfs_end_transaction_nolock(trans, root);
-               else
-                       ret = btrfs_commit_transaction(trans, root);
+               ret = btrfs_commit_transaction(trans, root);
        }
        return ret;
 }
@@ -4671,6 +4712,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        BTRFS_I(inode)->generation = trans->transid;
        inode->i_generation = BTRFS_I(inode)->generation;
 
+       /*
+        * We could have gotten an inode number from somebody who was fsynced
+        * and then removed in this same transaction, so let's just set full
+        * sync since it will be a full sync anyway and this will blow away the
+        * old info in the log.
+        */
+       set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+
        if (S_ISDIR(mode))
                owner = 0;
        else
@@ -4680,6 +4729,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
        key[0].offset = 0;
 
+       /*
+        * Start new inodes with an inode_ref. This is slightly more
+        * efficient for small numbers of hard links since they will
+        * be packed into one item. Extended refs will kick in if we
+        * add more hard links than can fit in the ref item.
+        */
        key[1].objectid = objectid;
        btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
        key[1].offset = ref_objectid;
@@ -4986,7 +5041,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        if (root->objectid != BTRFS_I(inode)->root->objectid)
                return -EXDEV;
 
-       if (inode->i_nlink == ~0U)
+       if (inode->i_nlink >= BTRFS_LINK_MAX)
                return -EMLINK;
 
        err = btrfs_set_inode_index(dir, &index);
@@ -5450,7 +5505,8 @@ insert:
        write_unlock(&em_tree->lock);
 out:
 
-       trace_btrfs_get_extent(root, em);
+       if (em)
+               trace_btrfs_get_extent(root, em);
 
        if (path)
                btrfs_free_path(path);
@@ -5836,6 +5892,48 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
        return ret;
 }
 
+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
+                                          u64 len, u64 orig_start,
+                                          u64 block_start, u64 block_len,
+                                          int type)
+{      /* Build a pinned extent map from the arguments and insert it into the inode's extent map tree. */
+       struct extent_map_tree *em_tree;
+       struct extent_map *em;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       int ret;
+
+       em_tree = &BTRFS_I(inode)->extent_tree;
+       em = alloc_extent_map();
+       if (!em)
+               return ERR_PTR(-ENOMEM);        /* allocation failed; nothing to clean up */
+
+       em->start = start;
+       em->orig_start = orig_start;
+       em->len = len;
+       em->block_len = block_len;
+       em->block_start = block_start;
+       em->bdev = root->fs_info->fs_devices->latest_bdev;
+       set_bit(EXTENT_FLAG_PINNED, &em->flags);        /* always set, regardless of type */
+       if (type == BTRFS_ORDERED_PREALLOC)
+               set_bit(EXTENT_FLAG_PREALLOC, &em->flags);      /* only for BTRFS_ORDERED_PREALLOC */
+
+       do {    /* drop cached mappings over the range, then insert; repeat while insert reports -EEXIST */
+               btrfs_drop_extent_cache(inode, em->start,
+                               em->start + em->len - 1, 0);
+               write_lock(&em_tree->lock);     /* add_extent_mapping() is called with the tree write lock held */
+               ret = add_extent_mapping(em_tree, em);
+               write_unlock(&em_tree->lock);
+       } while (ret == -EEXIST);
+
+       if (ret) {      /* any error other than -EEXIST: release em and return the error as ERR_PTR */
+               free_extent_map(em);
+               return ERR_PTR(ret);
+       }
+
+       return em;      /* success; NOTE(review): presumably the caller must free_extent_map() this - confirm */
+}
+
+
 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                                   struct buffer_head *bh_result, int create)
 {
@@ -5950,6 +6048,19 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                        goto must_cow;
 
                if (can_nocow_odirect(trans, inode, start, len) == 1) {
+                       u64 orig_start = em->start;
+
+                       if (type == BTRFS_ORDERED_PREALLOC) {
+                               free_extent_map(em);
+                               em = create_pinned_em(inode, start, len,
+                                                      orig_start,
+                                                      block_start, len, type);
+                               if (IS_ERR(em)) {
+                                       btrfs_end_transaction(trans, root);
+                                       goto unlock_err;
+                               }
+                       }
+
                        ret = btrfs_add_ordered_extent_dio(inode, start,
                                           block_start, len, len, type);
                        btrfs_end_transaction(trans, root);
@@ -5999,7 +6110,8 @@ unlock:
        if (lockstart < lockend) {
                if (create && len < lockend - lockstart) {
                        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-                                        lockstart + len - 1, unlock_bits, 1, 0,
+                                        lockstart + len - 1,
+                                        unlock_bits | EXTENT_DEFRAG, 1, 0,
                                         &cached_state, GFP_NOFS);
                        /*
                         * Beside unlock, we also need to cleanup reserved space
@@ -6007,8 +6119,8 @@ unlock:
                         */
                        clear_extent_bit(&BTRFS_I(inode)->io_tree,
                                         lockstart + len, lockend,
-                                        unlock_bits | EXTENT_DO_ACCOUNTING,
-                                        1, 0, NULL, GFP_NOFS);
+                                        unlock_bits | EXTENT_DO_ACCOUNTING |
+                                        EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);
                } else {
                        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
                                         lockend, unlock_bits, 1, 0,
@@ -6573,8 +6685,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
                 */
                clear_extent_bit(tree, page_start, page_end,
                                 EXTENT_DIRTY | EXTENT_DELALLOC |
-                                EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
-                                &cached_state, GFP_NOFS);
+                                EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+                                EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);
                /*
                 * whoever cleared the private bit is responsible
                 * for the finish_ordered_io
@@ -6590,7 +6702,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
        }
        clear_extent_bit(tree, page_start, page_end,
                 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
-                EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
+                EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
+                &cached_state, GFP_NOFS);
        __btrfs_releasepage(page, GFP_NOFS);
 
        ClearPageChecked(page);
@@ -6687,7 +6800,8 @@ again:
         * prepare_pages in the normal write path.
         */
        clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
-                         EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+                         EXTENT_DIRTY | EXTENT_DELALLOC |
+                         EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
                          0, 0, &cached_state, GFP_NOFS);
 
        ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -6718,6 +6832,7 @@ again:
 
        BTRFS_I(inode)->last_trans = root->fs_info->generation;
        BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+       BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
 
        unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
 
@@ -6745,7 +6860,7 @@ static int btrfs_truncate(struct inode *inode)
        u64 mask = root->sectorsize - 1;
        u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 
-       ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
+       ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
        if (ret)
                return ret;
 
@@ -6788,10 +6903,11 @@ static int btrfs_truncate(struct inode *inode)
         * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
         * updating the inode.
         */
-       rsv = btrfs_alloc_block_rsv(root);
+       rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
        if (!rsv)
                return -ENOMEM;
        rsv->size = min_size;
+       rsv->failfast = 1;
 
        /*
         * 1 for the truncate slack space
@@ -6837,36 +6953,21 @@ static int btrfs_truncate(struct inode *inode)
                                           &BTRFS_I(inode)->runtime_flags))
                btrfs_add_ordered_operation(trans, root, inode);
 
-       while (1) {
-               ret = btrfs_block_rsv_refill(root, rsv, min_size);
-               if (ret) {
-                       /*
-                        * This can only happen with the original transaction we
-                        * started above, every other time we shouldn't have a
-                        * transaction started yet.
-                        */
-                       if (ret == -EAGAIN)
-                               goto end_trans;
-                       err = ret;
-                       break;
-               }
-
-               if (!trans) {
-                       /* Just need the 1 for updating the inode */
-                       trans = btrfs_start_transaction(root, 1);
-                       if (IS_ERR(trans)) {
-                               ret = err = PTR_ERR(trans);
-                               trans = NULL;
-                               break;
-                       }
-               }
-
-               trans->block_rsv = rsv;
+       /*
+        * So if we truncate and then write and fsync we normally would just
+        * write the extents that changed, which is a problem if we need to
+        * first truncate that entire inode.  So set this flag so we write out
+        * all of the extents in the inode to the sync log so we're completely
+        * safe.
+        */
+       set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+       trans->block_rsv = rsv;
 
+       while (1) {
                ret = btrfs_truncate_inode_items(trans, root, inode,
                                                 inode->i_size,
                                                 BTRFS_EXTENT_DATA_KEY);
-               if (ret != -EAGAIN) {
+               if (ret != -ENOSPC) {
                        err = ret;
                        break;
                }
@@ -6877,11 +6978,22 @@ static int btrfs_truncate(struct inode *inode)
                        err = ret;
                        break;
                }
-end_trans:
+
                nr = trans->blocks_used;
                btrfs_end_transaction(trans, root);
-               trans = NULL;
                btrfs_btree_balance_dirty(root, nr);
+
+               trans = btrfs_start_transaction(root, 2);
+               if (IS_ERR(trans)) {
+                       ret = err = PTR_ERR(trans);
+                       trans = NULL;
+                       break;
+               }
+
+               ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+                                             rsv, min_size);
+               BUG_ON(ret);    /* shouldn't happen */
+               trans->block_rsv = rsv;
        }
 
        if (ret == 0 && inode->i_nlink > 0) {
@@ -6965,6 +7077,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        ei->csum_bytes = 0;
        ei->index_cnt = (u64)-1;
        ei->last_unlink_trans = 0;
+       ei->last_log_commit = 0;
 
        spin_lock_init(&ei->lock);
        ei->outstanding_extents = 0;
@@ -7095,31 +7208,31 @@ void btrfs_destroy_cachep(void)
 
 int btrfs_init_cachep(void)
 {
-       btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
+       btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
                        sizeof(struct btrfs_inode), 0,
                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
        if (!btrfs_inode_cachep)
                goto fail;
 
-       btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
+       btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
                        sizeof(struct btrfs_trans_handle), 0,
                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
        if (!btrfs_trans_handle_cachep)
                goto fail;
 
-       btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
+       btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
                        sizeof(struct btrfs_transaction), 0,
                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
        if (!btrfs_transaction_cachep)
                goto fail;
 
-       btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
+       btrfs_path_cachep = kmem_cache_create("btrfs_path",
                        sizeof(struct btrfs_path), 0,
                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
        if (!btrfs_path_cachep)
                goto fail;
 
-       btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache",
+       btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
                        sizeof(struct btrfs_free_space), 0,
                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
        if (!btrfs_free_space_cachep)
@@ -7513,6 +7626,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                                       loff_t actual_len, u64 *alloc_hint,
                                       struct btrfs_trans_handle *trans)
 {
+       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+       struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key ins;
        u64 cur_offset = start;
@@ -7553,6 +7668,37 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                btrfs_drop_extent_cache(inode, cur_offset,
                                        cur_offset + ins.offset -1, 0);
 
+               em = alloc_extent_map();
+               if (!em) {
+                       set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                               &BTRFS_I(inode)->runtime_flags);
+                       goto next;
+               }
+
+               em->start = cur_offset;
+               em->orig_start = cur_offset;
+               em->len = ins.offset;
+               em->block_start = ins.objectid;
+               em->block_len = ins.offset;
+               em->bdev = root->fs_info->fs_devices->latest_bdev;
+               set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+               em->generation = trans->transid;
+
+               while (1) {
+                       write_lock(&em_tree->lock);
+                       ret = add_extent_mapping(em_tree, em);
+                       if (!ret)
+                               list_move(&em->list,
+                                         &em_tree->modified_extents);
+                       write_unlock(&em_tree->lock);
+                       if (ret != -EEXIST)
+                               break;
+                       btrfs_drop_extent_cache(inode, cur_offset,
+                                               cur_offset + ins.offset - 1,
+                                               0);
+               }
+               free_extent_map(em);
+next:
                num_bytes -= ins.offset;
                cur_offset += ins.offset;
                *alloc_hint = ins.objectid + ins.offset;
index 47127c1bd290b5eb91023e8877176dbf3cfb7e58..e568c472f80771c9230b9967e8a2e4169e1e452c 100644 (file)
@@ -181,6 +181,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
        int ret;
        u64 ip_oldflags;
        unsigned int i_oldflags;
+       umode_t mode;
 
        if (btrfs_root_readonly(root))
                return -EROFS;
@@ -203,6 +204,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 
        ip_oldflags = ip->flags;
        i_oldflags = inode->i_flags;
+       mode = inode->i_mode;
 
        flags = btrfs_mask_flags(inode->i_mode, flags);
        oldflags = btrfs_flags_to_ioctl(ip->flags);
@@ -237,10 +239,31 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
                ip->flags |= BTRFS_INODE_DIRSYNC;
        else
                ip->flags &= ~BTRFS_INODE_DIRSYNC;
-       if (flags & FS_NOCOW_FL)
-               ip->flags |= BTRFS_INODE_NODATACOW;
-       else
-               ip->flags &= ~BTRFS_INODE_NODATACOW;
+       if (flags & FS_NOCOW_FL) {
+               if (S_ISREG(mode)) {
+                       /*
+                        * It's safe to turn csums off here, no extents exist.
+                        * Otherwise we want the flag to reflect the real COW
+                        * status of the file and will not set it.
+                        */
+                       if (inode->i_size == 0)
+                               ip->flags |= BTRFS_INODE_NODATACOW
+                                          | BTRFS_INODE_NODATASUM;
+               } else {
+                       ip->flags |= BTRFS_INODE_NODATACOW;
+               }
+       } else {
+               /*
+                * Revert back under same assumptions as above
+                */
+               if (S_ISREG(mode)) {
+                       if (inode->i_size == 0)
+                               ip->flags &= ~(BTRFS_INODE_NODATACOW
+                                            | BTRFS_INODE_NODATASUM);
+               } else {
+                       ip->flags &= ~BTRFS_INODE_NODATACOW;
+               }
+       }
 
        /*
         * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
@@ -516,7 +539,8 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
        if (!pending_snapshot)
                return -ENOMEM;
 
-       btrfs_init_block_rsv(&pending_snapshot->block_rsv);
+       btrfs_init_block_rsv(&pending_snapshot->block_rsv,
+                            BTRFS_BLOCK_RSV_TEMP);
        pending_snapshot->dentry = dentry;
        pending_snapshot->root = root;
        pending_snapshot->readonly = readonly;
@@ -525,7 +549,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
                *inherit = NULL;        /* take responsibility to free it */
        }
 
-       trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
+       trans = btrfs_start_transaction(root->fs_info->extent_root, 6);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto fail;
@@ -1022,8 +1046,8 @@ again:
                         page_start, page_end - 1, 0, &cached_state);
        clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
                          page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
-                         EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
-                         GFP_NOFS);
+                         EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
+                         &cached_state, GFP_NOFS);
 
        if (i_done != page_cnt) {
                spin_lock(&BTRFS_I(inode)->lock);
@@ -1034,8 +1058,8 @@ again:
        }
 
 
-       btrfs_set_extent_delalloc(inode, page_start, page_end - 1,
-                                 &cached_state);
+       set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
+                         &cached_state, GFP_NOFS);
 
        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
                             page_start, page_end - 1, &cached_state,
@@ -2351,7 +2375,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
        int ret;
        u64 len = olen;
        u64 bs = root->fs_info->sb->s_blocksize;
-       u64 hint_byte;
 
        /*
         * TODO:
@@ -2456,13 +2479,13 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
           another, and lock file content */
        while (1) {
                struct btrfs_ordered_extent *ordered;
-               lock_extent(&BTRFS_I(src)->io_tree, off, off+len);
-               ordered = btrfs_lookup_first_ordered_extent(src, off+len);
+               lock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
+               ordered = btrfs_lookup_first_ordered_extent(src, off + len - 1);
                if (!ordered &&
-                   !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
-                                  EXTENT_DELALLOC, 0, NULL))
+                   !test_range_bit(&BTRFS_I(src)->io_tree, off, off + len - 1,
+                                   EXTENT_DELALLOC, 0, NULL))
                        break;
-               unlock_extent(&BTRFS_I(src)->io_tree, off, off+len);
+               unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
                if (ordered)
                        btrfs_put_ordered_extent(ordered);
                btrfs_wait_ordered_range(src, off, len);
@@ -2536,7 +2559,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                        btrfs_release_path(path);
 
                        if (key.offset + datal <= off ||
-                           key.offset >= off+len)
+                           key.offset >= off + len - 1)
                                goto next;
 
                        memcpy(&new_key, &key, sizeof(new_key));
@@ -2574,10 +2597,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                                        datal -= off - key.offset;
                                }
 
-                               ret = btrfs_drop_extents(trans, inode,
+                               ret = btrfs_drop_extents(trans, root, inode,
                                                         new_key.offset,
                                                         new_key.offset + datal,
-                                                        &hint_byte, 1);
+                                                        1);
                                if (ret) {
                                        btrfs_abort_transaction(trans, root,
                                                                ret);
@@ -2637,8 +2660,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                                        new_key.offset += skip;
                                }
 
-                               if (key.offset + datal > off+len)
-                                       trim = key.offset + datal - (off+len);
+                               if (key.offset + datal > off + len)
+                                       trim = key.offset + datal - (off + len);
 
                                if (comp && (skip || trim)) {
                                        ret = -EINVAL;
@@ -2648,10 +2671,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                                size -= skip + trim;
                                datal -= skip + trim;
 
-                               ret = btrfs_drop_extents(trans, inode,
+                               ret = btrfs_drop_extents(trans, root, inode,
                                                         new_key.offset,
                                                         new_key.offset + datal,
-                                                        &hint_byte, 1);
+                                                        1);
                                if (ret) {
                                        btrfs_abort_transaction(trans, root,
                                                                ret);
@@ -2715,7 +2738,7 @@ next:
        ret = 0;
 out:
        btrfs_release_path(path);
-       unlock_extent(&BTRFS_I(src)->io_tree, off, off+len);
+       unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
 out_unlock:
        mutex_unlock(&src->i_mutex);
        mutex_unlock(&inode->i_mutex);
@@ -2850,8 +2873,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
        return 0;
 }
 
-static void get_block_group_info(struct list_head *groups_list,
-                                struct btrfs_ioctl_space_info *space)
+void btrfs_get_block_group_info(struct list_head *groups_list,
+                               struct btrfs_ioctl_space_info *space)
 {
        struct btrfs_block_group_cache *block_group;
 
@@ -2959,8 +2982,8 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
                down_read(&info->groups_sem);
                for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
                        if (!list_empty(&info->block_groups[c])) {
-                               get_block_group_info(&info->block_groups[c],
-                                                    &space);
+                               btrfs_get_block_group_info(
+                                       &info->block_groups[c], &space);
                                memcpy(dest, &space, sizeof(space));
                                dest++;
                                space_args.total_spaces++;
@@ -3208,11 +3231,9 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 {
        int ret = 0;
        int size;
-       u64 extent_item_pos;
        struct btrfs_ioctl_logical_ino_args *loi;
        struct btrfs_data_container *inodes = NULL;
        struct btrfs_path *path = NULL;
-       struct btrfs_key key;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
@@ -3230,7 +3251,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
                goto out;
        }
 
-       size = min_t(u32, loi->size, 4096);
+       size = min_t(u32, loi->size, 64 * 1024);
        inodes = init_data_container(size);
        if (IS_ERR(inodes)) {
                ret = PTR_ERR(inodes);
@@ -3238,22 +3259,13 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
                goto out;
        }
 
-       ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
-       btrfs_release_path(path);
-
-       if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+       ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path,
+                                         build_ino_list, inodes);
+       if (ret == -EINVAL)
                ret = -ENOENT;
        if (ret < 0)
                goto out;
 
-       extent_item_pos = loi->logical - key.objectid;
-       ret = iterate_extent_inodes(root->fs_info, key.objectid,
-                                       extent_item_pos, 0, build_ino_list,
-                                       inodes);
-
-       if (ret < 0)
-               goto out;
-
        ret = copy_to_user((void *)(unsigned long)loi->inodes,
                           (void *)(unsigned long)inodes, size);
        if (ret)
@@ -3261,7 +3273,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 
 out:
        btrfs_free_path(path);
-       kfree(inodes);
+       vfree(inodes);
        kfree(loi);
 
        return ret;
index 051c7fe551dd38bb7e391e5abaeff992f32b90f7..7772f02ba28e6966826c0d897475961ad628c2f5 100644 (file)
@@ -25,6 +25,8 @@
 #include "btrfs_inode.h"
 #include "extent_io.h"
 
+static struct kmem_cache *btrfs_ordered_extent_cache;
+
 static u64 entry_end(struct btrfs_ordered_extent *entry)
 {
        if (entry->file_offset + entry->len < entry->file_offset)
@@ -187,7 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
        struct btrfs_ordered_extent *entry;
 
        tree = &BTRFS_I(inode)->ordered_tree;
-       entry = kzalloc(sizeof(*entry), GFP_NOFS);
+       entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
        if (!entry)
                return -ENOMEM;
 
@@ -421,7 +423,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
                        list_del(&sum->list);
                        kfree(sum);
                }
-               kfree(entry);
+               kmem_cache_free(btrfs_ordered_extent_cache, entry);
        }
 }
 
@@ -466,8 +468,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
  * wait for all the ordered extents in a root.  This is done when balancing
  * space between drives.
  */
-void btrfs_wait_ordered_extents(struct btrfs_root *root,
-                               int nocow_only, int delay_iput)
+void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
 {
        struct list_head splice;
        struct list_head *cur;
@@ -482,15 +483,6 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root,
                cur = splice.next;
                ordered = list_entry(cur, struct btrfs_ordered_extent,
                                     root_extent_list);
-               if (nocow_only &&
-                   !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
-                   !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
-                       list_move(&ordered->root_extent_list,
-                                 &root->fs_info->ordered_extents);
-                       cond_resched_lock(&root->fs_info->ordered_extent_lock);
-                       continue;
-               }
-
                list_del_init(&ordered->root_extent_list);
                atomic_inc(&ordered->refs);
 
@@ -775,7 +767,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
        struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
        u64 disk_i_size;
        u64 new_i_size;
-       u64 i_size_test;
        u64 i_size = i_size_read(inode);
        struct rb_node *node;
        struct rb_node *prev = NULL;
@@ -835,55 +826,30 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                        break;
                if (test->file_offset >= i_size)
                        break;
-               if (test->file_offset >= disk_i_size)
+               if (test->file_offset >= disk_i_size) {
+                       /*
+                        * we don't update disk_i_size now, so record this
+                        * pending i_size; otherwise we will not know the
+                        * real i_size.
+                        */
+                       if (test->outstanding_isize < offset)
+                               test->outstanding_isize = offset;
+                       if (ordered &&
+                           ordered->outstanding_isize >
+                           test->outstanding_isize)
+                               test->outstanding_isize =
+                                               ordered->outstanding_isize;
                        goto out;
-       }
-       new_i_size = min_t(u64, offset, i_size);
-
-       /*
-        * at this point, we know we can safely update i_size to at least
-        * the offset from this ordered extent.  But, we need to
-        * walk forward and see if ios from higher up in the file have
-        * finished.
-        */
-       if (ordered) {
-               node = rb_next(&ordered->rb_node);
-       } else {
-               if (prev)
-                       node = rb_next(prev);
-               else
-                       node = rb_first(&tree->tree);
-       }
-
-       /*
-        * We are looking for an area between our current extent and the next
-        * ordered extent to update the i_size to.  There are 3 cases here
-        *
-        * 1) We don't actually have anything and we can update to i_size.
-        * 2) We have stuff but they already did their i_size update so again we
-        * can just update to i_size.
-        * 3) We have an outstanding ordered extent so the most we can update
-        * our disk_i_size to is the start of the next offset.
-        */
-       i_size_test = i_size;
-       for (; node; node = rb_next(node)) {
-               test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-
-               if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
-                       continue;
-               if (test->file_offset > offset) {
-                       i_size_test = test->file_offset;
-                       break;
                }
        }
+       new_i_size = min_t(u64, offset, i_size);
 
        /*
-        * i_size_test is the end of a region after this ordered
-        * extent where there are no ordered extents, we can safely set
-        * disk_i_size to this.
+        * Some ordered extents may have completed before the current one, and
+        * we hold the real i_size in ->outstanding_isize.
         */
-       if (i_size_test > offset)
-               new_i_size = min_t(u64, i_size_test, i_size);
+       if (ordered && ordered->outstanding_isize > new_i_size)
+               new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
        BTRFS_I(inode)->disk_i_size = new_i_size;
        ret = 0;
 out:
@@ -984,3 +950,20 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
        }
        spin_unlock(&root->fs_info->ordered_extent_lock);
 }
+
+int __init ordered_data_init(void)
+{
+       btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
+                                    sizeof(struct btrfs_ordered_extent), 0,
+                                    SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+                                    NULL);
+       if (!btrfs_ordered_extent_cache)
+               return -ENOMEM;
+       return 0;
+}
+
+void ordered_data_exit(void)
+{
+       if (btrfs_ordered_extent_cache)
+               kmem_cache_destroy(btrfs_ordered_extent_cache);
+}
index e03c560d299732cfe2114fe41d049b691a949e61..dd27a0b46a37d1dfd878828672dfb0dbe99f5ac8 100644 (file)
@@ -96,6 +96,13 @@ struct btrfs_ordered_extent {
        /* number of bytes that still need writing */
        u64 bytes_left;
 
+       /*
+        * the end of the ordered extent which is behind it but
+        * didn't update disk_i_size. Please see the comment of
+        * btrfs_ordered_update_i_size();
+        */
+       u64 outstanding_isize;
+
        /* flags (described above) */
        unsigned long flags;
 
@@ -183,6 +190,7 @@ void btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
 void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct inode *inode);
-void btrfs_wait_ordered_extents(struct btrfs_root *root,
-                               int nocow_only, int delay_iput);
+void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
+int __init ordered_data_init(void);
+void ordered_data_exit(void);
 #endif
index b65015581744a6eefb9b3ed720478673b888d336..5039686df6ae8e801ed8985eb5e821a226e18855 100644 (file)
@@ -1145,12 +1145,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
                ulist_reinit(tmp);
                                                /* XXX id not needed */
-               ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
+               ulist_add(tmp, qg->qgroupid, (u64)(uintptr_t)qg, GFP_ATOMIC);
                ULIST_ITER_INIT(&tmp_uiter);
                while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
                        struct btrfs_qgroup_list *glist;
 
-                       qg = (struct btrfs_qgroup *)tmp_unode->aux;
+                       qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
                        if (qg->refcnt < seq)
                                qg->refcnt = seq + 1;
                        else
@@ -1158,7 +1158,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
                        list_for_each_entry(glist, &qg->groups, next_group) {
                                ulist_add(tmp, glist->group->qgroupid,
-                                         (unsigned long)glist->group,
+                                         (u64)(uintptr_t)glist->group,
                                          GFP_ATOMIC);
                        }
                }
@@ -1168,13 +1168,13 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
         * step 2: walk from the new root
         */
        ulist_reinit(tmp);
-       ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
+       ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
        ULIST_ITER_INIT(&uiter);
        while ((unode = ulist_next(tmp, &uiter))) {
                struct btrfs_qgroup *qg;
                struct btrfs_qgroup_list *glist;
 
-               qg = (struct btrfs_qgroup *)unode->aux;
+               qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
                if (qg->refcnt < seq) {
                        /* not visited by step 1 */
                        qg->rfer += sgn * node->num_bytes;
@@ -1190,7 +1190,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
                list_for_each_entry(glist, &qg->groups, next_group) {
                        ulist_add(tmp, glist->group->qgroupid,
-                                 (unsigned long)glist->group, GFP_ATOMIC);
+                                 (uintptr_t)glist->group, GFP_ATOMIC);
                }
        }
 
@@ -1208,12 +1208,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
                        continue;
 
                ulist_reinit(tmp);
-               ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
+               ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);
                ULIST_ITER_INIT(&tmp_uiter);
                while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
                        struct btrfs_qgroup_list *glist;
 
-                       qg = (struct btrfs_qgroup *)tmp_unode->aux;
+                       qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
                        if (qg->tag == seq)
                                continue;
 
@@ -1225,7 +1225,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
                        list_for_each_entry(glist, &qg->groups, next_group) {
                                ulist_add(tmp, glist->group->qgroupid,
-                                         (unsigned long)glist->group,
+                                         (uintptr_t)glist->group,
                                          GFP_ATOMIC);
                        }
                }
@@ -1469,13 +1469,17 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
         * be exceeded
         */
        ulist = ulist_alloc(GFP_ATOMIC);
-       ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
+       if (!ulist) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
        ULIST_ITER_INIT(&uiter);
        while ((unode = ulist_next(ulist, &uiter))) {
                struct btrfs_qgroup *qg;
                struct btrfs_qgroup_list *glist;
 
-               qg = (struct btrfs_qgroup *)unode->aux;
+               qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
 
                if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
                    qg->reserved + qg->rfer + num_bytes >
@@ -1489,7 +1493,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 
                list_for_each_entry(glist, &qg->groups, next_group) {
                        ulist_add(ulist, glist->group->qgroupid,
-                                 (unsigned long)glist->group, GFP_ATOMIC);
+                                 (uintptr_t)glist->group, GFP_ATOMIC);
                }
        }
        if (ret)
@@ -1502,7 +1506,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
        while ((unode = ulist_next(ulist, &uiter))) {
                struct btrfs_qgroup *qg;
 
-               qg = (struct btrfs_qgroup *)unode->aux;
+               qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
 
                qg->reserved += num_bytes;
        }
@@ -1541,19 +1545,23 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
                goto out;
 
        ulist = ulist_alloc(GFP_ATOMIC);
-       ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
+       if (!ulist) {
+               btrfs_std_error(fs_info, -ENOMEM);
+               goto out;
+       }
+       ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
        ULIST_ITER_INIT(&uiter);
        while ((unode = ulist_next(ulist, &uiter))) {
                struct btrfs_qgroup *qg;
                struct btrfs_qgroup_list *glist;
 
-               qg = (struct btrfs_qgroup *)unode->aux;
+               qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
 
                qg->reserved -= num_bytes;
 
                list_for_each_entry(glist, &qg->groups, next_group) {
                        ulist_add(ulist, glist->group->qgroupid,
-                                 (unsigned long)glist->group, GFP_ATOMIC);
+                                 (uintptr_t)glist->group, GFP_ATOMIC);
                }
        }
 
index 4da08652004d5dc2803dba510c0f9b015607b4a1..776f0aa128fc56294dbed997d6a60768f05a0ee9 100644 (file)
@@ -3270,8 +3270,8 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
        key.offset = 0;
 
        inode = btrfs_iget(fs_info->sb, &key, root, NULL);
-       if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) {
-               if (inode && !IS_ERR(inode))
+       if (IS_ERR(inode) || is_bad_inode(inode)) {
+               if (!IS_ERR(inode))
                        iput(inode);
                return -ENOENT;
        }
@@ -3621,7 +3621,7 @@ next:
 
                ret = find_first_extent_bit(&rc->processed_blocks,
                                            key.objectid, &start, &end,
-                                           EXTENT_DIRTY);
+                                           EXTENT_DIRTY, NULL);
 
                if (ret == 0 && start <= key.objectid) {
                        btrfs_release_path(path);
@@ -3674,7 +3674,8 @@ int prepare_to_relocate(struct reloc_control *rc)
        struct btrfs_trans_handle *trans;
        int ret;
 
-       rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
+       rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
+                                             BTRFS_BLOCK_RSV_TEMP);
        if (!rc->block_rsv)
                return -ENOMEM;
 
@@ -4057,7 +4058,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
               (unsigned long long)rc->block_group->flags);
 
        btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
-       btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
+       btrfs_wait_ordered_extents(fs_info->tree_root, 0);
 
        while (1) {
                mutex_lock(&fs_info->cleaner_mutex);
index 10d8e4d88071747651afd3a875eae0fb407c22e4..eb923d087da7848d445820213d53ff3b39d7d279 100644 (file)
@@ -141,8 +141,10 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
                return -ENOMEM;
 
        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
-       if (ret < 0)
-               goto out_abort;
+       if (ret < 0) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out;
+       }
 
        if (ret != 0) {
                btrfs_print_leaf(root, path->nodes[0]);
@@ -166,16 +168,23 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
                btrfs_release_path(path);
                ret = btrfs_search_slot(trans, root, key, path,
                                -1, 1);
-               if (ret < 0)
-                       goto out_abort;
+               if (ret < 0) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto out;
+               }
+
                ret = btrfs_del_item(trans, root, path);
-               if (ret < 0)
-                       goto out_abort;
+               if (ret < 0) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto out;
+               }
                btrfs_release_path(path);
                ret = btrfs_insert_empty_item(trans, root, path,
                                key, sizeof(*item));
-               if (ret < 0)
-                       goto out_abort;
+               if (ret < 0) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto out;
+               }
                l = path->nodes[0];
                slot = path->slots[0];
                ptr = btrfs_item_ptr_offset(l, slot);
@@ -192,10 +201,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 out:
        btrfs_free_path(path);
        return ret;
-
-out_abort:
-       btrfs_abort_transaction(trans, root, ret);
-       goto out;
 }
 
 int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
index b223620cd5a6d59aa4b707e2539e59a8244d2dd1..27892f67e69b216694299720caf31b6e19919555 100644 (file)
@@ -352,13 +352,14 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
        struct extent_buffer *eb;
        struct btrfs_extent_item *ei;
        struct scrub_warning swarn;
-       u32 item_size;
-       int ret;
+       unsigned long ptr = 0;
+       u64 extent_item_pos;
+       u64 flags = 0;
        u64 ref_root;
+       u32 item_size;
        u8 ref_level;
-       unsigned long ptr = 0;
        const int bufsize = 4096;
-       u64 extent_item_pos;
+       int ret;
 
        path = btrfs_alloc_path();
 
@@ -375,7 +376,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
        if (!path || !swarn.scratch_buf || !swarn.msg_buf)
                goto out;
 
-       ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
+       ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
+                                 &flags);
        if (ret < 0)
                goto out;
 
@@ -387,7 +389,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
        item_size = btrfs_item_size_nr(eb, path->slots[0]);
        btrfs_release_path(path);
 
-       if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+       if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                do {
                        ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
                                                        &ref_root, &ref_level);
@@ -1029,6 +1031,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
                                spin_lock(&sdev->stat_lock);
                                sdev->stat.malloc_errors++;
                                spin_unlock(&sdev->stat_lock);
+                               kfree(bbio);
                                return -ENOMEM;
                        }
                        sblock->page_count++;
@@ -1666,21 +1669,6 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
                scrub_block_put(sblock);
        }
 
-       if (sbio->err) {
-               /* what is this good for??? */
-               sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
-               sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
-               sbio->bio->bi_phys_segments = 0;
-               sbio->bio->bi_idx = 0;
-
-               for (i = 0; i < sbio->page_count; i++) {
-                       struct bio_vec *bi;
-                       bi = &sbio->bio->bi_io_vec[i];
-                       bi->bv_offset = 0;
-                       bi->bv_len = PAGE_SIZE;
-               }
-       }
-
        bio_put(sbio->bio);
        sbio->bio = NULL;
        spin_lock(&sdev->list_lock);
index fb5ffe95f869342c6f270579c7f3556aef6f80d0..c7beb543a4a89300f1e586492b767ea8f9bef683 100644 (file)
@@ -107,7 +107,6 @@ struct send_ctx {
        int cur_inode_new;
        int cur_inode_new_gen;
        int cur_inode_deleted;
-       int cur_inode_first_ref_orphan;
        u64 cur_inode_size;
        u64 cur_inode_mode;
 
@@ -126,7 +125,15 @@ struct send_ctx {
 
 struct name_cache_entry {
        struct list_head list;
-       struct list_head use_list;
+       /*
+        * radix_tree has only 32bit entries but we need to handle 64bit inums.
+        * We use the lower 32bit of the 64bit inum to store it in the tree. If
+        * more then one inum would fall into the same entry, we use radix_list
+        * to store the additional entries. radix_list is also used to store
+        * entries where two entries have the same inum but different
+        * generations.
+        */
+       struct list_head radix_list;
        u64 ino;
        u64 gen;
        u64 parent_ino;
@@ -328,6 +335,7 @@ out:
        return ret;
 }
 
+#if 0
 static void fs_path_remove(struct fs_path *p)
 {
        BUG_ON(p->reversed);
@@ -335,6 +343,7 @@ static void fs_path_remove(struct fs_path *p)
                p->end--;
        *p->end = 0;
 }
+#endif
 
 static int fs_path_copy(struct fs_path *p, struct fs_path *from)
 {
@@ -377,7 +386,7 @@ static struct btrfs_path *alloc_path_for_send(void)
        return path;
 }
 
-static int write_buf(struct send_ctx *sctx, const void *buf, u32 len)
+int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
 {
        int ret;
        mm_segment_t old_fs;
@@ -387,8 +396,7 @@ static int write_buf(struct send_ctx *sctx, const void *buf, u32 len)
        set_fs(KERNEL_DS);
 
        while (pos < len) {
-               ret = vfs_write(sctx->send_filp, (char *)buf + pos, len - pos,
-                               &sctx->send_off);
+               ret = vfs_write(filp, (char *)buf + pos, len - pos, off);
                /* TODO handle that correctly */
                /*if (ret == -ERESTARTSYS) {
                        continue;
@@ -544,7 +552,8 @@ static int send_header(struct send_ctx *sctx)
        strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
        hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
 
-       return write_buf(sctx, &hdr, sizeof(hdr));
+       return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
+                                       &sctx->send_off);
 }
 
 /*
@@ -581,7 +590,8 @@ static int send_cmd(struct send_ctx *sctx)
        crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
        hdr->crc = cpu_to_le32(crc);
 
-       ret = write_buf(sctx, sctx->send_buf, sctx->send_size);
+       ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
+                                       &sctx->send_off);
 
        sctx->total_send_size += sctx->send_size;
        sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
@@ -687,7 +697,8 @@ out:
  */
 static int get_inode_info(struct btrfs_root *root,
                          u64 ino, u64 *size, u64 *gen,
-                         u64 *mode, u64 *uid, u64 *gid)
+                         u64 *mode, u64 *uid, u64 *gid,
+                         u64 *rdev)
 {
        int ret;
        struct btrfs_inode_item *ii;
@@ -721,6 +732,8 @@ static int get_inode_info(struct btrfs_root *root,
                *uid = btrfs_inode_uid(path->nodes[0], ii);
        if (gid)
                *gid = btrfs_inode_gid(path->nodes[0], ii);
+       if (rdev)
+               *rdev = btrfs_inode_rdev(path->nodes[0], ii);
 
 out:
        btrfs_free_path(path);
@@ -852,7 +865,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
        struct extent_buffer *eb;
        struct btrfs_item *item;
        struct btrfs_dir_item *di;
-       struct btrfs_path *tmp_path = NULL;
        struct btrfs_key di_key;
        char *buf = NULL;
        char *buf2 = NULL;
@@ -874,12 +886,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
                goto out;
        }
 
-       tmp_path = alloc_path_for_send();
-       if (!tmp_path) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
        eb = path->nodes[0];
        slot = path->slots[0];
        item = btrfs_item_nr(eb, slot);
@@ -941,7 +947,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
        }
 
 out:
-       btrfs_free_path(tmp_path);
        if (buf_virtual)
                vfree(buf);
        else
@@ -1026,12 +1031,12 @@ struct backref_ctx {
        u64 extent_len;
 
        /* Just to check for bugs in backref resolving */
-       int found_in_send_root;
+       int found_itself;
 };
 
 static int __clone_root_cmp_bsearch(const void *key, const void *elt)
 {
-       u64 root = (u64)key;
+       u64 root = (u64)(uintptr_t)key;
        struct clone_root *cr = (struct clone_root *)elt;
 
        if (root < cr->root->objectid)
@@ -1055,6 +1060,7 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
 
 /*
  * Called for every backref that is found for the current extent.
+ * Results are collected in sctx->clone_roots->ino/offset/found_refs
  */
 static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 {
@@ -1064,7 +1070,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
        u64 i_size;
 
        /* First check if the root is in the list of accepted clone sources */
-       found = bsearch((void *)root, bctx->sctx->clone_roots,
+       found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
                        bctx->sctx->clone_roots_cnt,
                        sizeof(struct clone_root),
                        __clone_root_cmp_bsearch);
@@ -1074,14 +1080,15 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
        if (found->root == bctx->sctx->send_root &&
            ino == bctx->cur_objectid &&
            offset == bctx->cur_offset) {
-               bctx->found_in_send_root = 1;
+               bctx->found_itself = 1;
        }
 
        /*
-        * There are inodes that have extents that lie behind it's i_size. Don't
+        * There are inodes that have extents that lie behind its i_size. Don't
         * accept clones from these extents.
         */
-       ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL);
+       ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL,
+                       NULL);
        if (ret < 0)
                return ret;
 
@@ -1101,16 +1108,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
                 */
                if (ino >= bctx->cur_objectid)
                        return 0;
-               /*if (ino > ctx->cur_objectid)
+#if 0
+               if (ino > bctx->cur_objectid)
                        return 0;
-               if (offset + ctx->extent_len > ctx->cur_offset)
-                       return 0;*/
-
-               bctx->found++;
-               found->found_refs++;
-               found->ino = ino;
-               found->offset = offset;
-               return 0;
+               if (offset + bctx->extent_len > bctx->cur_offset)
+                       return 0;
+#endif
        }
 
        bctx->found++;
@@ -1130,6 +1133,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 }
 
 /*
+ * Given an inode, offset and extent item, it finds a good clone for a clone
+ * instruction. Returns -ENOENT when none could be found. The function makes
+ * sure that the returned clone is usable at the point where sending is at the
+ * moment. This means, that no clones are accepted which lie behind the current
+ * inode+offset.
+ *
  * path must point to the extent item when called.
  */
 static int find_extent_clone(struct send_ctx *sctx,
@@ -1141,20 +1150,29 @@ static int find_extent_clone(struct send_ctx *sctx,
        int ret;
        int extent_type;
        u64 logical;
+       u64 disk_byte;
        u64 num_bytes;
        u64 extent_item_pos;
+       u64 flags = 0;
        struct btrfs_file_extent_item *fi;
        struct extent_buffer *eb = path->nodes[0];
-       struct backref_ctx backref_ctx;
+       struct backref_ctx *backref_ctx = NULL;
        struct clone_root *cur_clone_root;
        struct btrfs_key found_key;
        struct btrfs_path *tmp_path;
+       int compressed;
        u32 i;
 
        tmp_path = alloc_path_for_send();
        if (!tmp_path)
                return -ENOMEM;
 
+       backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
+       if (!backref_ctx) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
        if (data_offset >= ino_size) {
                /*
                 * There may be extents that lie behind the file's size.
@@ -1172,22 +1190,23 @@ static int find_extent_clone(struct send_ctx *sctx,
                ret = -ENOENT;
                goto out;
        }
+       compressed = btrfs_file_extent_compression(eb, fi);
 
        num_bytes = btrfs_file_extent_num_bytes(eb, fi);
-       logical = btrfs_file_extent_disk_bytenr(eb, fi);
-       if (logical == 0) {
+       disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+       if (disk_byte == 0) {
                ret = -ENOENT;
                goto out;
        }
-       logical += btrfs_file_extent_offset(eb, fi);
+       logical = disk_byte + btrfs_file_extent_offset(eb, fi);
 
-       ret = extent_from_logical(sctx->send_root->fs_info,
-                       logical, tmp_path, &found_key);
+       ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,
+                                 &found_key, &flags);
        btrfs_release_path(tmp_path);
 
        if (ret < 0)
                goto out;
-       if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+       if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                ret = -EIO;
                goto out;
        }
@@ -1202,12 +1221,12 @@ static int find_extent_clone(struct send_ctx *sctx,
                cur_clone_root->found_refs = 0;
        }
 
-       backref_ctx.sctx = sctx;
-       backref_ctx.found = 0;
-       backref_ctx.cur_objectid = ino;
-       backref_ctx.cur_offset = data_offset;
-       backref_ctx.found_in_send_root = 0;
-       backref_ctx.extent_len = num_bytes;
+       backref_ctx->sctx = sctx;
+       backref_ctx->found = 0;
+       backref_ctx->cur_objectid = ino;
+       backref_ctx->cur_offset = data_offset;
+       backref_ctx->found_itself = 0;
+       backref_ctx->extent_len = num_bytes;
 
        /*
         * The last extent of a file may be too large due to page alignment.
@@ -1215,25 +1234,31 @@ static int find_extent_clone(struct send_ctx *sctx,
         * __iterate_backrefs work.
         */
        if (data_offset + num_bytes >= ino_size)
-               backref_ctx.extent_len = ino_size - data_offset;
+               backref_ctx->extent_len = ino_size - data_offset;
 
        /*
         * Now collect all backrefs.
         */
+       if (compressed == BTRFS_COMPRESS_NONE)
+               extent_item_pos = logical - found_key.objectid;
+       else
+               extent_item_pos = 0;
+
        extent_item_pos = logical - found_key.objectid;
        ret = iterate_extent_inodes(sctx->send_root->fs_info,
                                        found_key.objectid, extent_item_pos, 1,
-                                       __iterate_backrefs, &backref_ctx);
+                                       __iterate_backrefs, backref_ctx);
+
        if (ret < 0)
                goto out;
 
-       if (!backref_ctx.found_in_send_root) {
+       if (!backref_ctx->found_itself) {
                /* found a bug in backref code? */
                ret = -EIO;
                printk(KERN_ERR "btrfs: ERROR did not find backref in "
                                "send_root. inode=%llu, offset=%llu, "
-                               "logical=%llu\n",
-                               ino, data_offset, logical);
+                               "disk_byte=%llu found extent=%llu\n",
+                               ino, data_offset, disk_byte, found_key.objectid);
                goto out;
        }
 
@@ -1242,7 +1267,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
                "num_bytes=%llu, logical=%llu\n",
                data_offset, ino, num_bytes, logical);
 
-       if (!backref_ctx.found)
+       if (!backref_ctx->found)
                verbose_printk("btrfs:    no clones found\n");
 
        cur_clone_root = NULL;
@@ -1253,7 +1278,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
                        else if (sctx->clone_roots[i].root == sctx->send_root)
                                /* prefer clones from send_root over others */
                                cur_clone_root = sctx->clone_roots + i;
-                       break;
                }
 
        }
@@ -1267,6 +1291,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
 
 out:
        btrfs_free_path(tmp_path);
+       kfree(backref_ctx);
        return ret;
 }
 
@@ -1307,8 +1332,6 @@ static int read_symlink(struct send_ctx *sctx,
        len = btrfs_file_extent_inline_len(path->nodes[0], ei);
 
        ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
-       if (ret < 0)
-               goto out;
 
 out:
        btrfs_free_path(path);
@@ -1404,7 +1427,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
        u64 right_gen;
 
        ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
-                       NULL);
+                       NULL, NULL);
        if (ret < 0 && ret != -ENOENT)
                goto out;
        left_ret = ret;
@@ -1413,16 +1436,16 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
                right_ret = -ENOENT;
        } else {
                ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
-                               NULL, NULL, NULL);
+                               NULL, NULL, NULL, NULL);
                if (ret < 0 && ret != -ENOENT)
                        goto out;
                right_ret = ret;
        }
 
        if (!left_ret && !right_ret) {
-               if (left_gen == gen && right_gen == gen)
+               if (left_gen == gen && right_gen == gen) {
                        ret = inode_state_no_change;
-               else if (left_gen == gen) {
+               } else if (left_gen == gen) {
                        if (ino < sctx->send_progress)
                                ret = inode_state_did_create;
                        else
@@ -1516,6 +1539,10 @@ out:
        return ret;
 }
 
+/*
+ * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
+ * generation of the parent dir and the name of the dir entry.
+ */
 static int get_first_ref(struct send_ctx *sctx,
                         struct btrfs_root *root, u64 ino,
                         u64 *dir, u64 *dir_gen, struct fs_path *name)
@@ -1557,7 +1584,7 @@ static int get_first_ref(struct send_ctx *sctx,
        btrfs_release_path(path);
 
        ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL,
-                       NULL);
+                       NULL, NULL);
        if (ret < 0)
                goto out;
 
@@ -1586,22 +1613,28 @@ static int is_first_ref(struct send_ctx *sctx,
        if (ret < 0)
                goto out;
 
-       if (name_len != fs_path_len(tmp_name)) {
+       if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
                ret = 0;
                goto out;
        }
 
-       ret = memcmp(tmp_name->start, name, name_len);
-       if (ret)
-               ret = 0;
-       else
-               ret = 1;
+       ret = !memcmp(tmp_name->start, name, name_len);
 
 out:
        fs_path_free(sctx, tmp_name);
        return ret;
 }
 
+/*
+ * Used by process_recorded_refs to determine if a new ref would overwrite an
+ * already existing ref. In case it detects an overwrite, it returns the
+ * inode/gen in who_ino/who_gen.
+ * When an overwrite is detected, process_recorded_refs does proper orphanizing
+ * to make sure later references to the overwritten inode are possible.
+ * Orphanizing is however only required for the first ref of an inode.
+ * process_recorded_refs does an additional is_first_ref check to see if
+ * orphanizing is really required.
+ */
 static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
                              const char *name, int name_len,
                              u64 *who_ino, u64 *who_gen)
@@ -1626,9 +1659,14 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
                goto out;
        }
 
+       /*
+        * Check if the overwritten ref was already processed. If yes, the ref
+        * was already unlinked/moved, so we can safely assume that we will not
+        * overwrite anything at this point in time.
+        */
        if (other_inode > sctx->send_progress) {
                ret = get_inode_info(sctx->parent_root, other_inode, NULL,
-                               who_gen, NULL, NULL, NULL);
+                               who_gen, NULL, NULL, NULL, NULL);
                if (ret < 0)
                        goto out;
 
@@ -1642,6 +1680,13 @@ out:
        return ret;
 }
 
+/*
+ * Checks if the ref was overwritten by an already processed inode. This is
+ * used by __get_cur_name_and_parent to find out if the ref was orphanized and
+ * thus the orphan name needs be used.
+ * process_recorded_refs also uses it to avoid unlinking of refs that were
+ * overwritten.
+ */
 static int did_overwrite_ref(struct send_ctx *sctx,
                            u64 dir, u64 dir_gen,
                            u64 ino, u64 ino_gen,
@@ -1671,7 +1716,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
        }
 
        ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
-                       NULL);
+                       NULL, NULL);
        if (ret < 0)
                goto out;
 
@@ -1690,6 +1735,11 @@ out:
        return ret;
 }
 
+/*
+ * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
+ * that got overwritten. This is used by process_recorded_refs to determine
+ * if it has to use the path as returned by get_cur_path or the orphan name.
+ */
 static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
 {
        int ret = 0;
@@ -1710,39 +1760,40 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
 
        ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
                        name->start, fs_path_len(name));
-       if (ret < 0)
-               goto out;
 
 out:
        fs_path_free(sctx, name);
        return ret;
 }
 
+/*
+ * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
+ * so we need to do some special handling in case we have clashes. This function
+ * takes care of this with the help of name_cache_entry::radix_list.
+ * In case of error, nce is kfreed.
+ */
 static int name_cache_insert(struct send_ctx *sctx,
                             struct name_cache_entry *nce)
 {
        int ret = 0;
-       struct name_cache_entry **ncea;
-
-       ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
-       if (ncea) {
-               if (!ncea[0])
-                       ncea[0] = nce;
-               else if (!ncea[1])
-                       ncea[1] = nce;
-               else
-                       BUG();
-       } else {
-               ncea = kmalloc(sizeof(void *) * 2, GFP_NOFS);
-               if (!ncea)
+       struct list_head *nce_head;
+
+       nce_head = radix_tree_lookup(&sctx->name_cache,
+                       (unsigned long)nce->ino);
+       if (!nce_head) {
+               nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
+               if (!nce_head)
                        return -ENOMEM;
+               INIT_LIST_HEAD(nce_head);
 
-               ncea[0] = nce;
-               ncea[1] = NULL;
-               ret = radix_tree_insert(&sctx->name_cache, nce->ino, ncea);
-               if (ret < 0)
+               ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
+               if (ret < 0) {
+                       kfree(nce_head);
+                       kfree(nce);
                        return ret;
+               }
        }
+       list_add_tail(&nce->radix_list, nce_head);
        list_add_tail(&nce->list, &sctx->name_cache_list);
        sctx->name_cache_size++;
 
@@ -1752,50 +1803,52 @@ static int name_cache_insert(struct send_ctx *sctx,
 static void name_cache_delete(struct send_ctx *sctx,
                              struct name_cache_entry *nce)
 {
-       struct name_cache_entry **ncea;
-
-       ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
-       BUG_ON(!ncea);
-
-       if (ncea[0] == nce)
-               ncea[0] = NULL;
-       else if (ncea[1] == nce)
-               ncea[1] = NULL;
-       else
-               BUG();
+       struct list_head *nce_head;
 
-       if (!ncea[0] && !ncea[1]) {
-               radix_tree_delete(&sctx->name_cache, nce->ino);
-               kfree(ncea);
-       }
+       nce_head = radix_tree_lookup(&sctx->name_cache,
+                       (unsigned long)nce->ino);
+       BUG_ON(!nce_head);
 
+       list_del(&nce->radix_list);
        list_del(&nce->list);
-
        sctx->name_cache_size--;
+
+       if (list_empty(nce_head)) {
+               radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
+               kfree(nce_head);
+       }
 }
 
 static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
                                                    u64 ino, u64 gen)
 {
-       struct name_cache_entry **ncea;
+       struct list_head *nce_head;
+       struct name_cache_entry *cur;
 
-       ncea = radix_tree_lookup(&sctx->name_cache, ino);
-       if (!ncea)
+       nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
+       if (!nce_head)
                return NULL;
 
-       if (ncea[0] && ncea[0]->gen == gen)
-               return ncea[0];
-       else if (ncea[1] && ncea[1]->gen == gen)
-               return ncea[1];
+       list_for_each_entry(cur, nce_head, radix_list) {
+               if (cur->ino == ino && cur->gen == gen)
+                       return cur;
+       }
        return NULL;
 }
 
+/*
+ * Removes the entry from the list and adds it back to the end. This marks the
+ * entry as recently used so that name_cache_clean_unused does not remove it.
+ */
 static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
 {
        list_del(&nce->list);
        list_add_tail(&nce->list, &sctx->name_cache_list);
 }
 
+/*
+ * Remove some entries from the beginning of name_cache_list.
+ */
 static void name_cache_clean_unused(struct send_ctx *sctx)
 {
        struct name_cache_entry *nce;
@@ -1814,13 +1867,23 @@ static void name_cache_clean_unused(struct send_ctx *sctx)
 static void name_cache_free(struct send_ctx *sctx)
 {
        struct name_cache_entry *nce;
-       struct name_cache_entry *tmp;
 
-       list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) {
+       while (!list_empty(&sctx->name_cache_list)) {
+               nce = list_entry(sctx->name_cache_list.next,
+                               struct name_cache_entry, list);
                name_cache_delete(sctx, nce);
+               kfree(nce);
        }
 }
 
+/*
+ * Used by get_cur_path for each ref up to the root.
+ * Returns 0 if it succeeded.
+ * Returns 1 if the inode is not existent or got overwritten. In that case, the
+ * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
+ * is returned, parent_ino/parent_gen are not guaranteed to be valid.
+ * Returns <0 in case of error.
+ */
 static int __get_cur_name_and_parent(struct send_ctx *sctx,
                                     u64 ino, u64 gen,
                                     u64 *parent_ino,
@@ -1832,6 +1895,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
        struct btrfs_path *path = NULL;
        struct name_cache_entry *nce = NULL;
 
+       /*
+        * First check if we already did a call to this function with the same
+        * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
+        * return the cached result.
+        */
        nce = name_cache_search(sctx, ino, gen);
        if (nce) {
                if (ino < sctx->send_progress && nce->need_later_update) {
@@ -1854,6 +1922,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
        if (!path)
                return -ENOMEM;
 
+       /*
+        * If the inode is not existent yet, add the orphan name and return 1.
+        * This should only happen for the parent dir that we determine in
+        * __record_new_ref
+        */
        ret = is_inode_existent(sctx, ino, gen);
        if (ret < 0)
                goto out;
@@ -1866,6 +1939,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
                goto out_cache;
        }
 
+       /*
+        * Depending on whether the inode was already processed or not, use
+        * send_root or parent_root for ref lookup.
+        */
        if (ino < sctx->send_progress)
                ret = get_first_ref(sctx, sctx->send_root, ino,
                                parent_ino, parent_gen, dest);
@@ -1875,6 +1952,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
        if (ret < 0)
                goto out;
 
+       /*
+        * Check if the ref was overwritten by an inode's ref that was processed
+        * earlier. If yes, treat as orphan and return 1.
+        */
        ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
                        dest->start, dest->end - dest->start);
        if (ret < 0)
@@ -1888,6 +1969,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
        }
 
 out_cache:
+       /*
+        * Store the result of the lookup in the name cache.
+        */
        nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
        if (!nce) {
                ret = -ENOMEM;
@@ -1901,7 +1985,6 @@ out_cache:
        nce->name_len = fs_path_len(dest);
        nce->ret = ret;
        strcpy(nce->name, dest->start);
-       memset(&nce->use_list, 0, sizeof(nce->use_list));
 
        if (ino < sctx->send_progress)
                nce->need_later_update = 0;
@@ -2107,9 +2190,6 @@ static int send_subvol_begin(struct send_ctx *sctx)
        read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
        btrfs_release_path(path);
 
-       if (ret < 0)
-               goto out;
-
        if (parent_root) {
                ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
                if (ret < 0)
@@ -2276,7 +2356,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
                        btrfs_inode_mtime(ii));
        TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
                        btrfs_inode_ctime(ii));
-       /* TODO otime? */
+       /* TODO Add otime support when the otime patches get into upstream */
 
        ret = send_cmd(sctx);
 
@@ -2292,39 +2372,39 @@ out:
  * a valid path yet because we did not process the refs yet. So, the inode
  * is created as orphan.
  */
-static int send_create_inode(struct send_ctx *sctx, struct btrfs_path *path,
-                            struct btrfs_key *key)
+static int send_create_inode(struct send_ctx *sctx, u64 ino)
 {
        int ret = 0;
-       struct extent_buffer *eb = path->nodes[0];
-       struct btrfs_inode_item *ii;
        struct fs_path *p;
-       int slot = path->slots[0];
        int cmd;
+       u64 gen;
        u64 mode;
+       u64 rdev;
 
-verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);
+verbose_printk("btrfs: send_create_inode %llu\n", ino);
 
        p = fs_path_alloc(sctx);
        if (!p)
                return -ENOMEM;
 
-       ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
-       mode = btrfs_inode_mode(eb, ii);
+       ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL,
+                       NULL, &rdev);
+       if (ret < 0)
+               goto out;
 
-       if (S_ISREG(mode))
+       if (S_ISREG(mode)) {
                cmd = BTRFS_SEND_C_MKFILE;
-       else if (S_ISDIR(mode))
+       } else if (S_ISDIR(mode)) {
                cmd = BTRFS_SEND_C_MKDIR;
-       else if (S_ISLNK(mode))
+       } else if (S_ISLNK(mode)) {
                cmd = BTRFS_SEND_C_SYMLINK;
-       else if (S_ISCHR(mode) || S_ISBLK(mode))
+       } else if (S_ISCHR(mode) || S_ISBLK(mode)) {
                cmd = BTRFS_SEND_C_MKNOD;
-       else if (S_ISFIFO(mode))
+       } else if (S_ISFIFO(mode)) {
                cmd = BTRFS_SEND_C_MKFIFO;
-       else if (S_ISSOCK(mode))
+       } else if (S_ISSOCK(mode)) {
                cmd = BTRFS_SEND_C_MKSOCK;
-       else {
+       } else {
                printk(KERN_WARNING "btrfs: unexpected inode type %o",
                                (int)(mode & S_IFMT));
                ret = -ENOTSUPP;
@@ -2335,22 +2415,22 @@ verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);
        if (ret < 0)
                goto out;
 
-       ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+       ret = gen_unique_name(sctx, ino, gen, p);
        if (ret < 0)
                goto out;
 
        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
-       TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, sctx->cur_ino);
+       TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
 
        if (S_ISLNK(mode)) {
                fs_path_reset(p);
-               ret = read_symlink(sctx, sctx->send_root, sctx->cur_ino, p);
+               ret = read_symlink(sctx, sctx->send_root, ino, p);
                if (ret < 0)
                        goto out;
                TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
        } else if (S_ISCHR(mode) || S_ISBLK(mode) ||
                   S_ISFIFO(mode) || S_ISSOCK(mode)) {
-               TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, btrfs_inode_rdev(eb, ii));
+               TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, rdev);
        }
 
        ret = send_cmd(sctx);
@@ -2364,6 +2444,92 @@ out:
        return ret;
 }
 
+/*
+ * We need some special handling for inodes that get processed before the parent
+ * directory got created. See process_recorded_refs for details.
+ * This function checks whether we already created the dir out of order.
+ */
+static int did_create_dir(struct send_ctx *sctx, u64 dir)
+{
+       int ret = 0;
+       struct btrfs_path *path = NULL;
+       struct btrfs_key key;
+       struct btrfs_key found_key;
+       struct btrfs_key di_key;
+       struct extent_buffer *eb;
+       struct btrfs_dir_item *di;
+       int slot;
+
+       path = alloc_path_for_send();
+       if (!path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       key.objectid = dir;
+       key.type = BTRFS_DIR_INDEX_KEY;
+       key.offset = 0;
+       while (1) {
+               ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
+                               1, 0);
+               if (ret < 0)
+                       goto out;
+               if (!ret) {
+                       eb = path->nodes[0];
+                       slot = path->slots[0];
+                       btrfs_item_key_to_cpu(eb, &found_key, slot);
+               }
+               if (ret || found_key.objectid != key.objectid ||
+                   found_key.type != key.type) {
+                       ret = 0;
+                       goto out;
+               }
+
+               di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+               btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+
+               if (di_key.objectid < sctx->send_progress) {
+                       ret = 1;
+                       goto out;
+               }
+
+               key.offset = found_key.offset + 1;
+               btrfs_release_path(path);
+       }
+
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * Only creates the inode if it is:
+ * 1. Not a directory
+ * 2. Or a directory which was not created already due to out of order
+ *    directories. See did_create_dir and process_recorded_refs for details.
+ */
+static int send_create_inode_if_needed(struct send_ctx *sctx)
+{
+       int ret;
+
+       if (S_ISDIR(sctx->cur_inode_mode)) {
+               ret = did_create_dir(sctx, sctx->cur_ino);
+               if (ret < 0)
+                       goto out;
+               if (ret) {
+                       ret = 0;
+                       goto out;
+               }
+       }
+
+       ret = send_create_inode(sctx, sctx->cur_ino);
+       if (ret < 0)
+               goto out;
+
+out:
+       return ret;
+}
+
 struct recorded_ref {
        struct list_head list;
        char *dir_path;
@@ -2416,13 +2582,13 @@ static int record_ref(struct list_head *head, u64 dir,
 static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
 {
        struct recorded_ref *cur;
-       struct recorded_ref *tmp;
 
-       list_for_each_entry_safe(cur, tmp, head, list) {
+       while (!list_empty(head)) {
+               cur = list_entry(head->next, struct recorded_ref, list);
                fs_path_free(sctx, cur->full_path);
+               list_del(&cur->list);
                kfree(cur);
        }
-       INIT_LIST_HEAD(head);
 }
 
 static void free_recorded_refs(struct send_ctx *sctx)
@@ -2432,7 +2598,7 @@ static void free_recorded_refs(struct send_ctx *sctx)
 }
 
 /*
- * Renames/moves a file/dir to it's orphan name. Used when the first
+ * Renames/moves a file/dir to its orphan name. Used when the first
  * ref of an unprocessed inode gets overwritten and for all non empty
  * directories.
  */
@@ -2472,6 +2638,12 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
        struct btrfs_key loc;
        struct btrfs_dir_item *di;
 
+       /*
+        * Don't try to rmdir the top/root subvolume dir.
+        */
+       if (dir == BTRFS_FIRST_FREE_OBJECTID)
+               return 0;
+
        path = alloc_path_for_send();
        if (!path)
                return -ENOMEM;
@@ -2513,160 +2685,6 @@ out:
        return ret;
 }
 
-struct finish_unordered_dir_ctx {
-       struct send_ctx *sctx;
-       struct fs_path *cur_path;
-       struct fs_path *dir_path;
-       u64 dir_ino;
-       int need_delete;
-       int delete_pass;
-};
-
-int __finish_unordered_dir(int num, struct btrfs_key *di_key,
-                          const char *name, int name_len,
-                          const char *data, int data_len,
-                          u8 type, void *ctx)
-{
-       int ret = 0;
-       struct finish_unordered_dir_ctx *fctx = ctx;
-       struct send_ctx *sctx = fctx->sctx;
-       u64 di_gen;
-       u64 di_mode;
-       int is_orphan = 0;
-
-       if (di_key->objectid >= fctx->dir_ino)
-               goto out;
-
-       fs_path_reset(fctx->cur_path);
-
-       ret = get_inode_info(sctx->send_root, di_key->objectid,
-                       NULL, &di_gen, &di_mode, NULL, NULL);
-       if (ret < 0)
-               goto out;
-
-       ret = is_first_ref(sctx, sctx->send_root, di_key->objectid,
-                       fctx->dir_ino, name, name_len);
-       if (ret < 0)
-               goto out;
-       if (ret) {
-               is_orphan = 1;
-               ret = gen_unique_name(sctx, di_key->objectid, di_gen,
-                               fctx->cur_path);
-       } else {
-               ret = get_cur_path(sctx, di_key->objectid, di_gen,
-                               fctx->cur_path);
-       }
-       if (ret < 0)
-               goto out;
-
-       ret = fs_path_add(fctx->dir_path, name, name_len);
-       if (ret < 0)
-               goto out;
-
-       if (!fctx->delete_pass) {
-               if (S_ISDIR(di_mode)) {
-                       ret = send_rename(sctx, fctx->cur_path,
-                                       fctx->dir_path);
-               } else {
-                       ret = send_link(sctx, fctx->dir_path,
-                                       fctx->cur_path);
-                       if (is_orphan)
-                               fctx->need_delete = 1;
-               }
-       } else if (!S_ISDIR(di_mode)) {
-               ret = send_unlink(sctx, fctx->cur_path);
-       } else {
-               ret = 0;
-       }
-
-       fs_path_remove(fctx->dir_path);
-
-out:
-       return ret;
-}
-
-/*
- * Go through all dir items and see if we find refs which could not be created
- * in the past because the dir did not exist at that time.
- */
-static int finish_outoforder_dir(struct send_ctx *sctx, u64 dir, u64 dir_gen)
-{
-       int ret = 0;
-       struct btrfs_path *path = NULL;
-       struct btrfs_key key;
-       struct btrfs_key found_key;
-       struct extent_buffer *eb;
-       struct finish_unordered_dir_ctx fctx;
-       int slot;
-
-       path = alloc_path_for_send();
-       if (!path) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       memset(&fctx, 0, sizeof(fctx));
-       fctx.sctx = sctx;
-       fctx.cur_path = fs_path_alloc(sctx);
-       fctx.dir_path = fs_path_alloc(sctx);
-       if (!fctx.cur_path || !fctx.dir_path) {
-               ret = -ENOMEM;
-               goto out;
-       }
-       fctx.dir_ino = dir;
-
-       ret = get_cur_path(sctx, dir, dir_gen, fctx.dir_path);
-       if (ret < 0)
-               goto out;
-
-       /*
-        * We do two passes. The first links in the new refs and the second
-        * deletes orphans if required. Deletion of orphans is not required for
-        * directory inodes, as we always have only one ref and use rename
-        * instead of link for those.
-        */
-
-again:
-       key.objectid = dir;
-       key.type = BTRFS_DIR_ITEM_KEY;
-       key.offset = 0;
-       while (1) {
-               ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
-                               1, 0);
-               if (ret < 0)
-                       goto out;
-               eb = path->nodes[0];
-               slot = path->slots[0];
-               btrfs_item_key_to_cpu(eb, &found_key, slot);
-
-               if (found_key.objectid != key.objectid ||
-                   found_key.type != key.type) {
-                       btrfs_release_path(path);
-                       break;
-               }
-
-               ret = iterate_dir_item(sctx, sctx->send_root, path,
-                               &found_key, __finish_unordered_dir,
-                               &fctx);
-               if (ret < 0)
-                       goto out;
-
-               key.offset = found_key.offset + 1;
-               btrfs_release_path(path);
-       }
-
-       if (!fctx.delete_pass && fctx.need_delete) {
-               fctx.delete_pass = 1;
-               goto again;
-       }
-
-out:
-       btrfs_free_path(path);
-       fs_path_free(sctx, fctx.cur_path);
-       fs_path_free(sctx, fctx.dir_path);
-       return ret;
-}
-
 /*
  * This does all the move/link/unlink/rmdir magic.
  */
@@ -2674,6 +2692,7 @@ static int process_recorded_refs(struct send_ctx *sctx)
 {
        int ret = 0;
        struct recorded_ref *cur;
+       struct recorded_ref *cur2;
        struct ulist *check_dirs = NULL;
        struct ulist_iterator uit;
        struct ulist_node *un;
@@ -2685,6 +2704,12 @@ static int process_recorded_refs(struct send_ctx *sctx)
 
 verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 
+       /*
+        * This should never happen as the root dir always has the same ref
+        * which is always '..'
+        */
+       BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
+
        valid_path = fs_path_alloc(sctx);
        if (!valid_path) {
                ret = -ENOMEM;
@@ -2730,6 +2755,46 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
        }
 
        list_for_each_entry(cur, &sctx->new_refs, list) {
+               /*
+                * We may have refs where the parent directory does not exist
+                * yet. This happens if the parent directory's inum is higher
+                * than the current inum. To handle this case, we create the
+                * parent directory out of order. But we need to check if this
+                * did already happen before due to other refs in the same dir.
+                */
+               ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+               if (ret < 0)
+                       goto out;
+               if (ret == inode_state_will_create) {
+                       ret = 0;
+                       /*
+                        * First check if any of the current inode's refs did
+                        * already create the dir.
+                        */
+                       list_for_each_entry(cur2, &sctx->new_refs, list) {
+                               if (cur == cur2)
+                                       break;
+                               if (cur2->dir == cur->dir) {
+                                       ret = 1;
+                                       break;
+                               }
+                       }
+
+                       /*
+                        * If that did not happen, check if a previous inode
+                        * did already create the dir.
+                        */
+                       if (!ret)
+                               ret = did_create_dir(sctx, cur->dir);
+                       if (ret < 0)
+                               goto out;
+                       if (!ret) {
+                               ret = send_create_inode(sctx, cur->dir);
+                               if (ret < 0)
+                                       goto out;
+                       }
+               }
+
                /*
                 * Check if this new ref would overwrite the first ref of
                 * another unprocessed inode. If yes, orphanize the
@@ -2764,7 +2829,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                 * inode, move it and update valid_path. If not, link or move
                 * it depending on the inode mode.
                 */
-               if (is_orphan && !sctx->cur_inode_first_ref_orphan) {
+               if (is_orphan) {
                        ret = send_rename(sctx, valid_path, cur->full_path);
                        if (ret < 0)
                                goto out;
@@ -2827,6 +2892,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                        if (ret < 0)
                                goto out;
                }
+       } else if (S_ISDIR(sctx->cur_inode_mode) &&
+                  !list_empty(&sctx->deleted_refs)) {
+               /*
+                * We have a moved dir. Add the old parent to check_dirs
+                */
+               cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
+                               list);
+               ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
+                               GFP_NOFS);
+               if (ret < 0)
+                       goto out;
        } else if (!S_ISDIR(sctx->cur_inode_mode)) {
                /*
                 * We have a non dir inode. Go through all deleted refs and
@@ -2840,35 +2916,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                        if (ret < 0)
                                goto out;
                        if (!ret) {
-                               /*
-                                * In case the inode was moved to a directory
-                                * that was not created yet (see
-                                * __record_new_ref), we can not unlink the ref
-                                * as it will be needed later when the parent
-                                * directory is created, so that we can move in
-                                * the inode to the new dir.
-                                */
-                               if (!is_orphan &&
-                                   sctx->cur_inode_first_ref_orphan) {
-                                       ret = orphanize_inode(sctx,
-                                                       sctx->cur_ino,
-                                                       sctx->cur_inode_gen,
-                                                       cur->full_path);
-                                       if (ret < 0)
-                                               goto out;
-                                       ret = gen_unique_name(sctx,
-                                                       sctx->cur_ino,
-                                                       sctx->cur_inode_gen,
-                                                       valid_path);
-                                       if (ret < 0)
-                                               goto out;
-                                       is_orphan = 1;
-
-                               } else {
-                                       ret = send_unlink(sctx, cur->full_path);
-                                       if (ret < 0)
-                                               goto out;
-                               }
+                               ret = send_unlink(sctx, cur->full_path);
+                               if (ret < 0)
+                                       goto out;
                        }
                        ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
                                        GFP_NOFS);
@@ -2880,12 +2930,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                 * If the inode is still orphan, unlink the orphan. This may
                 * happen when a previous inode did overwrite the first ref
                 * of this inode and no new refs were added for the current
-                * inode.
-                * We can however not delete the orphan in case the inode relies
-                * in a directory that was not created yet (see
-                * __record_new_ref)
+                * inode. Unlinking does not mean that the inode is deleted in
+                * all cases. There may still be links to this inode in other
+                * places.
                 */
-               if (is_orphan && !sctx->cur_inode_first_ref_orphan) {
+               if (is_orphan) {
                        ret = send_unlink(sctx, valid_path);
                        if (ret < 0)
                                goto out;
@@ -2900,6 +2949,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
         */
        ULIST_ITER_INIT(&uit);
        while ((un = ulist_next(check_dirs, &uit))) {
+               /*
+                * In case we had refs into dirs that were not processed yet,
+                * we don't need to do the utime and rmdir logic for these dirs.
+                * The dir will be processed later.
+                */
                if (un->val > sctx->cur_ino)
                        continue;
 
@@ -2929,25 +2983,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                }
        }
 
-       /*
-        * Current inode is now at it's new position, so we must increase
-        * send_progress
-        */
-       sctx->send_progress = sctx->cur_ino + 1;
-
-       /*
-        * We may have a directory here that has pending refs which could not
-        * be created before (because the dir did not exist before, see
-        * __record_new_ref). finish_outoforder_dir will link/move the pending
-        * refs.
-        */
-       if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_new) {
-               ret = finish_outoforder_dir(sctx, sctx->cur_ino,
-                               sctx->cur_inode_gen);
-               if (ret < 0)
-                       goto out;
-       }
-
        ret = 0;
 
 out:
@@ -2971,34 +3006,9 @@ static int __record_new_ref(int num, u64 dir, int index,
                return -ENOMEM;
 
        ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL,
-                       NULL);
-       if (ret < 0)
-               goto out;
-
-       /*
-        * The parent may be non-existent at this point in time. This happens
-        * if the ino of the parent dir is higher then the current ino. In this
-        * case, we can not process this ref until the parent dir is finally
-        * created. If we reach the parent dir later, process_recorded_refs
-        * will go through all dir items and process the refs that could not be
-        * processed before. In case this is the first ref, we set
-        * cur_inode_first_ref_orphan to 1 to inform process_recorded_refs to
-        * keep an orphan of the inode so that it later can be used for
-        * link/move
-        */
-       ret = is_inode_existent(sctx, dir, gen);
+                       NULL, NULL);
        if (ret < 0)
                goto out;
-       if (!ret) {
-               ret = is_first_ref(sctx, sctx->send_root, sctx->cur_ino, dir,
-                               name->start, fs_path_len(name));
-               if (ret < 0)
-                       goto out;
-               if (ret)
-                       sctx->cur_inode_first_ref_orphan = 1;
-               ret = 0;
-               goto out;
-       }
 
        ret = get_cur_path(sctx, dir, gen, p);
        if (ret < 0)
@@ -3029,7 +3039,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
                return -ENOMEM;
 
        ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
-                       NULL);
+                       NULL, NULL);
        if (ret < 0)
                goto out;
 
@@ -3206,33 +3216,28 @@ static int process_all_refs(struct send_ctx *sctx,
        key.offset = 0;
        while (1) {
                ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
-               if (ret < 0) {
-                       btrfs_release_path(path);
+               if (ret < 0)
                        goto out;
-               }
-               if (ret) {
-                       btrfs_release_path(path);
+               if (ret)
                        break;
-               }
 
                eb = path->nodes[0];
                slot = path->slots[0];
                btrfs_item_key_to_cpu(eb, &found_key, slot);
 
                if (found_key.objectid != key.objectid ||
-                   found_key.type != key.type) {
-                       btrfs_release_path(path);
+                   found_key.type != key.type)
                        break;
-               }
 
-               ret = iterate_inode_ref(sctx, sctx->parent_root, path,
-                               &found_key, 0, cb, sctx);
+               ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb,
+                               sctx);
                btrfs_release_path(path);
                if (ret < 0)
                        goto out;
 
                key.offset = found_key.offset + 1;
        }
+       btrfs_release_path(path);
 
        ret = process_recorded_refs(sctx);
 
@@ -3555,7 +3560,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
        int ret = 0;
        struct fs_path *p;
        loff_t pos = offset;
-       int readed = 0;
+       int num_read = 0;
        mm_segment_t old_fs;
 
        p = fs_path_alloc(sctx);
@@ -3580,8 +3585,8 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
        ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos);
        if (ret < 0)
                goto out;
-       readed = ret;
-       if (!readed)
+       num_read = ret;
+       if (!num_read)
                goto out;
 
        ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
@@ -3594,7 +3599,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
 
        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
        TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
-       TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, readed);
+       TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
 
        ret = send_cmd(sctx);
 
@@ -3604,7 +3609,7 @@ out:
        set_fs(old_fs);
        if (ret < 0)
                return ret;
-       return readed;
+       return num_read;
 }
 
 /*
@@ -3615,7 +3620,6 @@ static int send_clone(struct send_ctx *sctx,
                      struct clone_root *clone_root)
 {
        int ret = 0;
-       struct btrfs_root *clone_root2 = clone_root->root;
        struct fs_path *p;
        u64 gen;
 
@@ -3640,22 +3644,23 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
        TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
 
-       if (clone_root2 == sctx->send_root) {
+       if (clone_root->root == sctx->send_root) {
                ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
-                               &gen, NULL, NULL, NULL);
+                               &gen, NULL, NULL, NULL, NULL);
                if (ret < 0)
                        goto out;
                ret = get_cur_path(sctx, clone_root->ino, gen, p);
        } else {
-               ret = get_inode_path(sctx, clone_root2, clone_root->ino, p);
+               ret = get_inode_path(sctx, clone_root->root,
+                               clone_root->ino, p);
        }
        if (ret < 0)
                goto out;
 
        TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
-                       clone_root2->root_item.uuid);
+                       clone_root->root->root_item.uuid);
        TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
-                       clone_root2->root_item.ctransid);
+                       clone_root->root->root_item.ctransid);
        TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
        TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
                        clone_root->offset);
@@ -3684,10 +3689,17 @@ static int send_write_or_clone(struct send_ctx *sctx,
        ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
                        struct btrfs_file_extent_item);
        type = btrfs_file_extent_type(path->nodes[0], ei);
-       if (type == BTRFS_FILE_EXTENT_INLINE)
+       if (type == BTRFS_FILE_EXTENT_INLINE) {
                len = btrfs_file_extent_inline_len(path->nodes[0], ei);
-       else
+               /*
+                * it is possible the inline item won't cover the whole page,
+                * but there may be items after this page.  Make
+                * sure to send the whole thing
+                */
+               len = PAGE_CACHE_ALIGN(len);
+       } else {
                len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
+       }
 
        if (offset + len > sctx->cur_inode_size)
                len = sctx->cur_inode_size - offset;
@@ -3735,6 +3747,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
        u64 left_offset_fixed;
        u64 left_len;
        u64 right_len;
+       u64 left_gen;
+       u64 right_gen;
        u8 left_type;
        u8 right_type;
 
@@ -3744,17 +3758,17 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 
        eb = left_path->nodes[0];
        slot = left_path->slots[0];
-
        ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
        left_type = btrfs_file_extent_type(eb, ei);
-       left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
-       left_len = btrfs_file_extent_num_bytes(eb, ei);
-       left_offset = btrfs_file_extent_offset(eb, ei);
 
        if (left_type != BTRFS_FILE_EXTENT_REG) {
                ret = 0;
                goto out;
        }
+       left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+       left_len = btrfs_file_extent_num_bytes(eb, ei);
+       left_offset = btrfs_file_extent_offset(eb, ei);
+       left_gen = btrfs_file_extent_generation(eb, ei);
 
        /*
         * Following comments will refer to these graphics. L is the left
@@ -3810,6 +3824,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
                right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
                right_len = btrfs_file_extent_num_bytes(eb, ei);
                right_offset = btrfs_file_extent_offset(eb, ei);
+               right_gen = btrfs_file_extent_generation(eb, ei);
 
                if (right_type != BTRFS_FILE_EXTENT_REG) {
                        ret = 0;
@@ -3820,7 +3835,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
                 * Are we at extent 8? If yes, we know the extent is changed.
                 * This may only happen on the first iteration.
                 */
-               if (found_key.offset + right_len < ekey->offset) {
+               if (found_key.offset + right_len <= ekey->offset) {
                        ret = 0;
                        goto out;
                }
@@ -3837,8 +3852,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
                /*
                 * Check if we have the same extent.
                 */
-               if (left_disknr + left_offset_fixed !=
-                               right_disknr + right_offset) {
+               if (left_disknr != right_disknr ||
+                   left_offset_fixed != right_offset ||
+                   left_gen != right_gen) {
                        ret = 0;
                        goto out;
                }
@@ -3977,6 +3993,15 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
                goto out;
 
        ret = process_recorded_refs(sctx);
+       if (ret < 0)
+               goto out;
+
+       /*
+        * We have processed the refs and thus need to advance send_progress.
+        * Now, calls to get_cur_xxx will take the updated refs of the current
+        * inode into account.
+        */
+       sctx->send_progress = sctx->cur_ino + 1;
 
 out:
        return ret;
@@ -4004,7 +4029,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
                goto out;
 
        ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
-                       &left_mode, &left_uid, &left_gid);
+                       &left_mode, &left_uid, &left_gid, NULL);
        if (ret < 0)
                goto out;
 
@@ -4015,7 +4040,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
                } else {
                        ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
                                        NULL, NULL, &right_mode, &right_uid,
-                                       &right_gid);
+                                       &right_gid, NULL);
                        if (ret < 0)
                                goto out;
 
@@ -4074,7 +4099,12 @@ static int changed_inode(struct send_ctx *sctx,
 
        sctx->cur_ino = key->objectid;
        sctx->cur_inode_new_gen = 0;
-       sctx->cur_inode_first_ref_orphan = 0;
+
+       /*
+        * Set send_progress to current inode. This will tell all get_cur_xxx
+        * functions that the current inode's refs are not updated yet. Later,
+        * when process_recorded_refs is finished, it is set to cur_ino + 1.
+        */
        sctx->send_progress = sctx->cur_ino;
 
        if (result == BTRFS_COMPARE_TREE_NEW ||
@@ -4098,7 +4128,14 @@ static int changed_inode(struct send_ctx *sctx,
 
                right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
                                right_ii);
-               if (left_gen != right_gen)
+
+               /*
+                * The cur_ino = root dir case is special here. We can't treat
+                * the inode as deleted+reused because it would generate a
+                * stream that tries to delete/mkdir the root dir.
+                */
+               if (left_gen != right_gen &&
+                   sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
                        sctx->cur_inode_new_gen = 1;
        }
 
@@ -4111,8 +4148,7 @@ static int changed_inode(struct send_ctx *sctx,
                sctx->cur_inode_mode = btrfs_inode_mode(
                                sctx->left_path->nodes[0], left_ii);
                if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
-                       ret = send_create_inode(sctx, sctx->left_path,
-                                       sctx->cmp_key);
+                       ret = send_create_inode_if_needed(sctx);
        } else if (result == BTRFS_COMPARE_TREE_DELETED) {
                sctx->cur_inode_gen = right_gen;
                sctx->cur_inode_new = 0;
@@ -4122,7 +4158,17 @@ static int changed_inode(struct send_ctx *sctx,
                sctx->cur_inode_mode = btrfs_inode_mode(
                                sctx->right_path->nodes[0], right_ii);
        } else if (result == BTRFS_COMPARE_TREE_CHANGED) {
+               /*
+                * We need to do some special handling in case the inode was
+                * reported as changed with a changed generation number. This
+                * means that the original inode was deleted and new inode
+                * reused the same inum. So we have to treat the old inode as
+                * deleted and the new one as new.
+                */
                if (sctx->cur_inode_new_gen) {
+                       /*
+                        * First, process the inode as if it was deleted.
+                        */
                        sctx->cur_inode_gen = right_gen;
                        sctx->cur_inode_new = 0;
                        sctx->cur_inode_deleted = 1;
@@ -4135,6 +4181,9 @@ static int changed_inode(struct send_ctx *sctx,
                        if (ret < 0)
                                goto out;
 
+                       /*
+                        * Now process the inode as if it was new.
+                        */
                        sctx->cur_inode_gen = left_gen;
                        sctx->cur_inode_new = 1;
                        sctx->cur_inode_deleted = 0;
@@ -4142,14 +4191,23 @@ static int changed_inode(struct send_ctx *sctx,
                                        sctx->left_path->nodes[0], left_ii);
                        sctx->cur_inode_mode = btrfs_inode_mode(
                                        sctx->left_path->nodes[0], left_ii);
-                       ret = send_create_inode(sctx, sctx->left_path,
-                                       sctx->cmp_key);
+                       ret = send_create_inode_if_needed(sctx);
                        if (ret < 0)
                                goto out;
 
                        ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
                        if (ret < 0)
                                goto out;
+                       /*
+                        * Advance send_progress now as we did not get into
+                        * process_recorded_refs_if_needed in the new_gen case.
+                        */
+                       sctx->send_progress = sctx->cur_ino + 1;
+
+                       /*
+                        * Now process all extents and xattrs of the inode as if
+                        * they were all new.
+                        */
                        ret = process_all_extents(sctx);
                        if (ret < 0)
                                goto out;
@@ -4172,6 +4230,16 @@ out:
        return ret;
 }
 
+/*
+ * We have to process new refs before deleted refs, but compare_trees gives us
+ * the new and deleted refs mixed. To fix this, we record the new/deleted refs
+ * first and later process them in process_recorded_refs.
+ * For the cur_inode_new_gen case, we skip recording completely because
+ * changed_inode has already initiated processing of refs. The reason is
+ * that in this case, compare_trees actually compares the refs of 2 different
+ * inodes. To fix this, process_all_refs is used in changed_inode to handle all
+ * refs of the right tree as deleted and all refs of the left tree as new.
+ */
 static int changed_ref(struct send_ctx *sctx,
                       enum btrfs_compare_tree_result result)
 {
@@ -4192,6 +4260,11 @@ static int changed_ref(struct send_ctx *sctx,
        return ret;
 }
 
+/*
+ * Process new/deleted/changed xattrs. We skip processing in the
+ * cur_inode_new_gen case because changed_inode has already initiated
+ * processing of xattrs. The reason is the same as in changed_ref.
+ */
 static int changed_xattr(struct send_ctx *sctx,
                         enum btrfs_compare_tree_result result)
 {
@@ -4211,6 +4284,11 @@ static int changed_xattr(struct send_ctx *sctx,
        return ret;
 }
 
+/*
+ * Process new/deleted/changed extents. We skip processing in the
+ * cur_inode_new_gen case because changed_inode has already initiated
+ * processing of extents. The reason is the same as in changed_ref.
+ */
 static int changed_extent(struct send_ctx *sctx,
                          enum btrfs_compare_tree_result result)
 {
@@ -4227,7 +4305,10 @@ static int changed_extent(struct send_ctx *sctx,
        return ret;
 }
 
-
+/*
+ * Updates compare related fields in sctx and simply forwards to the actual
+ * changed_xxx functions.
+ */
 static int changed_cb(struct btrfs_root *left_root,
                      struct btrfs_root *right_root,
                      struct btrfs_path *left_path,
@@ -4247,6 +4328,11 @@ static int changed_cb(struct btrfs_root *left_root,
        if (ret < 0)
                goto out;
 
+       /* Ignore non-FS objects */
+       if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
+           key->objectid == BTRFS_FREE_SPACE_OBJECTID)
+               goto out;
+
        if (key->type == BTRFS_INODE_ITEM_KEY)
                ret = changed_inode(sctx, result);
        else if (key->type == BTRFS_INODE_REF_KEY)
@@ -4299,7 +4385,8 @@ join_trans:
        }
 
        /*
-        * Make sure the tree has not changed
+        * Make sure the tree has not changed after re-joining. We detect this
+        * by comparing start_ctransid and ctransid. They should always match.
         */
        spin_lock(&send_root->root_times_lock);
        ctransid = btrfs_root_ctransid(&send_root->root_item);
index 9934e948e57f91067f5a5e3f9cdf9720939af151..1bf4f32fd4ef28582c6d2401571bf2f7f37617a6 100644 (file)
@@ -130,4 +130,5 @@ enum {
 
 #ifdef __KERNEL__
 long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
+int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off);
 #endif
index 83d6f9f9c2209861efdec86dec9ad54d629deeb9..915ac14c20642ec619ec159d90d0fbed2a02b0da 100644 (file)
@@ -243,12 +243,18 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root, const char *function,
                               unsigned int line, int errno)
 {
-       WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted");
+       WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted\n");
        trans->aborted = errno;
        /* Nothing used. The other threads that have joined this
         * transaction may be able to continue. */
        if (!trans->blocks_used) {
-               btrfs_printk(root->fs_info, "Aborting unused transaction.\n");
+               char nbuf[16];
+               const char *errstr;
+
+               errstr = btrfs_decode_error(root->fs_info, errno, nbuf);
+               btrfs_printk(root->fs_info,
+                            "%s:%d: Aborting unused transaction(%s).\n",
+                            function, line, errstr);
                return;
        }
        trans->transaction->aborted = errno;
@@ -407,7 +413,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                        btrfs_set_opt(info->mount_opt, NODATASUM);
                        break;
                case Opt_nodatacow:
-                       printk(KERN_INFO "btrfs: setting nodatacow\n");
+                       if (!btrfs_test_opt(root, COMPRESS) ||
+                               !btrfs_test_opt(root, FORCE_COMPRESS)) {
+                                       printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n");
+                       } else {
+                               printk(KERN_INFO "btrfs: setting nodatacow\n");
+                       }
+                       info->compress_type = BTRFS_COMPRESS_NONE;
+                       btrfs_clear_opt(info->mount_opt, COMPRESS);
+                       btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
                        btrfs_set_opt(info->mount_opt, NODATACOW);
                        btrfs_set_opt(info->mount_opt, NODATASUM);
                        break;
@@ -422,10 +436,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                                compress_type = "zlib";
                                info->compress_type = BTRFS_COMPRESS_ZLIB;
                                btrfs_set_opt(info->mount_opt, COMPRESS);
+                               btrfs_clear_opt(info->mount_opt, NODATACOW);
+                               btrfs_clear_opt(info->mount_opt, NODATASUM);
                        } else if (strcmp(args[0].from, "lzo") == 0) {
                                compress_type = "lzo";
                                info->compress_type = BTRFS_COMPRESS_LZO;
                                btrfs_set_opt(info->mount_opt, COMPRESS);
+                               btrfs_clear_opt(info->mount_opt, NODATACOW);
+                               btrfs_clear_opt(info->mount_opt, NODATASUM);
                                btrfs_set_fs_incompat(info, COMPRESS_LZO);
                        } else if (strncmp(args[0].from, "no", 2) == 0) {
                                compress_type = "no";
@@ -543,11 +561,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                        btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
                        break;
                case Opt_defrag:
-                       printk(KERN_INFO "btrfs: enabling auto defrag");
+                       printk(KERN_INFO "btrfs: enabling auto defrag\n");
                        btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
                        break;
                case Opt_recovery:
-                       printk(KERN_INFO "btrfs: enabling auto recovery");
+                       printk(KERN_INFO "btrfs: enabling auto recovery\n");
                        btrfs_set_opt(info->mount_opt, RECOVERY);
                        break;
                case Opt_skip_balance:
@@ -846,18 +864,15 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
                return 0;
        }
 
-       btrfs_wait_ordered_extents(root, 0, 0);
-
-       spin_lock(&fs_info->trans_lock);
-       if (!fs_info->running_transaction) {
-               spin_unlock(&fs_info->trans_lock);
-               return 0;
-       }
-       spin_unlock(&fs_info->trans_lock);
+       btrfs_wait_ordered_extents(root, 0);
 
-       trans = btrfs_join_transaction(root);
-       if (IS_ERR(trans))
+       trans = btrfs_attach_transaction(root);
+       if (IS_ERR(trans)) {
+               /* no transaction, don't bother */
+               if (PTR_ERR(trans) == -ENOENT)
+                       return 0;
                return PTR_ERR(trans);
+       }
        return btrfs_commit_transaction(trans, root);
 }
 
@@ -1508,17 +1523,21 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 
 static int btrfs_freeze(struct super_block *sb)
 {
-       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
-       mutex_lock(&fs_info->transaction_kthread_mutex);
-       mutex_lock(&fs_info->cleaner_mutex);
-       return 0;
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = btrfs_sb(sb)->tree_root;
+
+       trans = btrfs_attach_transaction(root);
+       if (IS_ERR(trans)) {
+               /* no transaction, don't bother */
+               if (PTR_ERR(trans) == -ENOENT)
+                       return 0;
+               return PTR_ERR(trans);
+       }
+       return btrfs_commit_transaction(trans, root);
 }
 
 static int btrfs_unfreeze(struct super_block *sb)
 {
-       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
-       mutex_unlock(&fs_info->cleaner_mutex);
-       mutex_unlock(&fs_info->transaction_kthread_mutex);
        return 0;
 }
 
@@ -1595,7 +1614,7 @@ static int btrfs_interface_init(void)
 static void btrfs_interface_exit(void)
 {
        if (misc_deregister(&btrfs_misc) < 0)
-               printk(KERN_INFO "misc_deregister failed for control device");
+               printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
 }
 
 static int __init init_btrfs_fs(void)
@@ -1620,10 +1639,14 @@ static int __init init_btrfs_fs(void)
        if (err)
                goto free_extent_io;
 
-       err = btrfs_delayed_inode_init();
+       err = ordered_data_init();
        if (err)
                goto free_extent_map;
 
+       err = btrfs_delayed_inode_init();
+       if (err)
+               goto free_ordered_data;
+
        err = btrfs_interface_init();
        if (err)
                goto free_delayed_inode;
@@ -1641,6 +1664,8 @@ unregister_ioctl:
        btrfs_interface_exit();
 free_delayed_inode:
        btrfs_delayed_inode_exit();
+free_ordered_data:
+       ordered_data_exit();
 free_extent_map:
        extent_map_exit();
 free_extent_io:
@@ -1657,6 +1682,7 @@ static void __exit exit_btrfs_fs(void)
 {
        btrfs_destroy_cachep();
        btrfs_delayed_inode_exit();
+       ordered_data_exit();
        extent_map_exit();
        extent_io_exit();
        btrfs_interface_exit();
index 27c26004e050a33211674363cfd80c02c98d1063..77db875b511638b7ff94854c6b1482942da2b3fb 100644 (file)
@@ -53,7 +53,7 @@ static noinline void switch_commit_root(struct btrfs_root *root)
 /*
  * either allocate a new transaction or hop into the existing one
  */
-static noinline int join_transaction(struct btrfs_root *root, int nofail)
+static noinline int join_transaction(struct btrfs_root *root, int type)
 {
        struct btrfs_transaction *cur_trans;
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -67,7 +67,13 @@ loop:
        }
 
        if (fs_info->trans_no_join) {
-               if (!nofail) {
+               /*
+                * If we are JOIN_NOLOCK we're already committing the current
+                * transaction; we just need a handle to deal with something
+                * when committing the transaction, such as inode cache and
+                * space cache. It is a special case.
+                */
+               if (type != TRANS_JOIN_NOLOCK) {
                        spin_unlock(&fs_info->trans_lock);
                        return -EBUSY;
                }
@@ -87,6 +93,13 @@ loop:
        }
        spin_unlock(&fs_info->trans_lock);
 
+       /*
+        * If we are ATTACH, we just want to catch the current transaction,
+        * and commit it. If there is no transaction, just return ENOENT.
+        */
+       if (type == TRANS_ATTACH)
+               return -ENOENT;
+
        cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
        if (!cur_trans)
                return -ENOMEM;
@@ -267,13 +280,6 @@ static void wait_current_trans(struct btrfs_root *root)
        }
 }
 
-enum btrfs_trans_type {
-       TRANS_START,
-       TRANS_JOIN,
-       TRANS_USERSPACE,
-       TRANS_JOIN_NOLOCK,
-};
-
 static int may_wait_transaction(struct btrfs_root *root, int type)
 {
        if (root->fs_info->log_root_recovering)
@@ -290,7 +296,8 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
 }
 
 static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
-                                                   u64 num_items, int type)
+                                                   u64 num_items, int type,
+                                                   int noflush)
 {
        struct btrfs_trans_handle *h;
        struct btrfs_transaction *cur_trans;
@@ -324,9 +331,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
                }
 
                num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
-               ret = btrfs_block_rsv_add(root,
-                                         &root->fs_info->trans_block_rsv,
-                                         num_bytes);
+               if (noflush)
+                       ret = btrfs_block_rsv_add_noflush(root,
+                                               &root->fs_info->trans_block_rsv,
+                                               num_bytes);
+               else
+                       ret = btrfs_block_rsv_add(root,
+                                               &root->fs_info->trans_block_rsv,
+                                               num_bytes);
                if (ret)
                        return ERR_PTR(ret);
        }
@@ -335,19 +347,34 @@ again:
        if (!h)
                return ERR_PTR(-ENOMEM);
 
-       sb_start_intwrite(root->fs_info->sb);
+       /*
+        * If we are JOIN_NOLOCK we're already committing a transaction and
+        * waiting on this guy, so we don't need to do the sb_start_intwrite
+        * because we're already holding a ref.  We need this because we could
+        * have raced in and did an fsync() on a file which can kick a commit
+        * and then we deadlock with somebody doing a freeze.
+        *
+        * If we are ATTACH, it means we just want to catch the current
+        * transaction and commit it, so we needn't do sb_start_intwrite(). 
+        */
+       if (type < TRANS_JOIN_NOLOCK)
+               sb_start_intwrite(root->fs_info->sb);
 
        if (may_wait_transaction(root, type))
                wait_current_trans(root);
 
        do {
-               ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
+               ret = join_transaction(root, type);
                if (ret == -EBUSY)
                        wait_current_trans(root);
        } while (ret == -EBUSY);
 
        if (ret < 0) {
-               sb_end_intwrite(root->fs_info->sb);
+               /* We must get the transaction if we are JOIN_NOLOCK. */
+               BUG_ON(type == TRANS_JOIN_NOLOCK);
+
+               if (type < TRANS_JOIN_NOLOCK)
+                       sb_end_intwrite(root->fs_info->sb);
                kmem_cache_free(btrfs_trans_handle_cachep, h);
                return ERR_PTR(ret);
        }
@@ -367,7 +394,9 @@ again:
        h->aborted = 0;
        h->qgroup_reserved = qgroup_reserved;
        h->delayed_ref_elem.seq = 0;
+       h->type = type;
        INIT_LIST_HEAD(&h->qgroup_ref_list);
+       INIT_LIST_HEAD(&h->new_bgs);
 
        smp_mb();
        if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -393,21 +422,33 @@ got_it:
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
                                                   int num_items)
 {
-       return start_transaction(root, num_items, TRANS_START);
+       return start_transaction(root, num_items, TRANS_START, 0);
+}
+
+struct btrfs_trans_handle *btrfs_start_transaction_noflush(
+                                       struct btrfs_root *root, int num_items)
+{
+       return start_transaction(root, num_items, TRANS_START, 1);
 }
+
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
 {
-       return start_transaction(root, 0, TRANS_JOIN);
+       return start_transaction(root, 0, TRANS_JOIN, 0);
 }
 
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
 {
-       return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
+       return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
 }
 
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
 {
-       return start_transaction(root, 0, TRANS_USERSPACE);
+       return start_transaction(root, 0, TRANS_USERSPACE, 0);
+}
+
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
+{
+       return start_transaction(root, 0, TRANS_ATTACH, 0);
 }
 
 /* wait for a transaction commit to be fully complete */
@@ -506,11 +547,12 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
 }
 
 static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *root, int throttle, int lock)
+                         struct btrfs_root *root, int throttle)
 {
        struct btrfs_transaction *cur_trans = trans->transaction;
        struct btrfs_fs_info *info = root->fs_info;
        int count = 0;
+       int lock = (trans->type != TRANS_JOIN_NOLOCK);
        int err = 0;
 
        if (--trans->use_count) {
@@ -536,6 +578,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                trans->qgroup_reserved = 0;
        }
 
+       if (!list_empty(&trans->new_bgs))
+               btrfs_create_pending_block_groups(trans, root);
+
        while (count < 2) {
                unsigned long cur = trans->delayed_ref_updates;
                trans->delayed_ref_updates = 0;
@@ -551,7 +596,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
 
-       sb_end_intwrite(root->fs_info->sb);
+       if (!list_empty(&trans->new_bgs))
+               btrfs_create_pending_block_groups(trans, root);
 
        if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
            should_end_transaction(trans, root)) {
@@ -573,6 +619,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                }
        }
 
+       if (trans->type < TRANS_JOIN_NOLOCK)
+               sb_end_intwrite(root->fs_info->sb);
+
        WARN_ON(cur_trans != info->running_transaction);
        WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
        atomic_dec(&cur_trans->num_writers);
@@ -604,7 +653,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 {
        int ret;
 
-       ret = __btrfs_end_transaction(trans, root, 0, 1);
+       ret = __btrfs_end_transaction(trans, root, 0);
        if (ret)
                return ret;
        return 0;
@@ -615,18 +664,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 {
        int ret;
 
-       ret = __btrfs_end_transaction(trans, root, 1, 1);
-       if (ret)
-               return ret;
-       return 0;
-}
-
-int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *root)
-{
-       int ret;
-
-       ret = __btrfs_end_transaction(trans, root, 0, 0);
+       ret = __btrfs_end_transaction(trans, root, 1);
        if (ret)
                return ret;
        return 0;
@@ -635,7 +673,7 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
 int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root)
 {
-       return __btrfs_end_transaction(trans, root, 1, 1);
+       return __btrfs_end_transaction(trans, root, 1);
 }
 
 /*
@@ -649,13 +687,15 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
        int err = 0;
        int werr = 0;
        struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+       struct extent_state *cached_state = NULL;
        u64 start = 0;
        u64 end;
 
        while (!find_first_extent_bit(dirty_pages, start, &start, &end,
-                                     mark)) {
-               convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
-                                  GFP_NOFS);
+                                     mark, &cached_state)) {
+               convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
+                                  mark, &cached_state, GFP_NOFS);
+               cached_state = NULL;
                err = filemap_fdatawrite_range(mapping, start, end);
                if (err)
                        werr = err;
@@ -679,12 +719,14 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
        int err = 0;
        int werr = 0;
        struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+       struct extent_state *cached_state = NULL;
        u64 start = 0;
        u64 end;
 
        while (!find_first_extent_bit(dirty_pages, start, &start, &end,
-                                     EXTENT_NEED_WAIT)) {
-               clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
+                                     EXTENT_NEED_WAIT, &cached_state)) {
+               clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
+                                0, 0, &cached_state, GFP_NOFS);
                err = filemap_fdatawait_range(mapping, start, end);
                if (err)
                        werr = err;
@@ -955,6 +997,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        struct btrfs_root *parent_root;
        struct btrfs_block_rsv *rsv;
        struct inode *parent_inode;
+       struct btrfs_path *path;
+       struct btrfs_dir_item *dir_item;
        struct dentry *parent;
        struct dentry *dentry;
        struct extent_buffer *tmp;
@@ -967,18 +1011,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        u64 root_flags;
        uuid_le new_uuid;
 
-       rsv = trans->block_rsv;
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = pending->error = -ENOMEM;
+               goto path_alloc_fail;
+       }
 
        new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
        if (!new_root_item) {
                ret = pending->error = -ENOMEM;
-               goto fail;
+               goto root_item_alloc_fail;
        }
 
        ret = btrfs_find_free_objectid(tree_root, &objectid);
        if (ret) {
                pending->error = ret;
-               goto fail;
+               goto no_free_objectid;
        }
 
        btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
@@ -988,22 +1036,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                                                  to_reserve);
                if (ret) {
                        pending->error = ret;
-                       goto fail;
+                       goto no_free_objectid;
                }
        }
 
        ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,
                                   objectid, pending->inherit);
-       kfree(pending->inherit);
        if (ret) {
                pending->error = ret;
-               goto fail;
+               goto no_free_objectid;
        }
 
        key.objectid = objectid;
        key.offset = (u64)-1;
        key.type = BTRFS_ROOT_ITEM_KEY;
 
+       rsv = trans->block_rsv;
        trans->block_rsv = &pending->block_rsv;
 
        dentry = pending->dentry;
@@ -1017,24 +1065,21 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
         */
        ret = btrfs_set_inode_index(parent_inode, &index);
        BUG_ON(ret); /* -ENOMEM */
-       ret = btrfs_insert_dir_item(trans, parent_root,
-                               dentry->d_name.name, dentry->d_name.len,
-                               parent_inode, &key,
-                               BTRFS_FT_DIR, index);
-       if (ret == -EEXIST) {
+
+       /* check if there is a file/dir which has the same name. */
+       dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
+                                        btrfs_ino(parent_inode),
+                                        dentry->d_name.name,
+                                        dentry->d_name.len, 0);
+       if (dir_item != NULL && !IS_ERR(dir_item)) {
                pending->error = -EEXIST;
-               dput(parent);
                goto fail;
-       } else if (ret) {
-               goto abort_trans_dput;
+       } else if (IS_ERR(dir_item)) {
+               ret = PTR_ERR(dir_item);
+               btrfs_abort_transaction(trans, root, ret);
+               goto fail;
        }
-
-       btrfs_i_size_write(parent_inode, parent_inode->i_size +
-                                        dentry->d_name.len * 2);
-       parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
-       ret = btrfs_update_inode(trans, parent_root, parent_inode);
-       if (ret)
-               goto abort_trans_dput;
+       btrfs_release_path(path);
 
        /*
         * pull in the delayed directory update
@@ -1043,8 +1088,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
         * snapshot
         */
        ret = btrfs_run_delayed_items(trans, root);
-       if (ret) { /* Transaction aborted */
-               dput(parent);
+       if (ret) {      /* Transaction aborted */
+               btrfs_abort_transaction(trans, root, ret);
                goto fail;
        }
 
@@ -1079,7 +1124,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        if (ret) {
                btrfs_tree_unlock(old);
                free_extent_buffer(old);
-               goto abort_trans_dput;
+               btrfs_abort_transaction(trans, root, ret);
+               goto fail;
        }
 
        btrfs_set_lock_blocking(old);
@@ -1088,8 +1134,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        /* clean up in any case */
        btrfs_tree_unlock(old);
        free_extent_buffer(old);
-       if (ret)
-               goto abort_trans_dput;
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto fail;
+       }
 
        /* see comments in should_cow_block() */
        root->force_cow = 1;
@@ -1101,8 +1149,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
        btrfs_tree_unlock(tmp);
        free_extent_buffer(tmp);
-       if (ret)
-               goto abort_trans_dput;
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto fail;
+       }
 
        /*
         * insert root back/forward references
@@ -1111,32 +1161,58 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                                 parent_root->root_key.objectid,
                                 btrfs_ino(parent_inode), index,
                                 dentry->d_name.name, dentry->d_name.len);
-       dput(parent);
-       if (ret)
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
                goto fail;
+       }
 
        key.offset = (u64)-1;
        pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
        if (IS_ERR(pending->snap)) {
                ret = PTR_ERR(pending->snap);
-               goto abort_trans;
+               btrfs_abort_transaction(trans, root, ret);
+               goto fail;
        }
 
        ret = btrfs_reloc_post_snapshot(trans, pending);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto fail;
+       }
+
+       ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto fail;
+       }
+
+       ret = btrfs_insert_dir_item(trans, parent_root,
+                                   dentry->d_name.name, dentry->d_name.len,
+                                   parent_inode, &key,
+                                   BTRFS_FT_DIR, index);
+       /* We have checked the name at the beginning, so it is impossible. */
+       BUG_ON(ret == -EEXIST);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto fail;
+       }
+
+       btrfs_i_size_write(parent_inode, parent_inode->i_size +
+                                        dentry->d_name.len * 2);
+       parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+       ret = btrfs_update_inode(trans, parent_root, parent_inode);
        if (ret)
-               goto abort_trans;
-       ret = 0;
+               btrfs_abort_transaction(trans, root, ret);
 fail:
-       kfree(new_root_item);
+       dput(parent);
        trans->block_rsv = rsv;
+no_free_objectid:
+       kfree(new_root_item);
+root_item_alloc_fail:
+       btrfs_free_path(path);
+path_alloc_fail:
        btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
        return ret;
-
-abort_trans_dput:
-       dput(parent);
-abort_trans:
-       btrfs_abort_transaction(trans, root, ret);
-       goto fail;
 }
 
 /*
@@ -1229,6 +1305,16 @@ static void do_async_commit(struct work_struct *work)
        struct btrfs_async_commit *ac =
                container_of(work, struct btrfs_async_commit, work.work);
 
+       /*
+        * We've got freeze protection passed with the transaction.
+        * Tell lockdep about it.
+        */
+       rwsem_acquire_read(
+               &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+               0, 1, _THIS_IP_);
+
+       current->journal_info = ac->newtrans;
+
        btrfs_commit_transaction(ac->newtrans, ac->root);
        kfree(ac);
 }
@@ -1258,6 +1344,14 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
        atomic_inc(&cur_trans->use_count);
 
        btrfs_end_transaction(trans, root);
+
+       /*
+        * Tell lockdep we've released the freeze rwsem, since the
+        * async commit thread will be the one to unlock it.
+        */
+       rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+                     1, _THIS_IP_);
+
        schedule_delayed_work(&ac->work, 0);
 
        /* wait for transaction to start and unblock */
@@ -1348,6 +1442,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
         */
        cur_trans->delayed_refs.flushing = 1;
 
+       if (!list_empty(&trans->new_bgs))
+               btrfs_create_pending_block_groups(trans, root);
+
        ret = btrfs_run_delayed_refs(trans, root, 0);
        if (ret)
                goto cleanup_transaction;
@@ -1403,7 +1500,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
                if (flush_on_commit || snap_pending) {
                        btrfs_start_delalloc_inodes(root, 1);
-                       btrfs_wait_ordered_extents(root, 0, 1);
+                       btrfs_wait_ordered_extents(root, 1);
                }
 
                ret = btrfs_run_delayed_items(trans, root);
@@ -1456,13 +1553,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
         */
        mutex_lock(&root->fs_info->reloc_mutex);
 
-       ret = btrfs_run_delayed_items(trans, root);
+       /*
+        * We needn't worry about the delayed items because we will
+        * deal with them in create_pending_snapshot(), which is the
+        * core function of the snapshot creation.
+        */
+       ret = create_pending_snapshots(trans, root->fs_info);
        if (ret) {
                mutex_unlock(&root->fs_info->reloc_mutex);
                goto cleanup_transaction;
        }
 
-       ret = create_pending_snapshots(trans, root->fs_info);
+       /*
+        * We insert the dir indexes of the snapshots and update the inode
+        * of the snapshots' parents after the snapshot creation, so there
+        * are some delayed items which are not dealt with. Now deal with
+        * them.
+        *
+        * We needn't worry that this operation will corrupt the snapshots,
+        * because all the trees which are snapshotted will be forced to COW
+        * their nodes and leaves.
+        */
+       ret = btrfs_run_delayed_items(trans, root);
        if (ret) {
                mutex_unlock(&root->fs_info->reloc_mutex);
                goto cleanup_transaction;
@@ -1584,7 +1696,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        put_transaction(cur_trans);
        put_transaction(cur_trans);
 
-       sb_end_intwrite(root->fs_info->sb);
+       if (trans->type < TRANS_JOIN_NOLOCK)
+               sb_end_intwrite(root->fs_info->sb);
 
        trace_btrfs_transaction_commit(root);
 
index e8b8416c688b2a7b5d6c4f79c844bb07953cb3b5..80961947a6b27df59273f080f9f19316c8d98ffc 100644 (file)
@@ -47,6 +47,14 @@ struct btrfs_transaction {
        int aborted;
 };
 
+/*
+ * Kind of a transaction handle; stored in btrfs_trans_handle.type.
+ *
+ * The declaration order is significant: btrfs_commit_transaction() tests
+ * "trans->type < TRANS_JOIN_NOLOCK" to decide whether to call
+ * sb_end_intwrite(), so the enumerators listed before TRANS_JOIN_NOLOCK
+ * must stay ahead of it.
+ *
+ * NOTE(review): the names presumably map onto the btrfs_start_transaction /
+ * btrfs_join_transaction / btrfs_start_ioctl_transaction /
+ * btrfs_join_transaction_nolock / btrfs_attach_transaction entry points;
+ * confirm against transaction.c.
+ */
+enum btrfs_trans_type {
+       TRANS_START,
+       TRANS_JOIN,
+       TRANS_USERSPACE,
+       TRANS_JOIN_NOLOCK,
+       TRANS_ATTACH,
+};
+
 struct btrfs_trans_handle {
        u64 transid;
        u64 bytes_reserved;
@@ -58,8 +66,9 @@ struct btrfs_trans_handle {
        struct btrfs_transaction *transaction;
        struct btrfs_block_rsv *block_rsv;
        struct btrfs_block_rsv *orig_rsv;
-       int aborted;
-       int adding_csums;
+       short aborted;
+       short adding_csums;
+       enum btrfs_trans_type type;
        /*
         * this root is only needed to validate that the root passed to
         * start_transaction is the same as the one passed to end_transaction.
@@ -68,6 +77,7 @@ struct btrfs_trans_handle {
        struct btrfs_root *root;
        struct seq_list delayed_ref_elem;
        struct list_head qgroup_ref_list;
+       struct list_head new_bgs;
 };
 
 struct btrfs_pending_snapshot {
@@ -88,16 +98,18 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 {
        BTRFS_I(inode)->last_trans = trans->transaction->transid;
        BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+       BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
 }
 
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root);
-int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
                                                   int num_items);
+struct btrfs_trans_handle *btrfs_start_transaction_noflush(
+                                       struct btrfs_root *root, int num_items);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
index c86670f4f2855ef3ee2f8252c54d5bd4c37f1ec6..e9ebb472b28ba6d5bb9e077e8b1a2f877797adb0 100644 (file)
 
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/list_sort.h>
 #include "ctree.h"
 #include "transaction.h"
 #include "disk-io.h"
 #include "locking.h"
 #include "print-tree.h"
+#include "backref.h"
 #include "compat.h"
 #include "tree-log.h"
+#include "hash.h"
 
 /* magic values for the inode_only field in btrfs_log_inode:
  *
@@ -146,7 +149,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
                        root->log_multiple_pids = true;
                }
 
-               root->log_batch++;
+               atomic_inc(&root->log_batch);
                atomic_inc(&root->log_writers);
                mutex_unlock(&root->log_mutex);
                return 0;
@@ -165,7 +168,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
                        err = ret;
        }
        mutex_unlock(&root->fs_info->tree_log_mutex);
-       root->log_batch++;
+       atomic_inc(&root->log_batch);
        atomic_inc(&root->log_writers);
        mutex_unlock(&root->log_mutex);
        return err;
@@ -484,7 +487,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
        int found_type;
        u64 mask = root->sectorsize - 1;
        u64 extent_end;
-       u64 alloc_hint;
        u64 start = key->offset;
        u64 saved_nbytes;
        struct btrfs_file_extent_item *item;
@@ -550,8 +552,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 
        saved_nbytes = inode_get_bytes(inode);
        /* drop any overlapping extents */
-       ret = btrfs_drop_extents(trans, inode, start, extent_end,
-                                &alloc_hint, 1);
+       ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
        BUG_ON(ret);
 
        if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -744,6 +745,7 @@ out:
  */
 static noinline int backref_in_log(struct btrfs_root *log,
                                   struct btrfs_key *key,
+                                  u64 ref_objectid,
                                   char *name, int namelen)
 {
        struct btrfs_path *path;
@@ -764,8 +766,17 @@ static noinline int backref_in_log(struct btrfs_root *log,
        if (ret != 0)
                goto out;
 
-       item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
        ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+
+       if (key->type == BTRFS_INODE_EXTREF_KEY) {
+               if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+                                                  name, namelen, NULL))
+                       match = 1;
+
+               goto out;
+       }
+
+       item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
        ptr_end = ptr + item_size;
        while (ptr < ptr_end) {
                ref = (struct btrfs_inode_ref *)ptr;
@@ -786,91 +797,42 @@ out:
        return match;
 }
 
-
-/*
- * replay one inode back reference item found in the log tree.
- * eb, slot and key refer to the buffer and key found in the log tree.
- * root is the destination we are replaying into, and path is for temp
- * use by this function.  (it should be released on return).
- */
-static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
-                                 struct btrfs_root *log,
                                  struct btrfs_path *path,
-                                 struct extent_buffer *eb, int slot,
-                                 struct btrfs_key *key)
+                                 struct btrfs_root *log_root,
+                                 struct inode *dir, struct inode *inode,
+                                 struct extent_buffer *eb,
+                                 u64 inode_objectid, u64 parent_objectid,
+                                 u64 ref_index, char *name, int namelen,
+                                 int *search_done)
 {
-       struct btrfs_inode_ref *ref;
-       struct btrfs_dir_item *di;
-       struct inode *dir;
-       struct inode *inode;
-       unsigned long ref_ptr;
-       unsigned long ref_end;
-       char *name;
-       int namelen;
        int ret;
-       int search_done = 0;
-
-       /*
-        * it is possible that we didn't log all the parent directories
-        * for a given inode.  If we don't find the dir, just don't
-        * copy the back ref in.  The link count fixup code will take
-        * care of the rest
-        */
-       dir = read_one_inode(root, key->offset);
-       if (!dir)
-               return -ENOENT;
-
-       inode = read_one_inode(root, key->objectid);
-       if (!inode) {
-               iput(dir);
-               return -EIO;
-       }
-
-       ref_ptr = btrfs_item_ptr_offset(eb, slot);
-       ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+       char *victim_name;
+       int victim_name_len;
+       struct extent_buffer *leaf;
+       struct btrfs_dir_item *di;
+       struct btrfs_key search_key;
+       struct btrfs_inode_extref *extref;
 
 again:
-       ref = (struct btrfs_inode_ref *)ref_ptr;
-
-       namelen = btrfs_inode_ref_name_len(eb, ref);
-       name = kmalloc(namelen, GFP_NOFS);
-       BUG_ON(!name);
-
-       read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
-
-       /* if we already have a perfect match, we're done */
-       if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
-                        btrfs_inode_ref_index(eb, ref),
-                        name, namelen)) {
-               goto out;
-       }
-
-       /*
-        * look for a conflicting back reference in the metadata.
-        * if we find one we have to unlink that name of the file
-        * before we add our new link.  Later on, we overwrite any
-        * existing back reference, and we don't want to create
-        * dangling pointers in the directory.
-        */
-
-       if (search_done)
-               goto insert;
-
-       ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+       /* Search old style refs */
+       search_key.objectid = inode_objectid;
+       search_key.type = BTRFS_INODE_REF_KEY;
+       search_key.offset = parent_objectid;
+       ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
        if (ret == 0) {
-               char *victim_name;
-               int victim_name_len;
                struct btrfs_inode_ref *victim_ref;
                unsigned long ptr;
                unsigned long ptr_end;
-               struct extent_buffer *leaf = path->nodes[0];
+
+               leaf = path->nodes[0];
 
                /* are we trying to overwrite a back ref for the root directory
                 * if so, just jump out, we're done
                 */
-               if (key->objectid == key->offset)
-                       goto out_nowrite;
+               if (search_key.objectid == search_key.offset)
+                       return 1;
 
                /* check all the names in this back reference to see
                 * if they are in the log.  if so, we allow them to stay
@@ -889,7 +851,9 @@ again:
                                           (unsigned long)(victim_ref + 1),
                                           victim_name_len);
 
-                       if (!backref_in_log(log, key, victim_name,
+                       if (!backref_in_log(log_root, &search_key,
+                                           parent_objectid,
+                                           victim_name,
                                            victim_name_len)) {
                                btrfs_inc_nlink(inode);
                                btrfs_release_path(path);
@@ -897,9 +861,14 @@ again:
                                ret = btrfs_unlink_inode(trans, root, dir,
                                                         inode, victim_name,
                                                         victim_name_len);
+                               BUG_ON(ret);
                                btrfs_run_delayed_items(trans, root);
+                               kfree(victim_name);
+                               *search_done = 1;
+                               goto again;
                        }
                        kfree(victim_name);
+
                        ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
                }
                BUG_ON(ret);
@@ -908,14 +877,78 @@ again:
                 * NOTE: we have searched root tree and checked the
                 * coresponding ref, it does not need to check again.
                 */
-               search_done = 1;
+               *search_done = 1;
+       }
+       btrfs_release_path(path);
+
+       /*
+        * Same search but for extended refs.  The item found may hold
+        * several btrfs_inode_extref records back to back, so walk all of
+        * them and unlink every name under parent_objectid that is not
+        * present in the log.
+        */
+       extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
+                                          inode_objectid, parent_objectid, 0,
+                                          0);
+       if (!IS_ERR_OR_NULL(extref)) {
+               u32 item_size;
+               u32 cur_offset = 0;
+               unsigned long base;
+               struct inode *victim_parent;
+
+               leaf = path->nodes[0];
+
+               item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+               base = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+               while (cur_offset < item_size) {
+                       /*
+                        * base and cur_offset are both byte offsets: add
+                        * them before casting.  Casting base first would
+                        * scale cur_offset by sizeof(*extref), so every
+                        * record after the first would be read from the
+                        * wrong place.
+                        */
+                       extref = (struct btrfs_inode_extref *)(base + cur_offset);
+
+                       victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+                       if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
+                               goto next;
+
+                       victim_name = kmalloc(victim_name_len, GFP_NOFS);
+                       /* read_extent_buffer() below writes into the buffer */
+                       if (!victim_name)
+                               return -ENOMEM;
+                       read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
+                                          victim_name_len);
+
+                       search_key.objectid = inode_objectid;
+                       search_key.type = BTRFS_INODE_EXTREF_KEY;
+                       search_key.offset = btrfs_extref_hash(parent_objectid,
+                                                             victim_name,
+                                                             victim_name_len);
+                       ret = 0;
+                       if (!backref_in_log(log_root, &search_key,
+                                           parent_objectid, victim_name,
+                                           victim_name_len)) {
+                               ret = -ENOENT;
+                               victim_parent = read_one_inode(root,
+                                                              parent_objectid);
+                               if (victim_parent) {
+                                       btrfs_inc_nlink(inode);
+                                       btrfs_release_path(path);
+
+                                       ret = btrfs_unlink_inode(trans, root,
+                                                                victim_parent,
+                                                                inode,
+                                                                victim_name,
+                                                                victim_name_len);
+                                       btrfs_run_delayed_items(trans, root);
+                               }
+                               BUG_ON(ret);
+                               iput(victim_parent);
+                               kfree(victim_name);
+                               *search_done = 1;
+                               /* the path was released, so rescan from the top */
+                               goto again;
+                       }
+                       kfree(victim_name);
+                       BUG_ON(ret);
+next:
+                       cur_offset += victim_name_len + sizeof(*extref);
+               }
+               *search_done = 1;
         }
        btrfs_release_path(path);
 
        /* look for a conflicting sequence number */
        di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
-                                        btrfs_inode_ref_index(eb, ref),
-                                        name, namelen, 0);
+                                        ref_index, name, namelen, 0);
        if (di && !IS_ERR(di)) {
                ret = drop_one_dir_item(trans, root, path, dir, di);
                BUG_ON(ret);
@@ -931,25 +964,173 @@ again:
        }
        btrfs_release_path(path);
 
-insert:
-       /* insert our name */
-       ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
-                            btrfs_inode_ref_index(eb, ref));
-       BUG_ON(ret);
+       return 0;
+}
 
-       btrfs_update_inode(trans, root, inode);
+/*
+ * Decode the struct btrfs_inode_extref found at offset ref_ptr in eb.
+ *
+ * On success returns 0 with:
+ *   *namelen          - length of the name stored in the extref
+ *   *name             - kmalloc'ed (GFP_NOFS) copy of exactly *namelen name
+ *                       bytes, with no terminator added; add_inode_ref()
+ *                       kfree()s it after use
+ *   *index            - the extref's index
+ *   *parent_objectid  - the extref's parent; skipped if the pointer is NULL
+ *
+ * Returns -ENOMEM if the name buffer cannot be allocated; *namelen has
+ * already been written by then and *name is NULL.
+ */
+static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+                            u32 *namelen, char **name, u64 *index,
+                            u64 *parent_objectid)
+{
+       struct btrfs_inode_extref *extref;
 
-out:
-       ref_ptr = (unsigned long)(ref + 1) + namelen;
-       kfree(name);
-       if (ref_ptr < ref_end)
-               goto again;
+       extref = (struct btrfs_inode_extref *)ref_ptr;
+
+       *namelen = btrfs_inode_extref_name_len(eb, extref);
+       *name = kmalloc(*namelen, GFP_NOFS);
+       if (*name == NULL)
+               return -ENOMEM;
+
+       read_extent_buffer(eb, *name, (unsigned long)&extref->name,
+                          *namelen);
+
+       *index = btrfs_inode_extref_index(eb, extref);
+       /* parent_objectid is an optional out-parameter */
+       if (parent_objectid)
+               *parent_objectid = btrfs_inode_extref_parent(eb, extref);
+
+       return 0;
+}
+
+/*
+ * Decode the struct btrfs_inode_ref found at offset ref_ptr in eb.
+ *
+ * On success returns 0 with:
+ *   *namelen - length of the name stored in the ref
+ *   *name    - kmalloc'ed (GFP_NOFS) copy of exactly *namelen name bytes,
+ *              with no terminator added; add_inode_ref() kfree()s it
+ *              after use
+ *   *index   - the ref's index
+ *
+ * Returns -ENOMEM if the name buffer cannot be allocated; *namelen has
+ * already been written by then and *name is NULL.
+ */
+static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+                         u32 *namelen, char **name, u64 *index)
+{
+       struct btrfs_inode_ref *ref;
+
+       ref = (struct btrfs_inode_ref *)ref_ptr;
+
+       *namelen = btrfs_inode_ref_name_len(eb, ref);
+       *name = kmalloc(*namelen, GFP_NOFS);
+       if (*name == NULL)
+               return -ENOMEM;
+
+       /* the name bytes are read from directly behind the fixed-size ref */
+       read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
+
+       *index = btrfs_inode_ref_index(eb, ref);
+
+       return 0;
+}
+
+/*
+ * replay one inode back reference item found in the log tree.
+ * eb, slot and key refer to the buffer and key found in the log tree.
+ * root is the destination we are replaying into, and path is for temp
+ * use by this function.  (it should be released on return).
+ */
+static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 struct btrfs_root *log,
+                                 struct btrfs_path *path,
+                                 struct extent_buffer *eb, int slot,
+                                 struct btrfs_key *key)
+{
+       struct inode *dir;
+       struct inode *inode;
+       unsigned long ref_ptr;
+       unsigned long ref_end;
+       char *name;
+       int namelen;
+       int ret;
+       int search_done = 0;
+       int log_ref_ver = 0;
+       u64 parent_objectid;
+       u64 inode_objectid;
+       u64 ref_index = 0;
+       int ref_struct_size;
+
+       ref_ptr = btrfs_item_ptr_offset(eb, slot);
+       ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+
+       if (key->type == BTRFS_INODE_EXTREF_KEY) {
+               struct btrfs_inode_extref *r;
+
+               ref_struct_size = sizeof(struct btrfs_inode_extref);
+               log_ref_ver = 1;
+               r = (struct btrfs_inode_extref *)ref_ptr;
+               parent_objectid = btrfs_inode_extref_parent(eb, r);
+       } else {
+               ref_struct_size = sizeof(struct btrfs_inode_ref);
+               parent_objectid = key->offset;
+       }
+       inode_objectid = key->objectid;
+
+       /*
+        * it is possible that we didn't log all the parent directories
+        * for a given inode.  If we don't find the dir, just don't
+        * copy the back ref in.  The link count fixup code will take
+        * care of the rest
+        */
+       dir = read_one_inode(root, parent_objectid);
+       if (!dir)
+               return -ENOENT;
+
+       inode = read_one_inode(root, inode_objectid);
+       if (!inode) {
+               iput(dir);
+               return -EIO;
+       }
+
+       while (ref_ptr < ref_end) {
+               if (log_ref_ver) {
+                       ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+                                               &ref_index, &parent_objectid);
+                       /*
+                        * parent object can change from one array
+                        * item to another.
+                        */
+                       if (!dir)
+                               dir = read_one_inode(root, parent_objectid);
+                       if (!dir)
+                               return -ENOENT;
+               } else {
+                       ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
+                                            &ref_index);
+               }
+               if (ret)
+                       return ret;
+
+               /* if we already have a perfect match, we're done */
+               if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
+                                 ref_index, name, namelen)) {
+                       /*
+                        * look for a conflicting back reference in the
+                        * metadata. if we find one we have to unlink that name
+                        * of the file before we add our new link.  Later on, we
+                        * overwrite any existing back reference, and we don't
+                        * want to create dangling pointers in the directory.
+                        */
+
+                       if (!search_done) {
+                               ret = __add_inode_ref(trans, root, path, log,
+                                                     dir, inode, eb,
+                                                     inode_objectid,
+                                                     parent_objectid,
+                                                     ref_index, name, namelen,
+                                                     &search_done);
+                               if (ret == 1)
+                                       goto out;
+                               BUG_ON(ret);
+                       }
+
+                       /* insert our name */
+                       ret = btrfs_add_link(trans, dir, inode, name, namelen,
+                                            0, ref_index);
+                       BUG_ON(ret);
+
+                       btrfs_update_inode(trans, root, inode);
+               }
+
+               ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
+               kfree(name);
+               if (log_ref_ver) {
+                       iput(dir);
+                       dir = NULL;
+               }
+       }
 
        /* finally write the back reference in the inode */
        ret = overwrite_item(trans, root, path, eb, slot, key);
        BUG_ON(ret);
 
-out_nowrite:
+out:
        btrfs_release_path(path);
        iput(dir);
        iput(inode);
@@ -966,25 +1147,55 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans,
        return ret;
 }
 
+/*
+ * Count the names recorded in the extended inode ref (extref) items of
+ * @inode by walking every extref item returned by btrfs_find_one_extref().
+ *
+ * @path is used as scratch space and is released before returning.
+ *
+ * Returns the number of names found (0 when the inode has no extrefs), or a
+ * negative errno other than -ENOENT on failure.
+ */
+static int count_inode_extrefs(struct btrfs_root *root,
+                              struct inode *inode, struct btrfs_path *path)
+{
+       int ret = 0;
+       int name_len;
+       unsigned int nlink = 0;
+       u32 item_size;
+       u32 cur_offset = 0;
+       u64 inode_objectid = btrfs_ino(inode);
+       u64 offset = 0;
+       unsigned long ptr;
+       struct btrfs_inode_extref *extref;
+       struct extent_buffer *leaf;
 
-/*
- * There are a few corners where the link count of the file can't
- * be properly maintained during replay.  So, instead of adding
- * lots of complexity to the log code, we just scan the backrefs
- * for any file that has been through replay.
- *
- * The scan will update the link count on the inode to reflect the
- * number of back refs found.  If it goes down to zero, the iput
- * will free the inode.
- */
-static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
-                                          struct btrfs_root *root,
-                                          struct inode *inode)
+       while (1) {
+               ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
+                                           &extref, &offset);
+               if (ret)
+                       break;
+
+               leaf = path->nodes[0];
+               item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+               ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+               /*
+                * Each item is walked from its own start; without this reset
+                * the names of every item after the first would be skipped.
+                */
+               cur_offset = 0;
+
+               /* an item packs several extrefs, each followed by its name */
+               while (cur_offset < item_size) {
+                       extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
+                       name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+                       nlink++;
+
+                       cur_offset += name_len + sizeof(*extref);
+               }
+
+               /* continue the search just past the item we processed */
+               offset++;
+               btrfs_release_path(path);
+       }
+       btrfs_release_path(path);
+
+       /*
+        * NOTE(review): the loop can only end with ret != 0, and the caller
+        * treats -ENOENT as "nothing found", so -ENOENT is presumably how
+        * btrfs_find_one_extref() reports that there are no more extrefs.
+        * That must not throw away the names counted so far.
+        */
+       if (ret < 0 && ret != -ENOENT)
+               return ret;
+       return nlink;
+}
+
+static int count_inode_refs(struct btrfs_root *root,
+                              struct inode *inode, struct btrfs_path *path)
 {
-       struct btrfs_path *path;
        int ret;
        struct btrfs_key key;
-       u64 nlink = 0;
+       unsigned int nlink = 0;
        unsigned long ptr;
        unsigned long ptr_end;
        int name_len;
@@ -994,10 +1205,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
        key.type = BTRFS_INODE_REF_KEY;
        key.offset = (u64)-1;
 
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-
        while (1) {
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                if (ret < 0)
@@ -1031,6 +1238,50 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
                btrfs_release_path(path);
        }
        btrfs_release_path(path);
+
+       return nlink;
+}
+
+/*
+ * There are a few corners where the link count of the file can't
+ * be properly maintained during replay.  So, instead of adding
+ * lots of complexity to the log code, we just scan the backrefs
+ * for any file that has been through replay.
+ *
+ * The scan will update the link count on the inode to reflect the
+ * number of back refs found.  If it goes down to zero, the iput
+ * will free the inode.
+ */
+static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
+                                          struct btrfs_root *root,
+                                          struct inode *inode)
+{
+       struct btrfs_path *path;
+       int ret;
+       u64 nlink = 0;
+       u64 ino = btrfs_ino(inode);
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       ret = count_inode_refs(root, inode, path);
+       if (ret < 0)
+               goto out;
+
+       nlink = ret;
+
+       ret = count_inode_extrefs(root, inode, path);
+       if (ret == -ENOENT)
+               ret = 0;
+
+       if (ret < 0)
+               goto out;
+
+       nlink += ret;
+
+       ret = 0;
+
        if (nlink != inode->i_nlink) {
                set_nlink(inode, nlink);
                btrfs_update_inode(trans, root, inode);
@@ -1046,9 +1297,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
                ret = insert_orphan_item(trans, root, ino);
                BUG_ON(ret);
        }
-       btrfs_free_path(path);
 
-       return 0;
+out:
+       btrfs_free_path(path);
+       return ret;
 }
 
 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
@@ -1695,6 +1947,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
                        ret = add_inode_ref(wc->trans, root, log, path,
                                            eb, i, &key);
                        BUG_ON(ret && ret != -ENOENT);
+               } else if (key.type == BTRFS_INODE_EXTREF_KEY) {
+                       ret = add_inode_ref(wc->trans, root, log, path,
+                                           eb, i, &key);
+                       BUG_ON(ret && ret != -ENOENT);
                } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
                        ret = replay_one_extent(wc->trans, root, path,
                                                eb, i, &key);
@@ -2037,7 +2293,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
                wait_log_commit(trans, root, root->log_transid - 1);
        while (1) {
-               unsigned long batch = root->log_batch;
+               int batch = atomic_read(&root->log_batch);
                /* when we're on an ssd, just kick the log commit out */
                if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
                        mutex_unlock(&root->log_mutex);
@@ -2045,7 +2301,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                        mutex_lock(&root->log_mutex);
                }
                wait_for_writer(trans, root);
-               if (batch == root->log_batch)
+               if (batch == atomic_read(&root->log_batch))
                        break;
        }
 
@@ -2074,7 +2330,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
        btrfs_set_root_node(&log->root_item, log->node);
 
-       root->log_batch = 0;
        root->log_transid++;
        log->log_transid = root->log_transid;
        root->log_start_pid = 0;
@@ -2087,7 +2342,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        mutex_unlock(&root->log_mutex);
 
        mutex_lock(&log_root_tree->log_mutex);
-       log_root_tree->log_batch++;
+       atomic_inc(&log_root_tree->log_batch);
        atomic_inc(&log_root_tree->log_writers);
        mutex_unlock(&log_root_tree->log_mutex);
 
@@ -2157,7 +2412,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
                                btrfs_header_level(log_root_tree->node));
 
-       log_root_tree->log_batch = 0;
        log_root_tree->log_transid++;
        smp_mb();
 
@@ -2171,9 +2425,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         * in and cause problems either.
         */
        btrfs_scrub_pause_super(root);
-       write_ctree_super(trans, root->fs_info->tree_root, 1);
+       ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
        btrfs_scrub_continue_super(root);
-       ret = 0;
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_wake_log_root;
+       }
 
        mutex_lock(&root->log_mutex);
        if (root->last_log_commit < log_transid)
@@ -2209,7 +2466,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
 
        while (1) {
                ret = find_first_extent_bit(&log->dirty_log_pages,
-                               0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
+                               0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
+                               NULL);
                if (ret)
                        break;
 
@@ -2646,6 +2904,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
        int ret;
        struct btrfs_key key;
        struct btrfs_key found_key;
+       int start_slot;
 
        key.objectid = objectid;
        key.type = max_key_type;
@@ -2667,8 +2926,18 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
                if (found_key.objectid != objectid)
                        break;
 
-               ret = btrfs_del_item(trans, log, path);
-               if (ret)
+               found_key.offset = 0;
+               found_key.type = 0;
+               ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
+                                      &start_slot);
+
+               ret = btrfs_del_items(trans, log, path, start_slot,
+                                     path->slots[0] - start_slot + 1);
+               /*
+                * If start slot isn't 0 then we don't need to re-search, we've
+                * found the last guy with the objectid in this tree.
+                */
+               if (ret || start_slot != 0)
                        break;
                btrfs_release_path(path);
        }
@@ -2678,14 +2947,64 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
        return ret;
 }
 
+/*
+ * Fill the on-disk inode item @item in @leaf from the in-memory @inode.
+ *
+ * @log_inode_only selects what is stored for generation and size:
+ *   - non-zero: both are written as 0, which lets the recovery code tell a
+ *     log entry that only says 'this inode exists' from one that says
+ *     'update this inode with these values';
+ *   - zero: BTRFS_I(inode)->generation and inode->i_size are stored.
+ *
+ * The item's transid is taken from @trans and its block group is set to 0.
+ */
+static void fill_inode_item(struct btrfs_trans_handle *trans,
+                           struct extent_buffer *leaf,
+                           struct btrfs_inode_item *item,
+                           struct inode *inode, int log_inode_only)
+{
+       /* generation and size depend on the kind of log entry being written */
+       if (!log_inode_only) {
+               btrfs_set_inode_size(leaf, item, inode->i_size);
+               btrfs_set_inode_generation(leaf, item,
+                                          BTRFS_I(inode)->generation);
+       } else {
+               btrfs_set_inode_size(leaf, item, 0);
+               btrfs_set_inode_generation(leaf, item, 0);
+       }
+
+       /* ownership, mode and link count */
+       btrfs_set_inode_mode(leaf, item, inode->i_mode);
+       btrfs_set_inode_uid(leaf, item, inode->i_uid);
+       btrfs_set_inode_gid(leaf, item, inode->i_gid);
+       btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+
+       /* timestamps: access, modification, change */
+       btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
+                              inode->i_atime.tv_sec);
+       btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
+                               inode->i_atime.tv_nsec);
+       btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
+                              inode->i_mtime.tv_sec);
+       btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
+                               inode->i_mtime.tv_nsec);
+       btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
+                              inode->i_ctime.tv_sec);
+       btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
+                               inode->i_ctime.tv_nsec);
+
+       /* remaining bookkeeping fields */
+       btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
+       btrfs_set_inode_sequence(leaf, item, inode->i_version);
+       btrfs_set_inode_transid(leaf, item, trans->transid);
+       btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
+       btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
+       btrfs_set_inode_block_group(leaf, item, 0);
+}
+
 static noinline int copy_items(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *log,
+                              struct inode *inode,
                               struct btrfs_path *dst_path,
                               struct extent_buffer *src,
                               int start_slot, int nr, int inode_only)
 {
        unsigned long src_offset;
        unsigned long dst_offset;
+       struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
        struct btrfs_file_extent_item *extent;
        struct btrfs_inode_item *inode_item;
        int ret;
@@ -2694,6 +3013,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
        char *ins_data;
        int i;
        struct list_head ordered_sums;
+       int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
        INIT_LIST_HEAD(&ordered_sums);
 
@@ -2722,29 +3042,23 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 
                src_offset = btrfs_item_ptr_offset(src, start_slot + i);
 
-               copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
-                                  src_offset, ins_sizes[i]);
-
-               if (inode_only == LOG_INODE_EXISTS &&
-                   ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
+               if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
                        inode_item = btrfs_item_ptr(dst_path->nodes[0],
                                                    dst_path->slots[0],
                                                    struct btrfs_inode_item);
-                       btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
-
-                       /* set the generation to zero so the recover code
-                        * can tell the difference between an logging
-                        * just to say 'this inode exists' and a logging
-                        * to say 'update this inode with these values'
-                        */
-                       btrfs_set_inode_generation(dst_path->nodes[0],
-                                                  inode_item, 0);
+                       fill_inode_item(trans, dst_path->nodes[0], inode_item,
+                                       inode, inode_only == LOG_INODE_EXISTS);
+               } else {
+                       copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
+                                          src_offset, ins_sizes[i]);
                }
+
                /* take a reference on file data extents so that truncates
                 * or deletes of this inode don't have to relog the inode
                 * again
                 */
-               if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
+               if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
+                   !skip_csum) {
                        int found_type;
                        extent = btrfs_item_ptr(src, start_slot + i,
                                                struct btrfs_file_extent_item);
@@ -2753,8 +3067,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
                                continue;
 
                        found_type = btrfs_file_extent_type(src, extent);
-                       if (found_type == BTRFS_FILE_EXTENT_REG ||
-                           found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+                       if (found_type == BTRFS_FILE_EXTENT_REG) {
                                u64 ds, dl, cs, cl;
                                ds = btrfs_file_extent_disk_bytenr(src,
                                                                extent);
@@ -2803,6 +3116,239 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
        return ret;
 }
 
+/*
+ * list_sort() comparison callback: orders extent maps by ascending ->start.
+ * @priv is unused.  Returns -1, 0 or 1.
+ */
+static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+       struct extent_map *lhs = list_entry(a, struct extent_map, list);
+       struct extent_map *rhs = list_entry(b, struct extent_map, list);
+
+       if (lhs->start == rhs->start)
+               return 0;
+
+       return lhs->start < rhs->start ? -1 : 1;
+}
+
+/*
+ * State carried across log_one_extent() calls so that file extent items
+ * sitting in consecutive slots of one leaf can be copied to the log in a
+ * single copy_items() call.
+ */
+struct log_args {
+       struct extent_buffer *src;      /* leaf the pending batch is copied from */
+       u64 next_offset;                /* file offset right after the last extent seen */
+       int start_slot;                 /* first slot of the pending batch */
+       int nr;                         /* slots in the pending batch, 0 if none */
+};
+
+/*
+ * Log the file extent items that back one modified extent map.
+ *
+ * Looks up the EXTENT_DATA items of @inode in @root that cover the range
+ * starting at em->mod_start and spanning em->mod_len bytes, and batches runs
+ * of consecutive leaf slots in @args, flushing each batch to the log with
+ * copy_items().  A batch may still be pending in @args on return
+ * (args->nr != 0, with @path not released) so that the caller can extend it
+ * with the next extent map or flush it itself.
+ *
+ * If the inode was already logged in this transaction, the range is first
+ * dropped from the log tree.
+ *
+ * Returns 0 on success, -ENOENT when no file extent item covering the range
+ * is found, or the non-zero value returned by a failing helper.
+ */
+static int log_one_extent(struct btrfs_trans_handle *trans,
+                         struct inode *inode, struct btrfs_root *root,
+                         struct extent_map *em, struct btrfs_path *path,
+                         struct btrfs_path *dst_path, struct log_args *args)
+{
+       struct btrfs_root *log = root->log_root;
+       struct btrfs_file_extent_item *fi;
+       struct btrfs_key key;
+       u64 start = em->mod_start;
+       u64 search_start = start;
+       u64 len = em->mod_len;
+       u64 num_bytes;
+       int nritems;
+       int ret;
+
+       /* already logged in this transaction: drop the stale range first */
+       if (BTRFS_I(inode)->logged_trans == trans->transid) {
+               ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
+                                          start + len, NULL, 0);
+               if (ret)
+                       return ret;
+       }
+
+       while (len) {
+               /*
+                * A batch is pending from an earlier pass: keep walking the
+                * leaf @path already points at instead of searching again.
+                */
+               if (args->nr)
+                       goto next_slot;
+again:
+               key.objectid = btrfs_ino(inode);
+               key.type = BTRFS_EXTENT_DATA_KEY;
+               key.offset = search_start;
+
+               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+               if (ret < 0)
+                       return ret;
+
+               if (ret) {
+                       /*
+                        * A rare case where we can have an em for a section of
+                        * a larger extent so we need to make sure that this em
+                        * falls within the extent we've found.  If not we just
+                        * bail and go back to ye-olde way of doing things but
+                        * it happens often enough in testing that we need to do
+                        * this dance to make sure.
+                        */
+                       do {
+                               /*
+                                * Nothing before us in this leaf: retry the
+                                * search one byte earlier, unless we are
+                                * already at offset 0.
+                                */
+                               if (path->slots[0] == 0) {
+                                       btrfs_release_path(path);
+                                       if (search_start == 0)
+                                               return -ENOENT;
+                                       search_start--;
+                                       goto again;
+                               }
+
+                               path->slots[0]--;
+                               btrfs_item_key_to_cpu(path->nodes[0], &key,
+                                                     path->slots[0]);
+                               if (key.objectid != btrfs_ino(inode) ||
+                                   key.type != BTRFS_EXTENT_DATA_KEY) {
+                                       btrfs_release_path(path);
+                                       return -ENOENT;
+                               }
+                       } while (key.offset > start);
+
+                       /* the item found must actually reach into our range */
+                       fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                                           struct btrfs_file_extent_item);
+                       num_bytes = btrfs_file_extent_num_bytes(path->nodes[0],
+                                                               fi);
+                       if (key.offset + num_bytes <= start) {
+                               btrfs_release_path(path);
+                               return -ENOENT;
+                       }
+               }
+               args->src = path->nodes[0];
+next_slot:
+               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+               fi = btrfs_item_ptr(args->src, path->slots[0],
+                                   struct btrfs_file_extent_item);
+               /*
+                * Extend the pending batch when this slot directly follows it;
+                * otherwise flush the batch and start a new one at this slot.
+                */
+               if (args->nr &&
+                   args->start_slot + args->nr == path->slots[0]) {
+                       args->nr++;
+               } else if (args->nr) {
+                       ret = copy_items(trans, inode, dst_path, args->src,
+                                        args->start_slot, args->nr,
+                                        LOG_INODE_ALL);
+                       if (ret)
+                               return ret;
+                       args->nr = 1;
+                       args->start_slot = path->slots[0];
+               } else if (!args->nr) {
+                       args->nr = 1;
+                       args->start_slot = path->slots[0];
+               }
+               nritems = btrfs_header_nritems(path->nodes[0]);
+               path->slots[0]++;
+               num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
+               if (len < num_bytes) {
+                       /* I _think_ this is ok, envision we write to a
+                        * preallocated space that is adjacent to a previously
+                        * written preallocated space that gets merged when we
+                        * mark this preallocated space written.  If we do not
+                        * have the adjacent extent in cache then when we copy
+                        * this extent it could end up being larger than our EM
+                        * thinks it is, which is a-ok, so just set len to 0.
+                        */
+                       len = 0;
+               } else {
+                       len -= num_bytes;
+               }
+               /* remember where the extent just batched ends */
+               start = key.offset + num_bytes;
+               args->next_offset = start;
+               search_start = start;
+
+               /*
+                * More items remain in this leaf: continue with the next slot,
+                * or stop with the batch left pending once the range is done.
+                */
+               if (path->slots[0] < nritems) {
+                       if (len)
+                               goto next_slot;
+                       break;
+               }
+
+               /* ran off the end of the leaf: flush what has been batched */
+               if (args->nr) {
+                       ret = copy_items(trans, inode, dst_path, args->src,
+                                        args->start_slot, args->nr,
+                                        LOG_INODE_ALL);
+                       if (ret)
+                               return ret;
+                       args->nr = 0;
+                       btrfs_release_path(path);
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Log the extents of @inode that were modified after the last committed
+ * transaction.
+ *
+ * Empties tree->modified_extents, keeps the maps whose generation is newer
+ * than fs_info->last_trans_committed on a private list (taking a reference
+ * and setting EXTENT_FLAG_LOGGING on each), sorts that list by start offset
+ * and logs every map with log_one_extent().  tree->lock is dropped around
+ * the logging work and retaken before the list is touched again.
+ *
+ * Returns 0 on success or the first non-zero value returned by copy_items()
+ * or log_one_extent(); after an error the remaining maps are only released.
+ */
+static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    struct inode *inode,
+                                    struct btrfs_path *path,
+                                    struct btrfs_path *dst_path)
+{
+       struct log_args args;
+       struct extent_map *em, *n;
+       struct list_head extents;
+       struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
+       u64 test_gen;
+       int ret = 0;
+
+       INIT_LIST_HEAD(&extents);
+
+       /* start with no pending batch (args.nr == 0) */
+       memset(&args, 0, sizeof(args));
+
+       write_lock(&tree->lock);
+       test_gen = root->fs_info->last_trans_committed;
+
+       /* move the maps that still need logging onto our private list */
+       list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
+               list_del_init(&em->list);
+               if (em->generation <= test_gen)
+                       continue;
+               /* Need a ref to keep it from getting evicted from cache */
+               atomic_inc(&em->refs);
+               set_bit(EXTENT_FLAG_LOGGING, &em->flags);
+               list_add_tail(&em->list, &extents);
+       }
+
+       /* ascending start offset, so neighbouring extents can be batched */
+       list_sort(NULL, &extents, extent_cmp);
+
+       while (!list_empty(&extents)) {
+               em = list_entry(extents.next, struct extent_map, list);
+
+               list_del_init(&em->list);
+               clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+
+               /*
+                * If we had an error we just need to delete everybody from our
+                * private list.
+                */
+               if (ret) {
+                       free_extent_map(em);
+                       continue;
+               }
+
+               /* the logging work below runs without tree->lock held */
+               write_unlock(&tree->lock);
+
+               /*
+                * If the previous EM and the last extent we left off on aren't
+                * sequential then we need to copy the items we have and redo
+                * our search
+                */
+               if (args.nr && em->mod_start != args.next_offset) {
+                       ret = copy_items(trans, inode, dst_path, args.src,
+                                        args.start_slot, args.nr,
+                                        LOG_INODE_ALL);
+                       if (ret) {
+                               free_extent_map(em);
+                               write_lock(&tree->lock);
+                               continue;
+                       }
+                       btrfs_release_path(path);
+                       args.nr = 0;
+               }
+
+               ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
+               /* drop the reference taken when the map was queued */
+               free_extent_map(em);
+               write_lock(&tree->lock);
+       }
+       WARN_ON(!list_empty(&extents));
+       write_unlock(&tree->lock);
+
+       /* flush the batch left pending by the last log_one_extent() call */
+       if (!ret && args.nr)
+               ret = copy_items(trans, inode, dst_path, args.src,
+                                args.start_slot, args.nr, LOG_INODE_ALL);
+       btrfs_release_path(path);
+       return ret;
+}
+
 /* log a single inode in the tree log.
  * At least one parent directory for this inode must exist in the tree
  * or be logged already.
@@ -2832,6 +3378,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
        int nritems;
        int ins_start_slot = 0;
        int ins_nr;
+       bool fast_search = false;
        u64 ino = btrfs_ino(inode);
 
        log = root->log_root;
@@ -2851,21 +3398,23 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
        max_key.objectid = ino;
 
-       /* today the code can only do partial logging of directories */
-       if (!S_ISDIR(inode->i_mode))
-           inode_only = LOG_INODE_ALL;
 
+       /* today the code can only do partial logging of directories */
        if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
                max_key.type = BTRFS_XATTR_ITEM_KEY;
        else
                max_key.type = (u8)-1;
        max_key.offset = (u64)-1;
 
-       ret = btrfs_commit_inode_delayed_items(trans, inode);
-       if (ret) {
-               btrfs_free_path(path);
-               btrfs_free_path(dst_path);
-               return ret;
+       /* Only run delayed items if we are a dir or a new file */
+       if (S_ISDIR(inode->i_mode) ||
+           BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
+               ret = btrfs_commit_inode_delayed_items(trans, inode);
+               if (ret) {
+                       btrfs_free_path(path);
+                       btrfs_free_path(dst_path);
+                       return ret;
+               }
        }
 
        mutex_lock(&BTRFS_I(inode)->log_mutex);
@@ -2881,7 +3430,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                        max_key_type = BTRFS_XATTR_ITEM_KEY;
                ret = drop_objectid_items(trans, log, path, ino, max_key_type);
        } else {
-               ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
+               if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                                      &BTRFS_I(inode)->runtime_flags)) {
+                       ret = btrfs_truncate_inode_items(trans, log,
+                                                        inode, 0, 0);
+               } else {
+                       fast_search = true;
+                       max_key.type = BTRFS_XATTR_ITEM_KEY;
+                       ret = drop_objectid_items(trans, log, path, ino,
+                                                 BTRFS_XATTR_ITEM_KEY);
+               }
        }
        if (ret) {
                err = ret;
@@ -2912,7 +3470,7 @@ again:
                        goto next_slot;
                }
 
-               ret = copy_items(trans, log, dst_path, src, ins_start_slot,
+               ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
                                 ins_nr, inode_only);
                if (ret) {
                        err = ret;
@@ -2930,7 +3488,7 @@ next_slot:
                        goto again;
                }
                if (ins_nr) {
-                       ret = copy_items(trans, log, dst_path, src,
+                       ret = copy_items(trans, inode, dst_path, src,
                                         ins_start_slot,
                                         ins_nr, inode_only);
                        if (ret) {
@@ -2951,8 +3509,7 @@ next_slot:
                        break;
        }
        if (ins_nr) {
-               ret = copy_items(trans, log, dst_path, src,
-                                ins_start_slot,
+               ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
                                 ins_nr, inode_only);
                if (ret) {
                        err = ret;
@@ -2960,7 +3517,24 @@ next_slot:
                }
                ins_nr = 0;
        }
-       WARN_ON(ins_nr);
+
+       if (fast_search) {
+               btrfs_release_path(path);
+               btrfs_release_path(dst_path);
+               ret = btrfs_log_changed_extents(trans, root, inode, path,
+                                               dst_path);
+               if (ret) {
+                       err = ret;
+                       goto out_unlock;
+               }
+       } else {
+               struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
+               struct extent_map *em, *n;
+
+               list_for_each_entry_safe(em, n, &tree->modified_extents, list)
+                       list_del_init(&em->list);
+       }
+
        if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
                btrfs_release_path(path);
                btrfs_release_path(dst_path);
@@ -2971,6 +3545,7 @@ next_slot:
                }
        }
        BTRFS_I(inode)->logged_trans = trans->transid;
+       BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
 out_unlock:
        mutex_unlock(&BTRFS_I(inode)->log_mutex);
 
@@ -3138,7 +3713,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 end_trans:
        dput(old_parent);
        if (ret < 0) {
-               BUG_ON(ret != -ENOSPC);
+               WARN_ON(ret != -ENOSPC);
                root->fs_info->last_trans_log_full_commit = trans->transid;
                ret = 1;
        }
index ab942f46b3dd81e06348c4950901f3e4eef87016..99be4c138db6dac51fbcb7afefc6931ce3cf583d 100644 (file)
@@ -143,14 +143,13 @@ EXPORT_SYMBOL(ulist_free);
  * In case of allocation failure -ENOMEM is returned and the ulist stays
  * unaltered.
  */
-int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
-             gfp_t gfp_mask)
+int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
 {
        return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
 }
 
-int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
-                   unsigned long *old_aux, gfp_t gfp_mask)
+int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
+                   u64 *old_aux, gfp_t gfp_mask)
 {
        int i;
 
index 21bdc8ec813046ac56e3c7db0739bcdba7ac188a..21a1963439c3030c0146a1994f47a38fd43e0a95 100644 (file)
@@ -33,7 +33,7 @@ struct ulist_iterator {
  */
 struct ulist_node {
        u64 val;                /* value to store */
-       unsigned long aux;      /* auxiliary value saved along with the val */
+       u64 aux;                /* auxiliary value saved along with the val */
 };
 
 struct ulist {
@@ -65,10 +65,9 @@ void ulist_fini(struct ulist *ulist);
 void ulist_reinit(struct ulist *ulist);
 struct ulist *ulist_alloc(gfp_t gfp_mask);
 void ulist_free(struct ulist *ulist);
-int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
-             gfp_t gfp_mask);
-int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
-                   unsigned long *old_aux, gfp_t gfp_mask);
+int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
+int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
+                   u64 *old_aux, gfp_t gfp_mask);
 struct ulist_node *ulist_next(struct ulist *ulist,
                              struct ulist_iterator *uiter);
 
index 88b969aeeb71a53128ae941e569ad19f7b1038c3..029b903a4ae3797322e05090790b86c9e8596c43 100644 (file)
@@ -639,7 +639,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
                bdev = blkdev_get_by_path(device->name->str, flags, holder);
                if (IS_ERR(bdev)) {
-                       printk(KERN_INFO "open %s failed\n", device->name->str);
+                       printk(KERN_INFO "btrfs: open %s failed\n", device->name->str);
                        goto error;
                }
                filemap_write_and_wait(bdev->bd_inode->i_mapping);
@@ -1475,6 +1475,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                free_fs_devices(cur_devices);
        }
 
+       root->fs_info->num_tolerated_disk_barrier_failures =
+               btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
+
        /*
         * at this point, the device is zero sized.  We want to
         * remove it from the devices list and zero out the old super
@@ -1775,15 +1778,21 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
        if (seeding_dev) {
                ret = init_first_rw_device(trans, root, device);
-               if (ret)
+               if (ret) {
+                       btrfs_abort_transaction(trans, root, ret);
                        goto error_trans;
+               }
                ret = btrfs_finish_sprout(trans, root);
-               if (ret)
+               if (ret) {
+                       btrfs_abort_transaction(trans, root, ret);
                        goto error_trans;
+               }
        } else {
                ret = btrfs_add_device(trans, root, device);
-               if (ret)
+               if (ret) {
+                       btrfs_abort_transaction(trans, root, ret);
                        goto error_trans;
+               }
        }
 
        /*
@@ -1793,6 +1802,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
        btrfs_clear_space_info_full(root->fs_info);
 
        unlock_chunks(root);
+       root->fs_info->num_tolerated_disk_barrier_failures =
+               btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
        ret = btrfs_commit_transaction(trans, root);
 
        if (seeding_dev) {
@@ -1814,7 +1825,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
 error_trans:
        unlock_chunks(root);
-       btrfs_abort_transaction(trans, root, ret);
        btrfs_end_transaction(trans, root);
        rcu_string_free(device->name);
        kfree(device);
@@ -2804,6 +2814,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
                }
        }
 
+       if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+               int num_tolerated_disk_barrier_failures;
+               u64 target = bctl->sys.target;
+
+               num_tolerated_disk_barrier_failures =
+                       btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+               if (num_tolerated_disk_barrier_failures > 0 &&
+                   (target &
+                    (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+                     BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
+                       num_tolerated_disk_barrier_failures = 0;
+               else if (num_tolerated_disk_barrier_failures > 1 &&
+                        (target &
+                         (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
+                       num_tolerated_disk_barrier_failures = 1;
+
+               fs_info->num_tolerated_disk_barrier_failures =
+                       num_tolerated_disk_barrier_failures;
+       }
+
        ret = insert_balance_item(fs_info->tree_root, bctl);
        if (ret && ret != -EEXIST)
                goto out;
@@ -2836,6 +2866,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
                __cancel_balance(fs_info);
        }
 
+       if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+               fs_info->num_tolerated_disk_barrier_failures =
+                       btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+       }
+
        wake_up(&fs_info->balance_wait_q);
 
        return ret;
@@ -3608,12 +3643,16 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
        ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
                                  &sys_chunk_size, &sys_stripe_size,
                                  sys_chunk_offset, alloc_profile);
-       if (ret)
-               goto abort;
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out;
+       }
 
        ret = btrfs_add_device(trans, fs_info->chunk_root, device);
-       if (ret)
-               goto abort;
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out;
+       }
 
        /*
         * Modifying chunk tree needs allocating new blocks from both
@@ -3623,19 +3662,19 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
         */
        ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
                                   chunk_size, stripe_size);
-       if (ret)
-               goto abort;
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out;
+       }
 
        ret = __finish_chunk_alloc(trans, extent_root, sys_map,
                                   sys_chunk_offset, sys_chunk_size,
                                   sys_stripe_size);
        if (ret)
-               goto abort;
+               btrfs_abort_transaction(trans, root, ret);
 
-       return 0;
+out:
 
-abort:
-       btrfs_abort_transaction(trans, root, ret);
        return ret;
 }
 
@@ -3760,7 +3799,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
        read_unlock(&em_tree->lock);
 
        if (!em) {
-               printk(KERN_CRIT "unable to find logical %llu len %llu\n",
+               printk(KERN_CRIT "btrfs: unable to find logical %llu len %llu\n",
                       (unsigned long long)logical,
                       (unsigned long long)*length);
                BUG();
@@ -4217,7 +4256,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 
        total_devs = bbio->num_stripes;
        if (map_length < length) {
-               printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
+               printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
                       "len %llu\n", (unsigned long long)logical,
                       (unsigned long long)length,
                       (unsigned long long)map_length);
index 92c20654cc55b18d77630104eca7085ecde66793..9acb846c3e7f775e78684d2117c861c70b9890bc 100644 (file)
@@ -97,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws,
        *total_in = 0;
 
        if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
-               printk(KERN_WARNING "deflateInit failed\n");
+               printk(KERN_WARNING "btrfs: deflateInit failed\n");
                ret = -1;
                goto out;
        }
@@ -125,7 +125,7 @@ static int zlib_compress_pages(struct list_head *ws,
        while (workspace->def_strm.total_in < len) {
                ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
                if (ret != Z_OK) {
-                       printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
+                       printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n",
                               ret);
                        zlib_deflateEnd(&workspace->def_strm);
                        ret = -1;
@@ -252,7 +252,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
        }
 
        if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
-               printk(KERN_WARNING "inflateInit failed\n");
+               printk(KERN_WARNING "btrfs: inflateInit failed\n");
                return -1;
        }
        while (workspace->inf_strm.total_in < srclen) {
@@ -336,7 +336,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
        }
 
        if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
-               printk(KERN_WARNING "inflateInit failed\n");
+               printk(KERN_WARNING "btrfs: inflateInit failed\n");
                return -1;
        }
 
index 91b91e8056737878e71897b9e9e96d000b722d52..54fab041b22ab7a5102879a1b82ac5b4f97966df 100644 (file)
@@ -445,6 +445,7 @@ TRACE_EVENT(btrfs_delayed_tree_ref,
                __field(        u64,  ref_root          )
                __field(        int,  level             )
                __field(        int,  type              )
+               __field(        u64,  seq               )
        ),
 
        TP_fast_assign(
@@ -455,17 +456,19 @@ TRACE_EVENT(btrfs_delayed_tree_ref,
                __entry->ref_root       = full_ref->root;
                __entry->level          = full_ref->level;
                __entry->type           = ref->type;
+               __entry->seq            = ref->seq;
        ),
 
        TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, "
                  "parent = %llu(%s), ref_root = %llu(%s), level = %d, "
-                 "type = %s",
+                 "type = %s, seq = %llu",
                  (unsigned long long)__entry->bytenr,
                  (unsigned long long)__entry->num_bytes,
                  show_ref_action(__entry->action),
                  show_root_type(__entry->parent),
                  show_root_type(__entry->ref_root),
-                 __entry->level, show_ref_type(__entry->type))
+                 __entry->level, show_ref_type(__entry->type),
+                 (unsigned long long)__entry->seq)
 );
 
 TRACE_EVENT(btrfs_delayed_data_ref,
@@ -485,6 +488,7 @@ TRACE_EVENT(btrfs_delayed_data_ref,
                __field(        u64,  owner             )
                __field(        u64,  offset            )
                __field(        int,  type              )
+               __field(        u64,  seq               )
        ),
 
        TP_fast_assign(
@@ -496,11 +500,12 @@ TRACE_EVENT(btrfs_delayed_data_ref,
                __entry->owner          = full_ref->objectid;
                __entry->offset         = full_ref->offset;
                __entry->type           = ref->type;
+               __entry->seq            = ref->seq;
        ),
 
        TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, "
                  "parent = %llu(%s), ref_root = %llu(%s), owner = %llu, "
-                 "offset = %llu, type = %s",
+                 "offset = %llu, type = %s, seq = %llu",
                  (unsigned long long)__entry->bytenr,
                  (unsigned long long)__entry->num_bytes,
                  show_ref_action(__entry->action),
@@ -508,7 +513,8 @@ TRACE_EVENT(btrfs_delayed_data_ref,
                  show_root_type(__entry->ref_root),
                  (unsigned long long)__entry->owner,
                  (unsigned long long)__entry->offset,
-                 show_ref_type(__entry->type))
+                 show_ref_type(__entry->type),
+                 (unsigned long long)__entry->seq)
 );
 
 TRACE_EVENT(btrfs_delayed_ref_head,