fs/btrfs/ctree.c
Btrfs: Use map_private_extent_buffer during generic_bin_search
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include "ctree.h"
21 #include "disk-io.h"
22 #include "transaction.h"
23 #include "print-tree.h"
24 #include "locking.h"
25
26 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
27                       *root, struct btrfs_path *path, int level);
28 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
29                       *root, struct btrfs_key *ins_key,
30                       struct btrfs_path *path, int data_size, int extend);
31 static int push_node_left(struct btrfs_trans_handle *trans,
32                           struct btrfs_root *root, struct extent_buffer *dst,
33                           struct extent_buffer *src, int empty);
34 static int balance_node_right(struct btrfs_trans_handle *trans,
35                               struct btrfs_root *root,
36                               struct extent_buffer *dst_buf,
37                               struct extent_buffer *src_buf);
38 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
39                    struct btrfs_path *path, int level, int slot);
40
41 inline void btrfs_init_path(struct btrfs_path *p)
42 {
43         memset(p, 0, sizeof(*p));
44 }
45
46 struct btrfs_path *btrfs_alloc_path(void)
47 {
48         struct btrfs_path *path;
49         path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
50         if (path) {
51                 btrfs_init_path(path);
52                 path->reada = 1;
53         }
54         return path;
55 }
56
57 /* this also releases the path */
58 void btrfs_free_path(struct btrfs_path *p)
59 {
60         btrfs_release_path(NULL, p);
61         kmem_cache_free(btrfs_path_cachep, p);
62 }
63
64 /*
65  * path release drops references on the extent buffers in the path
66  * and it drops any locks held by this path
67  *
68  * It is safe to call this on paths that have no locks or extent buffers held.
69  */
70 void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
71 {
72         int i;
73
74         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
75                 p->slots[i] = 0;
76                 if (!p->nodes[i])
77                         continue;
78                 if (p->locks[i]) {
79                         btrfs_tree_unlock(p->nodes[i]);
80                         p->locks[i] = 0;
81                 }
82                 free_extent_buffer(p->nodes[i]);
83                 p->nodes[i] = NULL;
84         }
85 }
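/*
 * A minimal usage sketch for the path helpers above (illustrative only,
 * with placeholder key values; btrfs_search_slot is defined later in this
 * file, and a NULL trans handle with cow == 0 means a read-only lookup):
 *
 *	struct btrfs_key key = { .objectid = 256, .type = 1, .offset = 0 };
 *	struct extent_buffer *leaf;
 *	struct btrfs_path *path;
 *	int slot, ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret == 0) {
 *		leaf = path->nodes[0];
 *		slot = path->slots[0];
 *		(read the item out of leaf/slot here)
 *	}
 *	btrfs_free_path(path);
 */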
86
87 /*
88  * safely gets a reference on the root node of a tree.  A lock
89  * is not taken, so a concurrent writer may put a different node
90  * at the root of the tree.  See btrfs_lock_root_node for the
91  * looping required.
92  *
93  * The extent buffer returned by this has a reference taken, so
94  * it won't disappear.  It may stop being the root of the tree
95  * at any time because there are no locks held.
96  */
97 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
98 {
99         struct extent_buffer *eb;
100         spin_lock(&root->node_lock);
101         eb = root->node;
102         extent_buffer_get(eb);
103         spin_unlock(&root->node_lock);
104         return eb;
105 }
106
107 /* loop around taking references on and locking the root node of the
108  * tree until you end up with a lock on the root.  A locked buffer
109  * is returned, with a reference held.
110  */
111 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
112 {
113         struct extent_buffer *eb;
114
115         while (1) {
116                 eb = btrfs_root_node(root);
117                 btrfs_tree_lock(eb);
118
119                 spin_lock(&root->node_lock);
120                 if (eb == root->node) {
121                         spin_unlock(&root->node_lock);
122                         break;
123                 }
124                 spin_unlock(&root->node_lock);
125
126                 btrfs_tree_unlock(eb);
127                 free_extent_buffer(eb);
128         }
129         return eb;
130 }
131
132 /* cowonly roots (everything not a reference counted cow subvolume) just get
133  * put onto a simple dirty list.  transaction.c walks this list to make sure
134  * they get properly updated on disk.
135  */
136 static void add_root_to_dirty_list(struct btrfs_root *root)
137 {
138         if (root->track_dirty && list_empty(&root->dirty_list)) {
139                 list_add(&root->dirty_list,
140                          &root->fs_info->dirty_cowonly_roots);
141         }
142 }
143
144 /*
145  * used by snapshot creation to make a copy of a root for a tree with
146  * a given objectid.  The buffer with the new root node is returned in
147  * cow_ret, and this func returns zero on success or a negative error code.
148  */
149 int btrfs_copy_root(struct btrfs_trans_handle *trans,
150                       struct btrfs_root *root,
151                       struct extent_buffer *buf,
152                       struct extent_buffer **cow_ret, u64 new_root_objectid)
153 {
154         struct extent_buffer *cow;
155         u32 nritems;
156         int ret = 0;
157         int level;
158         struct btrfs_root *new_root;
159
160         new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
161         if (!new_root)
162                 return -ENOMEM;
163
164         memcpy(new_root, root, sizeof(*new_root));
165         new_root->root_key.objectid = new_root_objectid;
166
167         WARN_ON(root->ref_cows && trans->transid !=
168                 root->fs_info->running_transaction->transid);
169         WARN_ON(root->ref_cows && trans->transid != root->last_trans);
170
171         level = btrfs_header_level(buf);
172         nritems = btrfs_header_nritems(buf);
173
174         cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
175                                      new_root_objectid, trans->transid,
176                                      level, buf->start, 0);
177         if (IS_ERR(cow)) {
178                 kfree(new_root);
179                 return PTR_ERR(cow);
180         }
181
182         copy_extent_buffer(cow, buf, 0, 0, cow->len);
183         btrfs_set_header_bytenr(cow, cow->start);
184         btrfs_set_header_generation(cow, trans->transid);
185         btrfs_set_header_owner(cow, new_root_objectid);
186         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
187
188         write_extent_buffer(cow, root->fs_info->fsid,
189                             (unsigned long)btrfs_header_fsid(cow),
190                             BTRFS_FSID_SIZE);
191
192         WARN_ON(btrfs_header_generation(buf) > trans->transid);
193         ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
194         kfree(new_root);
195
196         if (ret)
197                 return ret;
198
199         btrfs_mark_buffer_dirty(cow);
200         *cow_ret = cow;
201         return 0;
202 }
203
204 /*
205  * does the dirty work in cow of a single block.  The parent block
206  * (if supplied) is updated to point to the new cow copy.  The new
207  * buffer is marked dirty and returned locked.  If you modify the block
208  * it needs to be marked dirty again.
209  *
210  * search_start -- an allocation hint for the new block
211  *
212  * empty_size -- a hint that you plan on doing more cow.  This is the size in
213  * bytes of free space the allocator should try to find next to the block it
214  * returns.  This is just a hint and may be ignored by the allocator.
215  *
216  * prealloc_dest -- if you have already reserved a destination for the cow,
217  * this uses that block instead of allocating a new one.  btrfs_alloc_reserved_extent
218  * is used to finish the allocation.
219  */
220 static int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
221                              struct btrfs_root *root,
222                              struct extent_buffer *buf,
223                              struct extent_buffer *parent, int parent_slot,
224                              struct extent_buffer **cow_ret,
225                              u64 search_start, u64 empty_size,
226                              u64 prealloc_dest)
227 {
228         u64 parent_start;
229         struct extent_buffer *cow;
230         u32 nritems;
231         int ret = 0;
232         int level;
233         int unlock_orig = 0;
234
235         if (*cow_ret == buf)
236                 unlock_orig = 1;
237
238         WARN_ON(!btrfs_tree_locked(buf));
239
240         if (parent)
241                 parent_start = parent->start;
242         else
243                 parent_start = 0;
244
245         WARN_ON(root->ref_cows && trans->transid !=
246                 root->fs_info->running_transaction->transid);
247         WARN_ON(root->ref_cows && trans->transid != root->last_trans);
248
249         level = btrfs_header_level(buf);
250         nritems = btrfs_header_nritems(buf);
251
252         if (prealloc_dest) {
253                 struct btrfs_key ins;
254
255                 ins.objectid = prealloc_dest;
256                 ins.offset = buf->len;
257                 ins.type = BTRFS_EXTENT_ITEM_KEY;
258
259                 ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
260                                                   root->root_key.objectid,
261                                                   trans->transid, level, &ins);
262                 BUG_ON(ret);
263                 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
264                                             buf->len);
265         } else {
266                 cow = btrfs_alloc_free_block(trans, root, buf->len,
267                                              parent_start,
268                                              root->root_key.objectid,
269                                              trans->transid, level,
270                                              search_start, empty_size);
271         }
272         if (IS_ERR(cow))
273                 return PTR_ERR(cow);
274
275         copy_extent_buffer(cow, buf, 0, 0, cow->len);
276         btrfs_set_header_bytenr(cow, cow->start);
277         btrfs_set_header_generation(cow, trans->transid);
278         btrfs_set_header_owner(cow, root->root_key.objectid);
279         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
280
281         write_extent_buffer(cow, root->fs_info->fsid,
282                             (unsigned long)btrfs_header_fsid(cow),
283                             BTRFS_FSID_SIZE);
284
285         WARN_ON(btrfs_header_generation(buf) > trans->transid);
286         if (btrfs_header_generation(buf) != trans->transid) {
287                 u32 nr_extents;
288                 ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
289                 if (ret)
290                         return ret;
291
292                 ret = btrfs_cache_ref(trans, root, buf, nr_extents);
293                 WARN_ON(ret);
294         } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
295                 /*
296                  * There are only two places that can drop reference to
297                  * tree blocks owned by living reloc trees, one is here,
298                  * the other place is btrfs_drop_subtree. In both places,
299                  * we check reference count while tree block is locked.
300                  * Furthermore, if reference count is one, it won't get
301                  * increased by someone else.
302                  */
303                 u32 refs;
304                 ret = btrfs_lookup_extent_ref(trans, root, buf->start,
305                                               buf->len, &refs);
306                 BUG_ON(ret);
307                 if (refs == 1) {
308                         ret = btrfs_update_ref(trans, root, buf, cow,
309                                                0, nritems);
310                         clean_tree_block(trans, root, buf);
311                 } else {
312                         ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
313                 }
314                 BUG_ON(ret);
315         } else {
316                 ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
317                 if (ret)
318                         return ret;
319                 clean_tree_block(trans, root, buf);
320         }
321
322         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
323                 ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
324                 WARN_ON(ret);
325         }
326
327         if (buf == root->node) {
328                 WARN_ON(parent && parent != buf);
329
330                 spin_lock(&root->node_lock);
331                 root->node = cow;
332                 extent_buffer_get(cow);
333                 spin_unlock(&root->node_lock);
334
335                 if (buf != root->commit_root) {
336                         btrfs_free_extent(trans, root, buf->start,
337                                           buf->len, buf->start,
338                                           root->root_key.objectid,
339                                           btrfs_header_generation(buf),
340                                           level, 1);
341                 }
342                 free_extent_buffer(buf);
343                 add_root_to_dirty_list(root);
344         } else {
345                 btrfs_set_node_blockptr(parent, parent_slot,
346                                         cow->start);
347                 WARN_ON(trans->transid == 0);
348                 btrfs_set_node_ptr_generation(parent, parent_slot,
349                                               trans->transid);
350                 btrfs_mark_buffer_dirty(parent);
351                 WARN_ON(btrfs_header_generation(parent) != trans->transid);
352                 btrfs_free_extent(trans, root, buf->start, buf->len,
353                                   parent_start, btrfs_header_owner(parent),
354                                   btrfs_header_generation(parent), level, 1);
355         }
356         if (unlock_orig)
357                 btrfs_tree_unlock(buf);
358         free_extent_buffer(buf);
359         btrfs_mark_buffer_dirty(cow);
360         *cow_ret = cow;
361         return 0;
362 }
363
364 /*
365  * cows a single block, see __btrfs_cow_block for the real work.
366  * This version of it has extra checks so that a block isn't cow'd more than
367  * once per transaction, as long as it hasn't been written yet
368  */
369 int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
370                     struct btrfs_root *root, struct extent_buffer *buf,
371                     struct extent_buffer *parent, int parent_slot,
372                     struct extent_buffer **cow_ret, u64 prealloc_dest)
373 {
374         u64 search_start;
375         int ret;
376
377         if (trans->transaction != root->fs_info->running_transaction) {
378                 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
379                        root->fs_info->running_transaction->transid);
380                 WARN_ON(1);
381         }
382         if (trans->transid != root->fs_info->generation) {
383                 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
384                        root->fs_info->generation);
385                 WARN_ON(1);
386         }
387
388         spin_lock(&root->fs_info->hash_lock);
389         if (btrfs_header_generation(buf) == trans->transid &&
390             btrfs_header_owner(buf) == root->root_key.objectid &&
391             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
392                 *cow_ret = buf;
393                 spin_unlock(&root->fs_info->hash_lock);
394                 WARN_ON(prealloc_dest);
395                 return 0;
396         }
397         spin_unlock(&root->fs_info->hash_lock);
398         search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
399         ret = __btrfs_cow_block(trans, root, buf, parent,
400                                  parent_slot, cow_ret, search_start, 0,
401                                  prealloc_dest);
402         return ret;
403 }
404
405 /*
406  * helper function for defrag to decide if two blocks pointed to by a
407  * node are actually close by
408  */
409 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
410 {
411         if (blocknr < other && other - (blocknr + blocksize) < 32768)
412                 return 1;
413         if (blocknr > other && blocknr - (other + blocksize) < 32768)
414                 return 1;
415         return 0;
416 }
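/*
 * Example for close_blocks() above: with a 4K blocksize, blocks at
 * offsets 0 and 20K count as close (the 16K gap is under the 32K
 * threshold), while blocks at 0 and 40K do not.
 */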
417
418 /*
419  * compare two keys in a memcmp fashion
420  */
421 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
422 {
423         struct btrfs_key k1;
424
425         btrfs_disk_key_to_cpu(&k1, disk);
426
427         if (k1.objectid > k2->objectid)
428                 return 1;
429         if (k1.objectid < k2->objectid)
430                 return -1;
431         if (k1.type > k2->type)
432                 return 1;
433         if (k1.type < k2->type)
434                 return -1;
435         if (k1.offset > k2->offset)
436                 return 1;
437         if (k1.offset < k2->offset)
438                 return -1;
439         return 0;
440 }
441
442 /*
443  * same as comp_keys, only with two struct btrfs_key pointers
444  */
445 static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
446 {
447         if (k1->objectid > k2->objectid)
448                 return 1;
449         if (k1->objectid < k2->objectid)
450                 return -1;
451         if (k1->type > k2->type)
452                 return 1;
453         if (k1->type < k2->type)
454                 return -1;
455         if (k1->offset > k2->offset)
456                 return 1;
457         if (k1->offset < k2->offset)
458                 return -1;
459         return 0;
460 }
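/*
 * For example, writing keys as (objectid, type, offset) with purely
 * illustrative values: (256, 1, 0) sorts before (256, 84, 0), which sorts
 * before (257, 1, 0), because objectid is compared first, then type, then
 * offset.
 */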
461
462 /*
463  * this is used by the defrag code to go through all the
464  * leaves pointed to by a node and reallocate them so that
465  * disk order is close to key order
466  */
467 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
468                        struct btrfs_root *root, struct extent_buffer *parent,
469                        int start_slot, int cache_only, u64 *last_ret,
470                        struct btrfs_key *progress)
471 {
472         struct extent_buffer *cur;
473         u64 blocknr;
474         u64 gen;
475         u64 search_start = *last_ret;
476         u64 last_block = 0;
477         u64 other;
478         u32 parent_nritems;
479         int end_slot;
480         int i;
481         int err = 0;
482         int parent_level;
483         int uptodate;
484         u32 blocksize;
485         int progress_passed = 0;
486         struct btrfs_disk_key disk_key;
487
488         parent_level = btrfs_header_level(parent);
489         if (cache_only && parent_level != 1)
490                 return 0;
491
492         if (trans->transaction != root->fs_info->running_transaction) {
493                 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
494                        root->fs_info->running_transaction->transid);
495                 WARN_ON(1);
496         }
497         if (trans->transid != root->fs_info->generation) {
498                 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
499                        root->fs_info->generation);
500                 WARN_ON(1);
501         }
502
503         parent_nritems = btrfs_header_nritems(parent);
504         blocksize = btrfs_level_size(root, parent_level - 1);
505         end_slot = parent_nritems;
506
507         if (parent_nritems == 1)
508                 return 0;
509
510         for (i = start_slot; i < end_slot; i++) {
511                 int close = 1;
512
513                 if (!parent->map_token) {
514                         map_extent_buffer(parent,
515                                         btrfs_node_key_ptr_offset(i),
516                                         sizeof(struct btrfs_key_ptr),
517                                         &parent->map_token, &parent->kaddr,
518                                         &parent->map_start, &parent->map_len,
519                                         KM_USER1);
520                 }
521                 btrfs_node_key(parent, &disk_key, i);
522                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
523                         continue;
524
525                 progress_passed = 1;
526                 blocknr = btrfs_node_blockptr(parent, i);
527                 gen = btrfs_node_ptr_generation(parent, i);
528                 if (last_block == 0)
529                         last_block = blocknr;
530
531                 if (i > 0) {
532                         other = btrfs_node_blockptr(parent, i - 1);
533                         close = close_blocks(blocknr, other, blocksize);
534                 }
535                 if (!close && i < end_slot - 2) {
536                         other = btrfs_node_blockptr(parent, i + 1);
537                         close = close_blocks(blocknr, other, blocksize);
538                 }
539                 if (close) {
540                         last_block = blocknr;
541                         continue;
542                 }
543                 if (parent->map_token) {
544                         unmap_extent_buffer(parent, parent->map_token,
545                                             KM_USER1);
546                         parent->map_token = NULL;
547                 }
548
549                 cur = btrfs_find_tree_block(root, blocknr, blocksize);
550                 if (cur)
551                         uptodate = btrfs_buffer_uptodate(cur, gen);
552                 else
553                         uptodate = 0;
554                 if (!cur || !uptodate) {
555                         if (cache_only) {
556                                 free_extent_buffer(cur);
557                                 continue;
558                         }
559                         if (!cur) {
560                                 cur = read_tree_block(root, blocknr,
561                                                          blocksize, gen);
562                         } else if (!uptodate) {
563                                 btrfs_read_buffer(cur, gen);
564                         }
565                 }
566                 if (search_start == 0)
567                         search_start = last_block;
568
569                 btrfs_tree_lock(cur);
570                 err = __btrfs_cow_block(trans, root, cur, parent, i,
571                                         &cur, search_start,
572                                         min(16 * blocksize,
573                                             (end_slot - i) * blocksize), 0);
574                 if (err) {
575                         btrfs_tree_unlock(cur);
576                         free_extent_buffer(cur);
577                         break;
578                 }
579                 search_start = cur->start;
580                 last_block = cur->start;
581                 *last_ret = search_start;
582                 btrfs_tree_unlock(cur);
583                 free_extent_buffer(cur);
584         }
585         if (parent->map_token) {
586                 unmap_extent_buffer(parent, parent->map_token,
587                                     KM_USER1);
588                 parent->map_token = NULL;
589         }
590         return err;
591 }
592
593 /*
594  * The leaf data grows from end-to-front in the node.
595  * This returns the address of the start of the last item,
596  * which is the end of the leaf data stack.
597  */
598 static inline unsigned int leaf_data_end(struct btrfs_root *root,
599                                          struct extent_buffer *leaf)
600 {
601         u32 nr = btrfs_header_nritems(leaf);
602         if (nr == 0)
603                 return BTRFS_LEAF_DATA_SIZE(root);
604         return btrfs_item_offset_nr(leaf, nr - 1);
605 }
606
607 /*
608  * extra debugging checks to make sure all the items in a node are
609  * well formed and in the proper order
610  */
611 static int check_node(struct btrfs_root *root, struct btrfs_path *path,
612                       int level)
613 {
614         struct extent_buffer *parent = NULL;
615         struct extent_buffer *node = path->nodes[level];
616         struct btrfs_disk_key parent_key;
617         struct btrfs_disk_key node_key;
618         int parent_slot;
619         int slot;
620         struct btrfs_key cpukey;
621         u32 nritems = btrfs_header_nritems(node);
622
623         if (path->nodes[level + 1])
624                 parent = path->nodes[level + 1];
625
626         slot = path->slots[level];
627         BUG_ON(nritems == 0);
628         if (parent) {
629                 parent_slot = path->slots[level + 1];
630                 btrfs_node_key(parent, &parent_key, parent_slot);
631                 btrfs_node_key(node, &node_key, 0);
632                 BUG_ON(memcmp(&parent_key, &node_key,
633                               sizeof(struct btrfs_disk_key)));
634                 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
635                        btrfs_header_bytenr(node));
636         }
637         BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
638         if (slot != 0) {
639                 btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
640                 btrfs_node_key(node, &node_key, slot);
641                 BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
642         }
643         if (slot < nritems - 1) {
644                 btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
645                 btrfs_node_key(node, &node_key, slot);
646                 BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
647         }
648         return 0;
649 }
650
651 /*
652  * extra checking to make sure all the items in a leaf are
653  * well formed and in the proper order
654  */
655 static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
656                       int level)
657 {
658         struct extent_buffer *leaf = path->nodes[level];
659         struct extent_buffer *parent = NULL;
660         int parent_slot;
661         struct btrfs_key cpukey;
662         struct btrfs_disk_key parent_key;
663         struct btrfs_disk_key leaf_key;
664         int slot = path->slots[0];
665
666         u32 nritems = btrfs_header_nritems(leaf);
667
668         if (path->nodes[level + 1])
669                 parent = path->nodes[level + 1];
670
671         if (nritems == 0)
672                 return 0;
673
674         if (parent) {
675                 parent_slot = path->slots[level + 1];
676                 btrfs_node_key(parent, &parent_key, parent_slot);
677                 btrfs_item_key(leaf, &leaf_key, 0);
678
679                 BUG_ON(memcmp(&parent_key, &leaf_key,
680                        sizeof(struct btrfs_disk_key)));
681                 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
682                        btrfs_header_bytenr(leaf));
683         }
684 #if 0
685         for (i = 0; nritems > 1 && i < nritems - 2; i++) {
686                 btrfs_item_key_to_cpu(leaf, &cpukey, i + 1);
687                 btrfs_item_key(leaf, &leaf_key, i);
688                 if (comp_keys(&leaf_key, &cpukey) >= 0) {
689                         btrfs_print_leaf(root, leaf);
690                         printk("slot %d offset bad key\n", i);
691                         BUG_ON(1);
692                 }
693                 if (btrfs_item_offset_nr(leaf, i) !=
694                         btrfs_item_end_nr(leaf, i + 1)) {
695                         btrfs_print_leaf(root, leaf);
696                         printk("slot %d offset bad\n", i);
697                         BUG_ON(1);
698                 }
699                 if (i == 0) {
700                         if (btrfs_item_offset_nr(leaf, i) +
701                                btrfs_item_size_nr(leaf, i) !=
702                                BTRFS_LEAF_DATA_SIZE(root)) {
703                                 btrfs_print_leaf(root, leaf);
704                                 printk("slot %d first offset bad\n", i);
705                                 BUG_ON(1);
706                         }
707                 }
708         }
709         if (nritems > 0) {
710                 if (btrfs_item_size_nr(leaf, nritems - 1) > 4096) {
711                                 btrfs_print_leaf(root, leaf);
712                                 printk("slot %d bad size \n", nritems - 1);
713                                 BUG_ON(1);
714                 }
715         }
716 #endif
717         if (slot != 0 && slot < nritems - 1) {
718                 btrfs_item_key(leaf, &leaf_key, slot);
719                 btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
720                 if (comp_keys(&leaf_key, &cpukey) <= 0) {
721                         btrfs_print_leaf(root, leaf);
722                         printk("slot %d offset bad key\n", slot);
723                         BUG_ON(1);
724                 }
725                 if (btrfs_item_offset_nr(leaf, slot - 1) !=
726                        btrfs_item_end_nr(leaf, slot)) {
727                         btrfs_print_leaf(root, leaf);
728                         printk("slot %d offset bad\n", slot);
729                         BUG_ON(1);
730                 }
731         }
732         if (slot < nritems - 1) {
733                 btrfs_item_key(leaf, &leaf_key, slot);
734                 btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
735                 BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
736                 if (btrfs_item_offset_nr(leaf, slot) !=
737                         btrfs_item_end_nr(leaf, slot + 1)) {
738                         btrfs_print_leaf(root, leaf);
739                         printk("slot %d offset bad\n", slot);
740                         BUG_ON(1);
741                 }
742         }
743         BUG_ON(btrfs_item_offset_nr(leaf, 0) +
744                btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
745         return 0;
746 }
747
748 static int noinline check_block(struct btrfs_root *root,
749                                 struct btrfs_path *path, int level)
750 {
751         u64 found_start;
752         return 0;
753         if (btrfs_header_level(path->nodes[level]) != level)
754             printk("warning: bad level %Lu wanted %d found %d\n",
755                    path->nodes[level]->start, level,
756                    btrfs_header_level(path->nodes[level]));
757         found_start = btrfs_header_bytenr(path->nodes[level]);
758         if (found_start != path->nodes[level]->start) {
759             printk("warning: bad bytenr %Lu found %Lu\n",
760                    path->nodes[level]->start, found_start);
761         }
762 #if 0
763         struct extent_buffer *buf = path->nodes[level];
764
765         if (memcmp_extent_buffer(buf, root->fs_info->fsid,
766                                  (unsigned long)btrfs_header_fsid(buf),
767                                  BTRFS_FSID_SIZE)) {
768                 printk("warning bad block %Lu\n", buf->start);
769                 return 1;
770         }
771 #endif
772         if (level == 0)
773                 return check_leaf(root, path, level);
774         return check_node(root, path, level);
775 }
776
777 /*
778  * search for key in the extent_buffer.  The items start at offset p,
779  * and they are item_size apart.  There are 'max' items in p.
780  *
781  * the slot in the array is returned via slot, and it points to
782  * the place where you would insert key if it is not found in
783  * the array.
784  *
785  * slot may point to max if the key is bigger than all of the keys
786  */
787 static noinline int generic_bin_search(struct extent_buffer *eb,
788                                        unsigned long p,
789                                        int item_size, struct btrfs_key *key,
790                                        int max, int *slot)
791 {
792         int low = 0;
793         int high = max;
794         int mid;
795         int ret;
796         struct btrfs_disk_key *tmp = NULL;
797         struct btrfs_disk_key unaligned;
798         unsigned long offset;
799         char *map_token = NULL;
800         char *kaddr = NULL;
801         unsigned long map_start = 0;
802         unsigned long map_len = 0;
803         int err;
804
805         while (low < high) {
806                 mid = (low + high) / 2;
807                 offset = p + mid * item_size;
808
809                 if (!map_token || offset < map_start ||
810                     (offset + sizeof(struct btrfs_disk_key)) >
811                     map_start + map_len) {
812                         if (map_token) {
813                                 unmap_extent_buffer(eb, map_token, KM_USER0);
814                                 map_token = NULL;
815                         }
816
817                         err = map_private_extent_buffer(eb, offset,
818                                                 sizeof(struct btrfs_disk_key),
819                                                 &map_token, &kaddr,
820                                                 &map_start, &map_len, KM_USER0);
821
822                         if (!err) {
823                                 tmp = (struct btrfs_disk_key *)(kaddr + offset -
824                                                         map_start);
825                         } else {
826                                 read_extent_buffer(eb, &unaligned,
827                                                    offset, sizeof(unaligned));
828                                 tmp = &unaligned;
829                         }
830
831                 } else {
832                         tmp = (struct btrfs_disk_key *)(kaddr + offset -
833                                                         map_start);
834                 }
835                 ret = comp_keys(tmp, key);
836
837                 if (ret < 0)
838                         low = mid + 1;
839                 else if (ret > 0)
840                         high = mid;
841                 else {
842                         *slot = mid;
843                         if (map_token)
844                                 unmap_extent_buffer(eb, map_token, KM_USER0);
845                         return 0;
846                 }
847         }
848         *slot = low;
849         if (map_token)
850                 unmap_extent_buffer(eb, map_token, KM_USER0);
851         return 1;
852 }
853
854 /*
855  * simple bin_search frontend that does the right thing for
856  * leaves vs nodes
857  */
858 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
859                       int level, int *slot)
860 {
861         if (level == 0) {
862                 return generic_bin_search(eb,
863                                           offsetof(struct btrfs_leaf, items),
864                                           sizeof(struct btrfs_item),
865                                           key, btrfs_header_nritems(eb),
866                                           slot);
867         } else {
868                 return generic_bin_search(eb,
869                                           offsetof(struct btrfs_node, ptrs),
870                                           sizeof(struct btrfs_key_ptr),
871                                           key, btrfs_header_nritems(eb),
872                                           slot);
873         }
874         return -1;
875 }
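/*
 * A small sketch of the return convention shared by generic_bin_search()
 * and bin_search(), assuming keys that differ only in objectid, with
 * objectids 5, 9 and 12 in slots 0-2: searching for objectid 9 returns 0
 * with *slot == 1; searching for objectid 10 returns 1 with *slot == 2
 * (the slot where the key would be inserted); searching for a key past
 * the largest returns 1 with *slot == nritems.
 */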
876
877 /* given a node and slot number, this reads the block it points to.  The
878  * extent buffer is returned with a reference taken (but unlocked).
879  * NULL is returned on error.
880  */
881 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
882                                    struct extent_buffer *parent, int slot)
883 {
884         int level = btrfs_header_level(parent);
885         if (slot < 0)
886                 return NULL;
887         if (slot >= btrfs_header_nritems(parent))
888                 return NULL;
889
890         BUG_ON(level == 0);
891
892         return read_tree_block(root, btrfs_node_blockptr(parent, slot),
893                        btrfs_level_size(root, level - 1),
894                        btrfs_node_ptr_generation(parent, slot));
895 }
896
897 /*
898  * node level balancing, used to make sure nodes are in proper order for
899  * item deletion.  We balance from the top down, so we have to make sure
900  * that a deletion won't leave a node completely empty later on.
901  */
902 static noinline int balance_level(struct btrfs_trans_handle *trans,
903                          struct btrfs_root *root,
904                          struct btrfs_path *path, int level)
905 {
906         struct extent_buffer *right = NULL;
907         struct extent_buffer *mid;
908         struct extent_buffer *left = NULL;
909         struct extent_buffer *parent = NULL;
910         int ret = 0;
911         int wret;
912         int pslot;
913         int orig_slot = path->slots[level];
914         int err_on_enospc = 0;
915         u64 orig_ptr;
916
917         if (level == 0)
918                 return 0;
919
920         mid = path->nodes[level];
921         WARN_ON(!path->locks[level]);
922         WARN_ON(btrfs_header_generation(mid) != trans->transid);
923
924         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
925
926         if (level < BTRFS_MAX_LEVEL - 1)
927                 parent = path->nodes[level + 1];
928         pslot = path->slots[level + 1];
929
930         /*
931          * deal with the case where there is only one pointer in the root
932          * by promoting the node below to a root
933          */
934         if (!parent) {
935                 struct extent_buffer *child;
936
937                 if (btrfs_header_nritems(mid) != 1)
938                         return 0;
939
940                 /* promote the child to a root */
941                 child = read_node_slot(root, mid, 0);
942                 BUG_ON(!child);
943                 btrfs_tree_lock(child);
944                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
945                 BUG_ON(ret);
946
947                 spin_lock(&root->node_lock);
948                 root->node = child;
949                 spin_unlock(&root->node_lock);
950
951                 ret = btrfs_update_extent_ref(trans, root, child->start,
952                                               mid->start, child->start,
953                                               root->root_key.objectid,
954                                               trans->transid, level - 1);
955                 BUG_ON(ret);
956
957                 add_root_to_dirty_list(root);
958                 btrfs_tree_unlock(child);
959                 path->locks[level] = 0;
960                 path->nodes[level] = NULL;
961                 clean_tree_block(trans, root, mid);
962                 btrfs_tree_unlock(mid);
963                 /* once for the path */
964                 free_extent_buffer(mid);
965                 ret = btrfs_free_extent(trans, root, mid->start, mid->len,
966                                         mid->start, root->root_key.objectid,
967                                         btrfs_header_generation(mid),
968                                         level, 1);
969                 /* once for the root ptr */
970                 free_extent_buffer(mid);
971                 return ret;
972         }
973         if (btrfs_header_nritems(mid) >
974             BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
975                 return 0;
976
977         if (btrfs_header_nritems(mid) < 2)
978                 err_on_enospc = 1;
979
980         left = read_node_slot(root, parent, pslot - 1);
981         if (left) {
982                 btrfs_tree_lock(left);
983                 wret = btrfs_cow_block(trans, root, left,
984                                        parent, pslot - 1, &left, 0);
985                 if (wret) {
986                         ret = wret;
987                         goto enospc;
988                 }
989         }
990         right = read_node_slot(root, parent, pslot + 1);
991         if (right) {
992                 btrfs_tree_lock(right);
993                 wret = btrfs_cow_block(trans, root, right,
994                                        parent, pslot + 1, &right, 0);
995                 if (wret) {
996                         ret = wret;
997                         goto enospc;
998                 }
999         }
1000
1001         /* first, try to make some room in the middle buffer */
1002         if (left) {
1003                 orig_slot += btrfs_header_nritems(left);
1004                 wret = push_node_left(trans, root, left, mid, 1);
1005                 if (wret < 0)
1006                         ret = wret;
1007                 if (btrfs_header_nritems(mid) < 2)
1008                         err_on_enospc = 1;
1009         }
1010
1011         /*
1012          * then try to empty the right most buffer into the middle
1013          */
1014         if (right) {
1015                 wret = push_node_left(trans, root, mid, right, 1);
1016                 if (wret < 0 && wret != -ENOSPC)
1017                         ret = wret;
1018                 if (btrfs_header_nritems(right) == 0) {
1019                         u64 bytenr = right->start;
1020                         u64 generation = btrfs_header_generation(parent);
1021                         u32 blocksize = right->len;
1022
1023                         clean_tree_block(trans, root, right);
1024                         btrfs_tree_unlock(right);
1025                         free_extent_buffer(right);
1026                         right = NULL;
1027                         wret = del_ptr(trans, root, path, level + 1, pslot +
1028                                        1);
1029                         if (wret)
1030                                 ret = wret;
1031                         wret = btrfs_free_extent(trans, root, bytenr,
1032                                                  blocksize, parent->start,
1033                                                  btrfs_header_owner(parent),
1034                                                  generation, level, 1);
1035                         if (wret)
1036                                 ret = wret;
1037                 } else {
1038                         struct btrfs_disk_key right_key;
1039                         btrfs_node_key(right, &right_key, 0);
1040                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1041                         btrfs_mark_buffer_dirty(parent);
1042                 }
1043         }
1044         if (btrfs_header_nritems(mid) == 1) {
1045                 /*
1046                  * we're not allowed to leave a node with one item in the
1047                  * tree during a delete.  A deletion from lower in the tree
1048                  * could try to delete the only pointer in this node.
1049                  * So, pull some keys from the left.
1050                  * There has to be a left pointer at this point because
1051                  * otherwise we would have pulled some pointers from the
1052                  * right
1053                  */
1054                 BUG_ON(!left);
1055                 wret = balance_node_right(trans, root, mid, left);
1056                 if (wret < 0) {
1057                         ret = wret;
1058                         goto enospc;
1059                 }
1060                 if (wret == 1) {
1061                         wret = push_node_left(trans, root, left, mid, 1);
1062                         if (wret < 0)
1063                                 ret = wret;
1064                 }
1065                 BUG_ON(wret == 1);
1066         }
1067         if (btrfs_header_nritems(mid) == 0) {
1068                 /* we've managed to empty the middle node, drop it */
1069                 u64 root_gen = btrfs_header_generation(parent);
1070                 u64 bytenr = mid->start;
1071                 u32 blocksize = mid->len;
1072
1073                 clean_tree_block(trans, root, mid);
1074                 btrfs_tree_unlock(mid);
1075                 free_extent_buffer(mid);
1076                 mid = NULL;
1077                 wret = del_ptr(trans, root, path, level + 1, pslot);
1078                 if (wret)
1079                         ret = wret;
1080                 wret = btrfs_free_extent(trans, root, bytenr, blocksize,
1081                                          parent->start,
1082                                          btrfs_header_owner(parent),
1083                                          root_gen, level, 1);
1084                 if (wret)
1085                         ret = wret;
1086         } else {
1087                 /* update the parent key to reflect our changes */
1088                 struct btrfs_disk_key mid_key;
1089                 btrfs_node_key(mid, &mid_key, 0);
1090                 btrfs_set_node_key(parent, &mid_key, pslot);
1091                 btrfs_mark_buffer_dirty(parent);
1092         }
1093
1094         /* update the path */
1095         if (left) {
1096                 if (btrfs_header_nritems(left) > orig_slot) {
1097                         extent_buffer_get(left);
1098                         /* left was locked after cow */
1099                         path->nodes[level] = left;
1100                         path->slots[level + 1] -= 1;
1101                         path->slots[level] = orig_slot;
1102                         if (mid) {
1103                                 btrfs_tree_unlock(mid);
1104                                 free_extent_buffer(mid);
1105                         }
1106                 } else {
1107                         orig_slot -= btrfs_header_nritems(left);
1108                         path->slots[level] = orig_slot;
1109                 }
1110         }
1111         /* double check we haven't messed things up */
1112         check_block(root, path, level);
1113         if (orig_ptr !=
1114             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1115                 BUG();
1116 enospc:
1117         if (right) {
1118                 btrfs_tree_unlock(right);
1119                 free_extent_buffer(right);
1120         }
1121         if (left) {
1122                 if (path->nodes[level] != left)
1123                         btrfs_tree_unlock(left);
1124                 free_extent_buffer(left);
1125         }
1126         return ret;
1127 }
1128
1129 /* Node balancing for insertion.  Here we only split or push nodes around
1130  * when they are completely full.  This is also done top down, so we
1131  * have to be pessimistic.
1132  */
1133 static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
1134                                           struct btrfs_root *root,
1135                                           struct btrfs_path *path, int level)
1136 {
1137         struct extent_buffer *right = NULL;
1138         struct extent_buffer *mid;
1139         struct extent_buffer *left = NULL;
1140         struct extent_buffer *parent = NULL;
1141         int ret = 0;
1142         int wret;
1143         int pslot;
1144         int orig_slot = path->slots[level];
1145         u64 orig_ptr;
1146
1147         if (level == 0)
1148                 return 1;
1149
1150         mid = path->nodes[level];
1151         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1152         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1153
1154         if (level < BTRFS_MAX_LEVEL - 1)
1155                 parent = path->nodes[level + 1];
1156         pslot = path->slots[level + 1];
1157
1158         if (!parent)
1159                 return 1;
1160
1161         left = read_node_slot(root, parent, pslot - 1);
1162
1163         /* first, try to make some room in the middle buffer */
1164         if (left) {
1165                 u32 left_nr;
1166
1167                 btrfs_tree_lock(left);
1168                 left_nr = btrfs_header_nritems(left);
1169                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1170                         wret = 1;
1171                 } else {
1172                         ret = btrfs_cow_block(trans, root, left, parent,
1173                                               pslot - 1, &left, 0);
1174                         if (ret)
1175                                 wret = 1;
1176                         else {
1177                                 wret = push_node_left(trans, root,
1178                                                       left, mid, 0);
1179                         }
1180                 }
1181                 if (wret < 0)
1182                         ret = wret;
1183                 if (wret == 0) {
1184                         struct btrfs_disk_key disk_key;
1185                         orig_slot += left_nr;
1186                         btrfs_node_key(mid, &disk_key, 0);
1187                         btrfs_set_node_key(parent, &disk_key, pslot);
1188                         btrfs_mark_buffer_dirty(parent);
1189                         if (btrfs_header_nritems(left) > orig_slot) {
1190                                 path->nodes[level] = left;
1191                                 path->slots[level + 1] -= 1;
1192                                 path->slots[level] = orig_slot;
1193                                 btrfs_tree_unlock(mid);
1194                                 free_extent_buffer(mid);
1195                         } else {
1196                                 orig_slot -=
1197                                         btrfs_header_nritems(left);
1198                                 path->slots[level] = orig_slot;
1199                                 btrfs_tree_unlock(left);
1200                                 free_extent_buffer(left);
1201                         }
1202                         return 0;
1203                 }
1204                 btrfs_tree_unlock(left);
1205                 free_extent_buffer(left);
1206         }
1207         right = read_node_slot(root, parent, pslot + 1);
1208
1209         /*
1210          * then try to empty the right most buffer into the middle
1211          */
1212         if (right) {
1213                 u32 right_nr;
1214                 btrfs_tree_lock(right);
1215                 right_nr = btrfs_header_nritems(right);
1216                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1217                         wret = 1;
1218                 } else {
1219                         ret = btrfs_cow_block(trans, root, right,
1220                                               parent, pslot + 1,
1221                                               &right, 0);
1222                         if (ret)
1223                                 wret = 1;
1224                         else {
1225                                 wret = balance_node_right(trans, root,
1226                                                           right, mid);
1227                         }
1228                 }
1229                 if (wret < 0)
1230                         ret = wret;
1231                 if (wret == 0) {
1232                         struct btrfs_disk_key disk_key;
1233
1234                         btrfs_node_key(right, &disk_key, 0);
1235                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
1236                         btrfs_mark_buffer_dirty(parent);
1237
1238                         if (btrfs_header_nritems(mid) <= orig_slot) {
1239                                 path->nodes[level] = right;
1240                                 path->slots[level + 1] += 1;
1241                                 path->slots[level] = orig_slot -
1242                                         btrfs_header_nritems(mid);
1243                                 btrfs_tree_unlock(mid);
1244                                 free_extent_buffer(mid);
1245                         } else {
1246                                 btrfs_tree_unlock(right);
1247                                 free_extent_buffer(right);
1248                         }
1249                         return 0;
1250                 }
1251                 btrfs_tree_unlock(right);
1252                 free_extent_buffer(right);
1253         }
1254         return 1;
1255 }
1256
1257 /*
1258  * readahead one full node of leaves, finding things that are close
1259  * to the block in 'slot', and triggering ra on them.
1260  */
1261 static noinline void reada_for_search(struct btrfs_root *root,
1262                                       struct btrfs_path *path,
1263                                       int level, int slot, u64 objectid)
1264 {
1265         struct extent_buffer *node;
1266         struct btrfs_disk_key disk_key;
1267         u32 nritems;
1268         u64 search;
1269         u64 lowest_read;
1270         u64 highest_read;
1271         u64 nread = 0;
1272         int direction = path->reada;
1273         struct extent_buffer *eb;
1274         u32 nr;
1275         u32 blocksize;
1276         u32 nscan = 0;
1277
1278         if (level != 1)
1279                 return;
1280
1281         if (!path->nodes[level])
1282                 return;
1283
1284         node = path->nodes[level];
1285
1286         search = btrfs_node_blockptr(node, slot);
1287         blocksize = btrfs_level_size(root, level - 1);
1288         eb = btrfs_find_tree_block(root, search, blocksize);
1289         if (eb) {
1290                 free_extent_buffer(eb);
1291                 return;
1292         }
1293
1294         highest_read = search;
1295         lowest_read = search;
1296
1297         nritems = btrfs_header_nritems(node);
1298         nr = slot;
1299         while (1) {
1300                 if (direction < 0) {
1301                         if (nr == 0)
1302                                 break;
1303                         nr--;
1304                 } else if (direction > 0) {
1305                         nr++;
1306                         if (nr >= nritems)
1307                                 break;
1308                 }
1309                 if (path->reada < 0 && objectid) {
1310                         btrfs_node_key(node, &disk_key, nr);
1311                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
1312                                 break;
1313                 }
1314                 search = btrfs_node_blockptr(node, nr);
1315                 if ((search >= lowest_read && search <= highest_read) ||
1316                     (search < lowest_read && lowest_read - search <= 16384) ||
1317                     (search > highest_read && search - highest_read <= 16384)) {
1318                         readahead_tree_block(root, search, blocksize,
1319                                      btrfs_node_ptr_generation(node, nr));
1320                         nread += blocksize;
1321                 }
1322                 nscan++;
1323                 if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
1324                         break;
1325                 if (nread > (256 * 1024) || nscan > 128)
1326                         break;
1327
1328                 if (search < lowest_read)
1329                         lowest_read = search;
1330                 if (search > highest_read)
1331                         highest_read = search;
1332         }
1333 }
1334
1335 /*
1336  * when we walk down the tree, it is usually safe to unlock the higher layers in
1337  * the tree.  The exceptions are when our path goes through slot 0, because operations
1338  * on the tree might require changing key pointers higher up in the tree.
1339  *
1340  * callers might also have set path->keep_locks, which tells this code to
1341  * keep the lock if the path points to the last slot in the block.  This is
1342  * part of walking through the tree, and selecting the next slot in the higher
1343  * block.
1344  *
1345  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.
1346  * so if lowest_unlock is 1, level 0 won't be unlocked
1347  */
1348 static noinline void unlock_up(struct btrfs_path *path, int level,
1349                                int lowest_unlock)
1350 {
1351         int i;
1352         int skip_level = level;
1353         int no_skips = 0;
1354         struct extent_buffer *t;
1355
1356         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1357                 if (!path->nodes[i])
1358                         break;
1359                 if (!path->locks[i])
1360                         break;
1361                 if (!no_skips && path->slots[i] == 0) {
1362                         skip_level = i + 1;
1363                         continue;
1364                 }
1365                 if (!no_skips && path->keep_locks) {
1366                         u32 nritems;
1367                         t = path->nodes[i];
1368                         nritems = btrfs_header_nritems(t);
1369                         if (nritems < 1 || path->slots[i] >= nritems - 1) {
1370                                 skip_level = i + 1;
1371                                 continue;
1372                         }
1373                 }
1374                 if (skip_level < i && i >= lowest_unlock)
1375                         no_skips = 1;
1376
1377                 t = path->nodes[i];
1378                 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1379                         btrfs_tree_unlock(t);
1380                         path->locks[i] = 0;
1381                 }
1382         }
1383 }
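
/*
 * Hypothetical usage sketch (not part of the original file): a caller that
 * wants to walk forward after the search keeps the higher-level locks by
 * setting path->keep_locks before btrfs_search_slot().  The key values
 * below are made up for illustration only.
 */
static int __maybe_unused example_search_keep_locks(struct btrfs_trans_handle *trans,
                                                    struct btrfs_root *root,
                                                    u64 objectid)
{
        struct btrfs_path *path;
        struct btrfs_key key;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = objectid;
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;

        /* keep the lock when the path points at the last slot of a node */
        path->keep_locks = 1;
        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);

        /* ... use the still-locked parents to step to the next key ... */

        btrfs_free_path(path);
        return ret;
}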
1384
1385 /*
1386  * look for key in the tree.  path is filled in with nodes along the way
1387  * if key is found, we return zero and you can find the item in the leaf
1388  * level of the path (level 0)
1389  *
1390  * If the key isn't found, the path points to the slot where it should
1391  * be inserted, and 1 is returned.  If there are other errors during the
1392  * search a negative error number is returned.
1393  *
1394  * if ins_len > 0, nodes and leaves will be split as we walk down the
1395  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
1396  * possible)
1397  */
1398 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1399                       *root, struct btrfs_key *key, struct btrfs_path *p, int
1400                       ins_len, int cow)
1401 {
1402         struct extent_buffer *b;
1403         struct extent_buffer *tmp;
1404         int slot;
1405         int ret;
1406         int level;
1407         int should_reada = p->reada;
1408         int lowest_unlock = 1;
1409         int blocksize;
1410         u8 lowest_level = 0;
1411         u64 blocknr;
1412         u64 gen;
1413         struct btrfs_key prealloc_block;
1414
1415         lowest_level = p->lowest_level;
1416         WARN_ON(lowest_level && ins_len > 0);
1417         WARN_ON(p->nodes[0] != NULL);
1418
1419         if (ins_len < 0)
1420                 lowest_unlock = 2;
1421
1422         prealloc_block.objectid = 0;
1423
1424 again:
1425         if (p->skip_locking)
1426                 b = btrfs_root_node(root);
1427         else
1428                 b = btrfs_lock_root_node(root);
1429
1430         while (b) {
1431                 level = btrfs_header_level(b);
1432
1433                 /*
1434                  * setup the path here so we can release it under lock
1435                  * contention with the cow code
1436                  */
1437                 p->nodes[level] = b;
1438                 if (!p->skip_locking)
1439                         p->locks[level] = 1;
1440
1441                 if (cow) {
1442                         int wret;
1443
1444                         /* can we skip cow on this block? */
1445                         spin_lock(&root->fs_info->hash_lock);
1446                         if (btrfs_header_generation(b) == trans->transid &&
1447                             btrfs_header_owner(b) == root->root_key.objectid &&
1448                             !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1449                                 spin_unlock(&root->fs_info->hash_lock);
1450                                 goto cow_done;
1451                         }
1452                         spin_unlock(&root->fs_info->hash_lock);
1453
1454                         /* ok, we have to cow, is our old prealloc the right
1455                          * size?
1456                          */
1457                         if (prealloc_block.objectid &&
1458                             prealloc_block.offset != b->len) {
1459                                 btrfs_free_reserved_extent(root,
1460                                            prealloc_block.objectid,
1461                                            prealloc_block.offset);
1462                                 prealloc_block.objectid = 0;
1463                         }
1464
1465                         /*
1466                          * for higher level blocks, try not to allocate the new
1467                          * block while holding locks on both the block and its parent.
1468                          */
1469                         if (level > 1 && !prealloc_block.objectid &&
1470                             btrfs_path_lock_waiting(p, level)) {
1471                                 u32 size = b->len;
1472                                 u64 hint = b->start;
1473
1474                                 btrfs_release_path(root, p);
1475                                 ret = btrfs_reserve_extent(trans, root,
1476                                                            size, size, 0,
1477                                                            hint, (u64)-1,
1478                                                            &prealloc_block, 0);
1479                                 BUG_ON(ret);
1480                                 goto again;
1481                         }
1482
1483                         wret = btrfs_cow_block(trans, root, b,
1484                                                p->nodes[level + 1],
1485                                                p->slots[level + 1],
1486                                                &b, prealloc_block.objectid);
1487                         prealloc_block.objectid = 0;
1488                         if (wret) {
1489                                 free_extent_buffer(b);
1490                                 ret = wret;
1491                                 goto done;
1492                         }
1493                 }
1494 cow_done:
1495                 BUG_ON(!cow && ins_len);
1496                 if (level != btrfs_header_level(b))
1497                         WARN_ON(1);
1498                 level = btrfs_header_level(b);
1499
1500                 p->nodes[level] = b;
1501                 if (!p->skip_locking)
1502                         p->locks[level] = 1;
1503
1504                 ret = check_block(root, p, level);
1505                 if (ret) {
1506                         ret = -1;
1507                         goto done;
1508                 }
1509
1510                 ret = bin_search(b, key, level, &slot);
1511                 if (level != 0) {
1512                         if (ret && slot > 0)
1513                                 slot -= 1;
1514                         p->slots[level] = slot;
1515                         if (ins_len > 0 && btrfs_header_nritems(b) >=
1516                             BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1517                                 int sret = split_node(trans, root, p, level);
1518                                 BUG_ON(sret > 0);
1519                                 if (sret) {
1520                                         ret = sret;
1521                                         goto done;
1522                                 }
1523                                 b = p->nodes[level];
1524                                 slot = p->slots[level];
1525                         } else if (ins_len < 0) {
1526                                 int sret = balance_level(trans, root, p,
1527                                                          level);
1528                                 if (sret) {
1529                                         ret = sret;
1530                                         goto done;
1531                                 }
1532                                 b = p->nodes[level];
1533                                 if (!b) {
1534                                         btrfs_release_path(NULL, p);
1535                                         goto again;
1536                                 }
1537                                 slot = p->slots[level];
1538                                 BUG_ON(btrfs_header_nritems(b) == 1);
1539                         }
1540                         unlock_up(p, level, lowest_unlock);
1541
1542                         /* this is only true while dropping a snapshot */
1543                         if (level == lowest_level) {
1544                                 ret = 0;
1545                                 goto done;
1546                         }
1547
1548                         blocknr = btrfs_node_blockptr(b, slot);
1549                         gen = btrfs_node_ptr_generation(b, slot);
1550                         blocksize = btrfs_level_size(root, level - 1);
1551
1552                         tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1553                         if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1554                                 b = tmp;
1555                         } else {
1556                                 /*
1557                                  * reduce lock contention at high levels
1558                                  * of the btree by dropping locks before
1559                                  * we read.
1560                                  */
1561                                 if (level > 1) {
1562                                         btrfs_release_path(NULL, p);
1563                                         if (tmp)
1564                                                 free_extent_buffer(tmp);
1565                                         if (should_reada)
1566                                                 reada_for_search(root, p,
1567                                                                  level, slot,
1568                                                                  key->objectid);
1569
1570                                         tmp = read_tree_block(root, blocknr,
1571                                                          blocksize, gen);
1572                                         if (tmp)
1573                                                 free_extent_buffer(tmp);
1574                                         goto again;
1575                                 } else {
1576                                         if (tmp)
1577                                                 free_extent_buffer(tmp);
1578                                         if (should_reada)
1579                                                 reada_for_search(root, p,
1580                                                                  level, slot,
1581                                                                  key->objectid);
1582                                         b = read_node_slot(root, b, slot);
1583                                 }
1584                         }
1585                         if (!p->skip_locking)
1586                                 btrfs_tree_lock(b);
1587                 } else {
1588                         p->slots[level] = slot;
1589                         if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
1590                             sizeof(struct btrfs_item) + ins_len) {
1591                                 int sret = split_leaf(trans, root, key,
1592                                                       p, ins_len, ret == 0);
1593                                 BUG_ON(sret > 0);
1594                                 if (sret) {
1595                                         ret = sret;
1596                                         goto done;
1597                                 }
1598                         }
1599                         unlock_up(p, level, lowest_unlock);
1600                         goto done;
1601                 }
1602         }
1603         ret = 1;
1604 done:
1605         if (prealloc_block.objectid) {
1606                 btrfs_free_reserved_extent(root,
1607                            prealloc_block.objectid,
1608                            prealloc_block.offset);
1609         }
1610
1611         return ret;
1612 }
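
/*
 * Hypothetical usage sketch (not part of the original file): a plain
 * read-only lookup with trans == NULL, ins_len == 0 and cow == 0, showing
 * how the return value and path->slots[0] are interpreted.  The key type
 * used here is only an example.
 */
static int __maybe_unused example_lookup(struct btrfs_root *root, u64 objectid)
{
        struct btrfs_path *path;
        struct btrfs_key key;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = objectid;
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                /* not found: path->slots[0] is where the key would go */
                ret = -ENOENT;
                goto out;
        }
        /* found: the item sits at path->slots[0] in leaf path->nodes[0] */
        WARN_ON(path->slots[0] >= btrfs_header_nritems(path->nodes[0]));
out:
        btrfs_free_path(path);
        return ret;
}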
1613
1614 int btrfs_merge_path(struct btrfs_trans_handle *trans,
1615                      struct btrfs_root *root,
1616                      struct btrfs_key *node_keys,
1617                      u64 *nodes, int lowest_level)
1618 {
1619         struct extent_buffer *eb;
1620         struct extent_buffer *parent;
1621         struct btrfs_key key;
1622         u64 bytenr;
1623         u64 generation;
1624         u32 blocksize;
1625         int level;
1626         int slot;
1627         int key_match;
1628         int ret;
1629
1630         eb = btrfs_lock_root_node(root);
1631         ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
1632         BUG_ON(ret);
1633
1634         parent = eb;
1635         while (1) {
1636                 level = btrfs_header_level(parent);
1637                 if (level == 0 || level <= lowest_level)
1638                         break;
1639
1640                 ret = bin_search(parent, &node_keys[lowest_level], level,
1641                                  &slot);
1642                 if (ret && slot > 0)
1643                         slot--;
1644
1645                 bytenr = btrfs_node_blockptr(parent, slot);
1646                 if (nodes[level - 1] == bytenr)
1647                         break;
1648
1649                 blocksize = btrfs_level_size(root, level - 1);
1650                 generation = btrfs_node_ptr_generation(parent, slot);
1651                 btrfs_node_key_to_cpu(eb, &key, slot);
1652                 key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
1653
1654                 if (generation == trans->transid) {
1655                         eb = read_tree_block(root, bytenr, blocksize,
1656                                              generation);
1657                         btrfs_tree_lock(eb);
1658                 }
1659
1660                 /*
1661                  * if node keys match and node pointer hasn't been modified
1662                  * in the running transaction, we can merge the path.  For
1663                  * blocks owned by reloc trees, the node pointer check is
1664                  * skipped, this is because these blocks are fully controlled
1665                  * by the space balance code, no one else can modify them.
1666                  */
1667                 if (!nodes[level - 1] || !key_match ||
1668                     (generation == trans->transid &&
1669                      btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
1670                         if (level == 1 || level == lowest_level + 1) {
1671                                 if (generation == trans->transid) {
1672                                         btrfs_tree_unlock(eb);
1673                                         free_extent_buffer(eb);
1674                                 }
1675                                 break;
1676                         }
1677
1678                         if (generation != trans->transid) {
1679                                 eb = read_tree_block(root, bytenr, blocksize,
1680                                                 generation);
1681                                 btrfs_tree_lock(eb);
1682                         }
1683
1684                         ret = btrfs_cow_block(trans, root, eb, parent, slot,
1685                                               &eb, 0);
1686                         BUG_ON(ret);
1687
1688                         if (root->root_key.objectid ==
1689                             BTRFS_TREE_RELOC_OBJECTID) {
1690                                 if (!nodes[level - 1]) {
1691                                         nodes[level - 1] = eb->start;
1692                                         memcpy(&node_keys[level - 1], &key,
1693                                                sizeof(node_keys[0]));
1694                                 } else {
1695                                         WARN_ON(1);
1696                                 }
1697                         }
1698
1699                         btrfs_tree_unlock(parent);
1700                         free_extent_buffer(parent);
1701                         parent = eb;
1702                         continue;
1703                 }
1704
1705                 btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
1706                 btrfs_set_node_ptr_generation(parent, slot, trans->transid);
1707                 btrfs_mark_buffer_dirty(parent);
1708
1709                 ret = btrfs_inc_extent_ref(trans, root,
1710                                         nodes[level - 1],
1711                                         blocksize, parent->start,
1712                                         btrfs_header_owner(parent),
1713                                         btrfs_header_generation(parent),
1714                                         level - 1);
1715                 BUG_ON(ret);
1716
1717                 /*
1718                  * If the block was created in the running transaction,
1719                  * it's possible this is the last reference to it, so we
1720                  * should drop the subtree.
1721                  */
1722                 if (generation == trans->transid) {
1723                         ret = btrfs_drop_subtree(trans, root, eb, parent);
1724                         BUG_ON(ret);
1725                         btrfs_tree_unlock(eb);
1726                         free_extent_buffer(eb);
1727                 } else {
1728                         ret = btrfs_free_extent(trans, root, bytenr,
1729                                         blocksize, parent->start,
1730                                         btrfs_header_owner(parent),
1731                                         btrfs_header_generation(parent),
1732                                         level - 1, 1);
1733                         BUG_ON(ret);
1734                 }
1735                 break;
1736         }
1737         btrfs_tree_unlock(parent);
1738         free_extent_buffer(parent);
1739         return 0;
1740 }
1741
1742 /*
1743  * adjust the pointers going up the tree, starting at level
1744  * making sure the right key of each node points to 'key'.
1745  * This is used after shifting pointers to the left, so it stops
1746  * fixing up pointers when a given leaf/node is not in slot 0 of the
1747  * higher levels
1748  *
1749  * If this fails to write a tree block, it returns -1, but continues
1750  * fixing up the blocks in ram so the tree is consistent.
1751  */
1752 static int fixup_low_keys(struct btrfs_trans_handle *trans,
1753                           struct btrfs_root *root, struct btrfs_path *path,
1754                           struct btrfs_disk_key *key, int level)
1755 {
1756         int i;
1757         int ret = 0;
1758         struct extent_buffer *t;
1759
1760         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1761                 int tslot = path->slots[i];
1762                 if (!path->nodes[i])
1763                         break;
1764                 t = path->nodes[i];
1765                 btrfs_set_node_key(t, key, tslot);
1766                 btrfs_mark_buffer_dirty(path->nodes[i]);
1767                 if (tslot != 0)
1768                         break;
1769         }
1770         return ret;
1771 }
1772
1773 /*
1774  * update item key.
1775  *
1776  * This function isn't completely safe.  It's the caller's responsibility
1777  * to make sure the new key won't break the sort order in the leaf.
1778  */
1779 int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1780                             struct btrfs_root *root, struct btrfs_path *path,
1781                             struct btrfs_key *new_key)
1782 {
1783         struct btrfs_disk_key disk_key;
1784         struct extent_buffer *eb;
1785         int slot;
1786
1787         eb = path->nodes[0];
1788         slot = path->slots[0];
1789         if (slot > 0) {
1790                 btrfs_item_key(eb, &disk_key, slot - 1);
1791                 if (comp_keys(&disk_key, new_key) >= 0)
1792                         return -1;
1793         }
1794         if (slot < btrfs_header_nritems(eb) - 1) {
1795                 btrfs_item_key(eb, &disk_key, slot + 1);
1796                 if (comp_keys(&disk_key, new_key) <= 0)
1797                         return -1;
1798         }
1799
1800         btrfs_cpu_key_to_disk(&disk_key, new_key);
1801         btrfs_set_item_key(eb, &disk_key, slot);
1802         btrfs_mark_buffer_dirty(eb);
1803         if (slot == 0)
1804                 fixup_low_keys(trans, root, path, &disk_key, 1);
1805         return 0;
1806 }
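
/*
 * Hypothetical usage sketch (not part of the original file): after a search
 * has positioned path->slots[0] on an existing item, a caller builds the
 * new key and relies on btrfs_set_item_key_safe() to verify it still sorts
 * between its neighbours.  The offset change is illustrative only.
 */
static void __maybe_unused example_bump_key_offset(struct btrfs_trans_handle *trans,
                                                   struct btrfs_root *root,
                                                   struct btrfs_path *path,
                                                   u64 new_offset)
{
        struct btrfs_key new_key;

        /* start from the key of the item the path currently points at */
        btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
        new_key.offset = new_offset;

        /* returns -1 if the new key would break the ordering in the leaf */
        if (btrfs_set_item_key_safe(trans, root, path, &new_key))
                WARN_ON(1);
}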
1807
1808 /*
1809  * try to push data from one node into the next node left in the
1810  * tree.
1811  *
1812  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
1813  * error, and > 0 if there was no room in the left hand block.
1814  */
1815 static int push_node_left(struct btrfs_trans_handle *trans,
1816                           struct btrfs_root *root, struct extent_buffer *dst,
1817                           struct extent_buffer *src, int empty)
1818 {
1819         int push_items = 0;
1820         int src_nritems;
1821         int dst_nritems;
1822         int ret = 0;
1823
1824         src_nritems = btrfs_header_nritems(src);
1825         dst_nritems = btrfs_header_nritems(dst);
1826         push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1827         WARN_ON(btrfs_header_generation(src) != trans->transid);
1828         WARN_ON(btrfs_header_generation(dst) != trans->transid);
1829
1830         if (!empty && src_nritems <= 8)
1831                 return 1;
1832
1833         if (push_items <= 0) {
1834                 return 1;
1835         }
1836
1837         if (empty) {
1838                 push_items = min(src_nritems, push_items);
1839                 if (push_items < src_nritems) {
1840                         /* leave at least 8 pointers in the node if
1841                          * we aren't going to empty it
1842                          */
1843                         if (src_nritems - push_items < 8) {
1844                                 if (push_items <= 8)
1845                                         return 1;
1846                                 push_items -= 8;
1847                         }
1848                 }
1849         } else
1850                 push_items = min(src_nritems - 8, push_items);
1851
1852         copy_extent_buffer(dst, src,
1853                            btrfs_node_key_ptr_offset(dst_nritems),
1854                            btrfs_node_key_ptr_offset(0),
1855                            push_items * sizeof(struct btrfs_key_ptr));
1856
1857         if (push_items < src_nritems) {
1858                 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
1859                                       btrfs_node_key_ptr_offset(push_items),
1860                                       (src_nritems - push_items) *
1861                                       sizeof(struct btrfs_key_ptr));
1862         }
1863         btrfs_set_header_nritems(src, src_nritems - push_items);
1864         btrfs_set_header_nritems(dst, dst_nritems + push_items);
1865         btrfs_mark_buffer_dirty(src);
1866         btrfs_mark_buffer_dirty(dst);
1867
1868         ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
1869         BUG_ON(ret);
1870
1871         return ret;
1872 }
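
/*
 * Hypothetical caller sketch (not part of the original file): how the
 * tri-state return values of push_node_left()/balance_node_right() are
 * typically consumed.  'left', 'mid' and 'right' are assumed to be locked
 * siblings at the same level that were already cow'd in this transaction.
 */
static int __maybe_unused example_push_either_side(struct btrfs_trans_handle *trans,
                                                   struct btrfs_root *root,
                                                   struct extent_buffer *left,
                                                   struct extent_buffer *mid,
                                                   struct extent_buffer *right)
{
        int ret;

        ret = push_node_left(trans, root, left, mid, 0);
        if (ret <= 0)
                return ret;     /* < 0 error, 0 some pointers moved left */

        /* > 0 means the left node had no room, try the right sibling */
        return balance_node_right(trans, root, right, mid);
}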
1873
1874 /*
1875  * try to push data from one node into the next node right in the
1876  * tree.
1877  *
1878  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
1879  * error, and > 0 if there was no room in the right hand block.
1880  *
1881  * this will only push up to 1/2 the contents of the left node over
1882  */
1883 static int balance_node_right(struct btrfs_trans_handle *trans,
1884                               struct btrfs_root *root,
1885                               struct extent_buffer *dst,
1886                               struct extent_buffer *src)
1887 {
1888         int push_items = 0;
1889         int max_push;
1890         int src_nritems;
1891         int dst_nritems;
1892         int ret = 0;
1893
1894         WARN_ON(btrfs_header_generation(src) != trans->transid);
1895         WARN_ON(btrfs_header_generation(dst) != trans->transid);
1896
1897         src_nritems = btrfs_header_nritems(src);
1898         dst_nritems = btrfs_header_nritems(dst);
1899         push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1900         if (push_items <= 0) {
1901                 return 1;
1902         }
1903
1904         if (src_nritems < 4) {
1905                 return 1;
1906         }
1907
1908         max_push = src_nritems / 2 + 1;
1909         /* don't try to empty the node */
1910         if (max_push >= src_nritems) {
1911                 return 1;
1912         }
1913
1914         if (max_push < push_items)
1915                 push_items = max_push;
1916
1917         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
1918                                       btrfs_node_key_ptr_offset(0),
1919                                       (dst_nritems) *
1920                                       sizeof(struct btrfs_key_ptr));
1921
1922         copy_extent_buffer(dst, src,
1923                            btrfs_node_key_ptr_offset(0),
1924                            btrfs_node_key_ptr_offset(src_nritems - push_items),
1925                            push_items * sizeof(struct btrfs_key_ptr));
1926
1927         btrfs_set_header_nritems(src, src_nritems - push_items);
1928         btrfs_set_header_nritems(dst, dst_nritems + push_items);
1929
1930         btrfs_mark_buffer_dirty(src);
1931         btrfs_mark_buffer_dirty(dst);
1932
1933         ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
1934         BUG_ON(ret);
1935
1936         return ret;
1937 }
1938
1939 /*
1940  * helper function to insert a new root level in the tree.
1941  * A new node is allocated, and a single item is inserted to
1942  * point to the existing root
1943  *
1944  * returns zero on success or < 0 on failure.
1945  */
1946 static int noinline insert_new_root(struct btrfs_trans_handle *trans,
1947                            struct btrfs_root *root,
1948                            struct btrfs_path *path, int level)
1949 {
1950         u64 lower_gen;
1951         struct extent_buffer *lower;
1952         struct extent_buffer *c;
1953         struct extent_buffer *old;
1954         struct btrfs_disk_key lower_key;
1955         int ret;
1956
1957         BUG_ON(path->nodes[level]);
1958         BUG_ON(path->nodes[level-1] != root->node);
1959
1960         lower = path->nodes[level-1];
1961         if (level == 1)
1962                 btrfs_item_key(lower, &lower_key, 0);
1963         else
1964                 btrfs_node_key(lower, &lower_key, 0);
1965
1966         c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
1967                                    root->root_key.objectid, trans->transid,
1968                                    level, root->node->start, 0);
1969         if (IS_ERR(c))
1970                 return PTR_ERR(c);
1971
1972         memset_extent_buffer(c, 0, 0, root->nodesize);
1973         btrfs_set_header_nritems(c, 1);
1974         btrfs_set_header_level(c, level);
1975         btrfs_set_header_bytenr(c, c->start);
1976         btrfs_set_header_generation(c, trans->transid);
1977         btrfs_set_header_owner(c, root->root_key.objectid);
1978
1979         write_extent_buffer(c, root->fs_info->fsid,
1980                             (unsigned long)btrfs_header_fsid(c),
1981                             BTRFS_FSID_SIZE);
1982
1983         write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
1984                             (unsigned long)btrfs_header_chunk_tree_uuid(c),
1985                             BTRFS_UUID_SIZE);
1986
1987         btrfs_set_node_key(c, &lower_key, 0);
1988         btrfs_set_node_blockptr(c, 0, lower->start);
1989         lower_gen = btrfs_header_generation(lower);
1990         WARN_ON(lower_gen != trans->transid);
1991
1992         btrfs_set_node_ptr_generation(c, 0, lower_gen);
1993
1994         btrfs_mark_buffer_dirty(c);
1995
1996         spin_lock(&root->node_lock);
1997         old = root->node;
1998         root->node = c;
1999         spin_unlock(&root->node_lock);
2000
2001         ret = btrfs_update_extent_ref(trans, root, lower->start,
2002                                       lower->start, c->start,
2003                                       root->root_key.objectid,
2004                                       trans->transid, level - 1);
2005         BUG_ON(ret);
2006
2007         /* the super has an extra ref to root->node */
2008         free_extent_buffer(old);
2009
2010         add_root_to_dirty_list(root);
2011         extent_buffer_get(c);
2012         path->nodes[level] = c;
2013         path->locks[level] = 1;
2014         path->slots[level] = 0;
2015         return 0;
2016 }
2017
2018 /*
2019  * worker function to insert a single pointer in a node.
2020  * the node should have enough room for the pointer already
2021  *
2022  * slot and level indicate where you want the key to go, and
2023  * blocknr is the block the key points to.
2024  *
2025  * returns zero on success and < 0 on any error
2026  */
2027 static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2028                       *root, struct btrfs_path *path, struct btrfs_disk_key
2029                       *key, u64 bytenr, int slot, int level)
2030 {
2031         struct extent_buffer *lower;
2032         int nritems;
2033
2034         BUG_ON(!path->nodes[level]);
2035         lower = path->nodes[level];
2036         nritems = btrfs_header_nritems(lower);
2037         if (slot > nritems)
2038                 BUG();
2039         if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
2040                 BUG();
2041         if (slot != nritems) {
2042                 memmove_extent_buffer(lower,
2043                               btrfs_node_key_ptr_offset(slot + 1),
2044                               btrfs_node_key_ptr_offset(slot),
2045                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
2046         }
2047         btrfs_set_node_key(lower, key, slot);
2048         btrfs_set_node_blockptr(lower, slot, bytenr);
2049         WARN_ON(trans->transid == 0);
2050         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2051         btrfs_set_header_nritems(lower, nritems + 1);
2052         btrfs_mark_buffer_dirty(lower);
2053         return 0;
2054 }
2055
2056 /*
2057  * split the node at the specified level in path in two.
2058  * The path is corrected to point to the appropriate node after the split
2059  *
2060  * Before splitting, this tries to make some room in the node by pushing
2061  * left and right; if either one works, it returns right away.
2062  *
2063  * returns 0 on success and < 0 on failure
2064  */
2065 static noinline int split_node(struct btrfs_trans_handle *trans,
2066                                struct btrfs_root *root,
2067                                struct btrfs_path *path, int level)
2068 {
2069         struct extent_buffer *c;
2070         struct extent_buffer *split;
2071         struct btrfs_disk_key disk_key;
2072         int mid;
2073         int ret;
2074         int wret;
2075         u32 c_nritems;
2076
2077         c = path->nodes[level];
2078         WARN_ON(btrfs_header_generation(c) != trans->transid);
2079         if (c == root->node) {
2080                 /* trying to split the root, let's make a new one */
2081                 ret = insert_new_root(trans, root, path, level + 1);
2082                 if (ret)
2083                         return ret;
2084         } else {
2085                 ret = push_nodes_for_insert(trans, root, path, level);
2086                 c = path->nodes[level];
2087                 if (!ret && btrfs_header_nritems(c) <
2088                     BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2089                         return 0;
2090                 if (ret < 0)
2091                         return ret;
2092         }
2093
2094         c_nritems = btrfs_header_nritems(c);
2095
2096         split = btrfs_alloc_free_block(trans, root, root->nodesize,
2097                                         path->nodes[level + 1]->start,
2098                                         root->root_key.objectid,
2099                                         trans->transid, level, c->start, 0);
2100         if (IS_ERR(split))
2101                 return PTR_ERR(split);
2102
2103         btrfs_set_header_flags(split, btrfs_header_flags(c));
2104         btrfs_set_header_level(split, btrfs_header_level(c));
2105         btrfs_set_header_bytenr(split, split->start);
2106         btrfs_set_header_generation(split, trans->transid);
2107         btrfs_set_header_owner(split, root->root_key.objectid);
2108         btrfs_set_header_flags(split, 0);
2109         write_extent_buffer(split, root->fs_info->fsid,
2110                             (unsigned long)btrfs_header_fsid(split),
2111                             BTRFS_FSID_SIZE);
2112         write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
2113                             (unsigned long)btrfs_header_chunk_tree_uuid(split),
2114                             BTRFS_UUID_SIZE);
2115
2116         mid = (c_nritems + 1) / 2;
2117
2118         copy_extent_buffer(split, c,
2119                            btrfs_node_key_ptr_offset(0),
2120                            btrfs_node_key_ptr_offset(mid),
2121                            (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2122         btrfs_set_header_nritems(split, c_nritems - mid);
2123         btrfs_set_header_nritems(c, mid);
2124         ret = 0;
2125
2126         btrfs_mark_buffer_dirty(c);
2127         btrfs_mark_buffer_dirty(split);
2128
2129         btrfs_node_key(split, &disk_key, 0);
2130         wret = insert_ptr(trans, root, path, &disk_key, split->start,
2131                           path->slots[level + 1] + 1,
2132                           level + 1);
2133         if (wret)
2134                 ret = wret;
2135
2136         ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
2137         BUG_ON(ret);
2138
2139         if (path->slots[level] >= mid) {
2140                 path->slots[level] -= mid;
2141                 btrfs_tree_unlock(c);
2142                 free_extent_buffer(c);
2143                 path->nodes[level] = split;
2144                 path->slots[level + 1] += 1;
2145         } else {
2146                 btrfs_tree_unlock(split);
2147                 free_extent_buffer(split);
2148         }
2149         return ret;
2150 }
2151
2152 /*
2153  * how many bytes are required to store the items in a leaf.  start
2154  * and nr indicate which items in the leaf to check.  This totals up the
2155  * space used both by the item structs and the item data
2156  */
2157 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2158 {
2159         int data_len;
2160         int nritems = btrfs_header_nritems(l);
2161         int end = min(nritems, start + nr) - 1;
2162
2163         if (!nr)
2164                 return 0;
2165         data_len = btrfs_item_end_nr(l, start);
2166         data_len = data_len - btrfs_item_offset_nr(l, end);
2167         data_len += sizeof(struct btrfs_item) * nr;
2168         WARN_ON(data_len < 0);
2169         return data_len;
2170 }
2171
2172 /*
2173  * The space between the end of the leaf items and
2174  * the start of the leaf data.  IOW, how much room
2175  * the leaf has left for both items and data
2176  */
2177 int noinline btrfs_leaf_free_space(struct btrfs_root *root,
2178                                    struct extent_buffer *leaf)
2179 {
2180         int nritems = btrfs_header_nritems(leaf);
2181         int ret;
2182         ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2183         if (ret < 0) {
2184                 printk("leaf free space ret %d, leaf data size %lu, used %d nritems %d\n",
2185                        ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2186                        leaf_space_used(leaf, 0, nritems), nritems);
2187         }
2188         return ret;
2189 }
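
/*
 * Hypothetical sketch (not part of the original file): the space check a
 * caller performs before inserting an item with 'data_size' bytes of data,
 * mirroring the ins_len test in btrfs_search_slot() -- the item data plus
 * one struct btrfs_item header must fit in the leaf's free area.
 */
static int __maybe_unused example_item_fits(struct btrfs_root *root,
                                            struct extent_buffer *leaf,
                                            u32 data_size)
{
        return btrfs_leaf_free_space(root, leaf) >=
               (int)(sizeof(struct btrfs_item) + data_size);
}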
2190
2191 /*
2192  * push some data in the path leaf to the right, trying to free up at
2193  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2194  *
2195  * returns 1 if the push failed because the other node didn't have enough
2196  * room, 0 if everything worked out and < 0 if there were major errors.
2197  */
2198 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2199                            *root, struct btrfs_path *path, int data_size,
2200                            int empty)
2201 {
2202         struct extent_buffer *left = path->nodes[0];
2203         struct extent_buffer *right;
2204         struct extent_buffer *upper;
2205         struct btrfs_disk_key disk_key;
2206         int slot;
2207         u32 i;
2208         int free_space;
2209         int push_space = 0;
2210         int push_items = 0;
2211         struct btrfs_item *item;
2212         u32 left_nritems;
2213         u32 nr;
2214         u32 right_nritems;
2215         u32 data_end;
2216         u32 this_item_size;
2217         int ret;
2218
2219         slot = path->slots[1];
2220         if (!path->nodes[1]) {
2221                 return 1;
2222         }
2223         upper = path->nodes[1];
2224         if (slot >= btrfs_header_nritems(upper) - 1)
2225                 return 1;
2226
2227         WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2228
2229         right = read_node_slot(root, upper, slot + 1);
2230         btrfs_tree_lock(right);
2231         free_space = btrfs_leaf_free_space(root, right);
2232         if (free_space < data_size + sizeof(struct btrfs_item))
2233                 goto out_unlock;
2234
2235         /* cow and double check */
2236         ret = btrfs_cow_block(trans, root, right, upper,
2237                               slot + 1, &right, 0);
2238         if (ret)
2239                 goto out_unlock;
2240
2241         free_space = btrfs_leaf_free_space(root, right);
2242         if (free_space < data_size + sizeof(struct btrfs_item))
2243                 goto out_unlock;
2244
2245         left_nritems = btrfs_header_nritems(left);
2246         if (left_nritems == 0)
2247                 goto out_unlock;
2248
2249         if (empty)
2250                 nr = 0;
2251         else
2252                 nr = 1;
2253
2254         if (path->slots[0] >= left_nritems)
2255                 push_space += data_size + sizeof(*item);
2256
2257         i = left_nritems - 1;
2258         while (i >= nr) {
2259                 item = btrfs_item_nr(left, i);
2260
2261                 if (!empty && push_items > 0) {
2262                         if (path->slots[0] > i)
2263                                 break;
2264                         if (path->slots[0] == i) {
2265                                 int space = btrfs_leaf_free_space(root, left);
2266                                 if (space + push_space * 2 > free_space)
2267                                         break;
2268                         }
2269                 }
2270
2271                 if (path->slots[0] == i)
2272                         push_space += data_size + sizeof(*item);
2273
2274                 if (!left->map_token) {
2275                         map_extent_buffer(left, (unsigned long)item,
2276                                         sizeof(struct btrfs_item),
2277                                         &left->map_token, &left->kaddr,
2278                                         &left->map_start, &left->map_len,
2279                                         KM_USER1);
2280                 }
2281
2282                 this_item_size = btrfs_item_size(left, item);
2283                 if (this_item_size + sizeof(*item) + push_space > free_space)
2284                         break;
2285
2286                 push_items++;
2287                 push_space += this_item_size + sizeof(*item);
2288                 if (i == 0)
2289                         break;
2290                 i--;
2291         }
2292         if (left->map_token) {
2293                 unmap_extent_buffer(left, left->map_token, KM_USER1);
2294                 left->map_token = NULL;
2295         }
2296
2297         if (push_items == 0)
2298                 goto out_unlock;
2299
2300         if (!empty && push_items == left_nritems)
2301                 WARN_ON(1);
2302
2303         /* push left to right */
2304         right_nritems = btrfs_header_nritems(right);
2305
2306         push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2307         push_space -= leaf_data_end(root, left);
2308
2309         /* make room in the right data area */
2310         data_end = leaf_data_end(root, right);
2311         memmove_extent_buffer(right,
2312                               btrfs_leaf_data(right) + data_end - push_space,
2313                               btrfs_leaf_data(right) + data_end,
2314                               BTRFS_LEAF_DATA_SIZE(root) - data_end);
2315
2316         /* copy from the left data area */
2317         copy_extent_buffer(right, left, btrfs_leaf_data(right) +
2318                      BTRFS_LEAF_DATA_SIZE(root) - push_space,
2319                      btrfs_leaf_data(left) + leaf_data_end(root, left),
2320                      push_space);
2321
2322         memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2323                               btrfs_item_nr_offset(0),
2324                               right_nritems * sizeof(struct btrfs_item));
2325
2326         /* copy the items from left to right */
2327         copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2328                    btrfs_item_nr_offset(left_nritems - push_items),
2329                    push_items * sizeof(struct btrfs_item));
2330
2331         /* update the item pointers */
2332         right_nritems += push_items;
2333         btrfs_set_header_nritems(right, right_nritems);
2334         push_space = BTRFS_LEAF_DATA_SIZE(root);
2335         for (i = 0; i < right_nritems; i++) {
2336                 item = btrfs_item_nr(right, i);
2337                 if (!right->map_token) {
2338                         map_extent_buffer(right, (unsigned long)item,
2339                                         sizeof(struct btrfs_item),
2340                                         &right->map_token, &right->kaddr,
2341                                         &right->map_start, &right->map_len,
2342                                         KM_USER1);
2343                 }
2344                 push_space -= btrfs_item_size(right, item);
2345                 btrfs_set_item_offset(right, item, push_space);
2346         }
2347
2348         if (right->map_token) {
2349                 unmap_extent_buffer(right, right->map_token, KM_USER1);
2350                 right->map_token = NULL;
2351         }
2352         left_nritems -= push_items;
2353         btrfs_set_header_nritems(left, left_nritems);
2354
2355         if (left_nritems)
2356                 btrfs_mark_buffer_dirty(left);
2357         btrfs_mark_buffer_dirty(right);
2358
2359         ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
2360         BUG_ON(ret);
2361
2362         btrfs_item_key(right, &disk_key, 0);
2363         btrfs_set_node_key(upper, &disk_key, slot + 1);
2364         btrfs_mark_buffer_dirty(upper);
2365
2366         /* then fixup the leaf pointer in the path */
2367         if (path->slots[0] >= left_nritems) {
2368                 path->slots[0] -= left_nritems;
2369                 if (btrfs_header_nritems(path->nodes[0]) == 0)
2370                         clean_tree_block(trans, root, path->nodes[0]);
2371                 btrfs_tree_unlock(path->nodes[0]);
2372                 free_extent_buffer(path->nodes[0]);
2373                 path->nodes[0] = right;
2374                 path->slots[1] += 1;
2375         } else {
2376                 btrfs_tree_unlock(right);
2377                 free_extent_buffer(right);
2378         }
2379         return 0;
2380
2381 out_unlock:
2382         btrfs_tree_unlock(right);
2383         free_extent_buffer(right);
2384         return 1;
2385 }
2386
2387 /*
2388  * push some data in the path leaf to the left, trying to free up at
2389  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2390  */
2391 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2392                           *root, struct btrfs_path *path, int data_size,
2393                           int empty)
2394 {
2395         struct btrfs_disk_key disk_key;
2396         struct extent_buffer *right = path->nodes[0];
2397         struct extent_buffer *left;
2398         int slot;
2399         int i;
2400         int free_space;
2401         int push_space = 0;
2402         int push_items = 0;
2403         struct btrfs_item *item;
2404         u32 old_left_nritems;
2405         u32 right_nritems;
2406         u32 nr;
2407         int ret = 0;
2408         int wret;
2409         u32 this_item_size;
2410         u32 old_left_item_size;
2411
2412         slot = path->slots[1];
2413         if (slot == 0)
2414                 return 1;
2415         if (!path->nodes[1])
2416                 return 1;
2417
2418         right_nritems = btrfs_header_nritems(right);
2419         if (right_nritems == 0) {
2420                 return 1;
2421         }
2422
2423         WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2424
2425         left = read_node_slot(root, path->nodes[1], slot - 1);
2426         btrfs_tree_lock(left);
2427         free_space = btrfs_leaf_free_space(root, left);
2428         if (free_space < data_size + sizeof(struct btrfs_item)) {
2429                 ret = 1;
2430                 goto out;
2431         }
2432
2433         /* cow and double check */
2434         ret = btrfs_cow_block(trans, root, left,
2435                               path->nodes[1], slot - 1, &left, 0);
2436         if (ret) {
2437                 /* we hit -ENOSPC, but it isn't fatal here */
2438                 ret = 1;
2439                 goto out;
2440         }
2441
2442         free_space = btrfs_leaf_free_space(root, left);
2443         if (free_space < data_size + sizeof(struct btrfs_item)) {
2444                 ret = 1;
2445                 goto out;
2446         }
2447
2448         if (empty)
2449                 nr = right_nritems;
2450         else
2451                 nr = right_nritems - 1;
2452
2453         for (i = 0; i < nr; i++) {
2454                 item = btrfs_item_nr(right, i);
2455                 if (!right->map_token) {
2456                         map_extent_buffer(right, (unsigned long)item,
2457                                         sizeof(struct btrfs_item),
2458                                         &right->map_token, &right->kaddr,
2459                                         &right->map_start, &right->map_len,
2460                                         KM_USER1);
2461                 }
2462
2463                 if (!empty && push_items > 0) {
2464                         if (path->slots[0] < i)
2465                                 break;
2466                         if (path->slots[0] == i) {
2467                                 int space = btrfs_leaf_free_space(root, right);
2468                                 if (space + push_space * 2 > free_space)
2469                                         break;
2470                         }
2471                 }
2472
2473                 if (path->slots[0] == i)
2474                         push_space += data_size + sizeof(*item);
2475
2476                 this_item_size = btrfs_item_size(right, item);
2477                 if (this_item_size + sizeof(*item) + push_space > free_space)
2478                         break;
2479
2480                 push_items++;
2481                 push_space += this_item_size + sizeof(*item);
2482         }
2483
2484         if (right->map_token) {
2485                 unmap_extent_buffer(right, right->map_token, KM_USER1);
2486                 right->map_token = NULL;
2487         }
2488
2489         if (push_items == 0) {
2490                 ret = 1;
2491                 goto out;
2492         }
2493         if (!empty && push_items == btrfs_header_nritems(right))
2494                 WARN_ON(1);
2495
2496         /* push data from right to left */
2497         copy_extent_buffer(left, right,
2498                            btrfs_item_nr_offset(btrfs_header_nritems(left)),
2499                            btrfs_item_nr_offset(0),
2500                            push_items * sizeof(struct btrfs_item));
2501
2502         push_space = BTRFS_LEAF_DATA_SIZE(root) -
2503                      btrfs_item_offset_nr(right, push_items - 1);
2504
2505         copy_extent_buffer(left, right, btrfs_leaf_data(left) +
2506                      leaf_data_end(root, left) - push_space,
2507                      btrfs_leaf_data(right) +
2508                      btrfs_item_offset_nr(right, push_items - 1),
2509                      push_space);
2510         old_left_nritems = btrfs_header_nritems(left);
2511         BUG_ON(old_left_nritems < 0);
2512
2513         old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2514         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2515                 u32 ioff;
2516
2517                 item = btrfs_item_nr(left, i);
2518                 if (!left->map_token) {
2519                         map_extent_buffer(left, (unsigned long)item,
2520                                         sizeof(struct btrfs_item),
2521                                         &left->map_token, &left->kaddr,
2522                                         &left->map_start, &left->map_len,
2523                                         KM_USER1);
2524                 }
2525
2526                 ioff = btrfs_item_offset(left, item);
2527                 btrfs_set_item_offset(left, item,
2528                       ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2529         }
2530         btrfs_set_header_nritems(left, old_left_nritems + push_items);
2531         if (left->map_token) {
2532                 unmap_extent_buffer(left, left->map_token, KM_USER1);
2533                 left->map_token = NULL;
2534         }
2535
2536         /* fixup right node */
2537         if (push_items > right_nritems) {
2538                 printk("push items %d nr %u\n", push_items, right_nritems);
2539                 WARN_ON(1);
2540         }
2541
2542         if (push_items < right_nritems) {
2543                 push_space = btrfs_item_offset_nr(right, push_items - 1) -
2544                                                   leaf_data_end(root, right);
2545                 memmove_extent_buffer(right, btrfs_leaf_data(right) +
2546                                       BTRFS_LEAF_DATA_SIZE(root) - push_space,
2547                                       btrfs_leaf_data(right) +
2548                                       leaf_data_end(root, right), push_space);
2549
2550                 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2551                               btrfs_item_nr_offset(push_items),
2552                              (btrfs_header_nritems(right) - push_items) *
2553                              sizeof(struct btrfs_item));
2554         }
2555         right_nritems -= push_items;
2556         btrfs_set_header_nritems(right, right_nritems);
2557         push_space = BTRFS_LEAF_DATA_SIZE(root);
2558         for (i = 0; i < right_nritems; i++) {
2559                 item = btrfs_item_nr(right, i);
2560
2561                 if (!right->map_token) {
2562                         map_extent_buffer(right, (unsigned long)item,
2563                                         sizeof(struct btrfs_item),
2564                                         &right->map_token, &right->kaddr,
2565                                         &right->map_start, &right->map_len,
2566                                         KM_USER1);
2567                 }
2568
2569                 push_space = push_space - btrfs_item_size(right, item);
2570                 btrfs_set_item_offset(right, item, push_space);
2571         }
2572         if (right->map_token) {
2573                 unmap_extent_buffer(right, right->map_token, KM_USER1);
2574                 right->map_token = NULL;
2575         }
2576
2577         btrfs_mark_buffer_dirty(left);
2578         if (right_nritems)
2579                 btrfs_mark_buffer_dirty(right);
2580
2581         ret = btrfs_update_ref(trans, root, right, left,
2582                                old_left_nritems, push_items);
2583         BUG_ON(ret);
2584
2585         btrfs_item_key(right, &disk_key, 0);
2586         wret = fixup_low_keys(trans, root, path, &disk_key, 1);
2587         if (wret)
2588                 ret = wret;
2589
2590         /* then fixup the leaf pointer in the path */
2591         if (path->slots[0] < push_items) {
2592                 path->slots[0] += old_left_nritems;
2593                 if (btrfs_header_nritems(path->nodes[0]) == 0)
2594                         clean_tree_block(trans, root, path->nodes[0]);
2595                 btrfs_tree_unlock(path->nodes[0]);
2596                 free_extent_buffer(path->nodes[0]);
2597                 path->nodes[0] = left;
2598                 path->slots[1] -= 1;
2599         } else {
2600                 btrfs_tree_unlock(left);
2601                 free_extent_buffer(left);
2602                 path->slots[0] -= push_items;
2603         }
2604         BUG_ON(path->slots[0] < 0);
2605         return ret;
2606 out:
2607         btrfs_tree_unlock(left);
2608         free_extent_buffer(left);
2609         return ret;
2610 }
2611
2612 /*
2613  * split the path's leaf in two, making sure there is at least data_size
2614  * available for the resulting leaf level of the path.
2615  *
2616  * returns 0 if all went well and < 0 on failure.
2617  */
2618 static noinline int split_leaf(struct btrfs_trans_handle *trans,
2619                                struct btrfs_root *root,
2620                                struct btrfs_key *ins_key,
2621                                struct btrfs_path *path, int data_size,
2622                                int extend)
2623 {
2624         struct extent_buffer *l;
2625         u32 nritems;
2626         int mid;
2627         int slot;
2628         struct extent_buffer *right;
2629         int space_needed = data_size + sizeof(struct btrfs_item);
2630         int data_copy_size;
2631         int rt_data_off;
2632         int i;
2633         int ret = 0;
2634         int wret;
2635         int double_split;
2636         int num_doubles = 0;
2637         struct btrfs_disk_key disk_key;
2638
2639         if (extend)
2640                 space_needed = data_size;
2641
2642         /* first try to make some room by pushing left and right */
2643         if (ins_key->type != BTRFS_DIR_ITEM_KEY) {
2644                 wret = push_leaf_right(trans, root, path, data_size, 0);
2645                 if (wret < 0) {
2646                         return wret;
2647                 }
2648                 if (wret) {
2649                         wret = push_leaf_left(trans, root, path, data_size, 0);
2650                         if (wret < 0)
2651                                 return wret;
2652                 }
2653                 l = path->nodes[0];
2654
2655                 /* did the pushes work? */
2656                 if (btrfs_leaf_free_space(root, l) >= space_needed)
2657                         return 0;
2658         }
2659
2660         if (!path->nodes[1]) {
2661                 ret = insert_new_root(trans, root, path, 1);
2662                 if (ret)
2663                         return ret;
2664         }
2665 again:
2666         double_split = 0;
2667         l = path->nodes[0];
2668         slot = path->slots[0];
2669         nritems = btrfs_header_nritems(l);
2670         mid = (nritems + 1) / 2;
2671
2672         right = btrfs_alloc_free_block(trans, root, root->leafsize,
2673                                         path->nodes[1]->start,
2674                                         root->root_key.objectid,
2675                                         trans->transid, 0, l->start, 0);
2676         if (IS_ERR(right)) {
2677                 BUG_ON(1);
2678                 return PTR_ERR(right);
2679         }
2680
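        /* initialize the header of the new (still empty) right leaf */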
2681         memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2682         btrfs_set_header_bytenr(right, right->start);
2683         btrfs_set_header_generation(right, trans->transid);
2684         btrfs_set_header_owner(right, root->root_key.objectid);
2685         btrfs_set_header_level(right, 0);
2686         write_extent_buffer(right, root->fs_info->fsid,
2687                             (unsigned long)btrfs_header_fsid(right),
2688                             BTRFS_FSID_SIZE);
2689
2690         write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2691                             (unsigned long)btrfs_header_chunk_tree_uuid(right),
2692                             BTRFS_UUID_SIZE);
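        /*
         * decide where to split.  The default is the middle of the leaf,
         * but if the half that would receive the new item still can't
         * hold it we split at the insertion slot instead.  The worst
         * cases below are handled by linking in a brand new empty leaf
         * for the new item, or by doing a second split.
         */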
2693         if (mid <= slot) {
2694                 if (nritems == 1 ||
2695                     leaf_space_used(l, mid, nritems - mid) + space_needed >
2696                         BTRFS_LEAF_DATA_SIZE(root)) {
2697                         if (slot >= nritems) {
2698                                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2699                                 btrfs_set_header_nritems(right, 0);
2700                                 wret = insert_ptr(trans, root, path,
2701                                                   &disk_key, right->start,
2702                                                   path->slots[1] + 1, 1);
2703                                 if (wret)
2704                                         ret = wret;
2705
2706                                 btrfs_tree_unlock(path->nodes[0]);
2707                                 free_extent_buffer(path->nodes[0]);
2708                                 path->nodes[0] = right;
2709                                 path->slots[0] = 0;
2710                                 path->slots[1] += 1;
2711                                 btrfs_mark_buffer_dirty(right);
2712                                 return ret;
2713                         }
2714                         mid = slot;
2715                         if (mid != nritems &&
2716                             leaf_space_used(l, mid, nritems - mid) +
2717                             space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
2718                                 double_split = 1;
2719                         }
2720                 }
2721         } else {
2722                 if (leaf_space_used(l, 0, mid + 1) + space_needed >
2723                         BTRFS_LEAF_DATA_SIZE(root)) {
2724                         if (!extend && slot == 0) {
2725                                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2726                                 btrfs_set_header_nritems(right, 0);
2727                                 wret = insert_ptr(trans, root, path,
2728                                                   &disk_key,
2729                                                   right->start,
2730                                                   path->slots[1], 1);
2731                                 if (wret)
2732                                         ret = wret;
2733                                 btrfs_tree_unlock(path->nodes[0]);
2734                                 free_extent_buffer(path->nodes[0]);
2735                                 path->nodes[0] = right;
2736                                 path->slots[0] = 0;
2737                                 if (path->slots[1] == 0) {
2738                                         wret = fixup_low_keys(trans, root,
2739                                                    path, &disk_key, 1);
2740                                         if (wret)
2741                                                 ret = wret;
2742                                 }
2743                                 btrfs_mark_buffer_dirty(right);
2744                                 return ret;
2745                         } else if (extend && slot == 0) {
2746                                 mid = 1;
2747                         } else {
2748                                 mid = slot;
2749                                 if (mid != nritems &&
2750                                     leaf_space_used(l, mid, nritems - mid) +
2751                                     space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
2752                                         double_split = 1;
2753                                 }
2754                         }
2755                 }
2756         }
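        /* copy items [mid, nritems) and their data into the new right leaf */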
2757         nritems = nritems - mid;
2758         btrfs_set_header_nritems(right, nritems);
2759         data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2760
2761         copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2762                            btrfs_item_nr_offset(mid),
2763                            nritems * sizeof(struct btrfs_item));
2764
2765         copy_extent_buffer(right, l,
2766                      btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2767                      data_copy_size, btrfs_leaf_data(l) +
2768                      leaf_data_end(root, l), data_copy_size);
2769
2770         rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2771                       btrfs_item_end_nr(l, mid);
2772
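        /*
         * the copied data now sits at the very end of the right leaf,
         * rebase each copied item's data offset to match
         */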
2773         for (i = 0; i < nritems; i++) {
2774                 struct btrfs_item *item = btrfs_item_nr(right, i);
2775                 u32 ioff;
2776
2777                 if (!right->map_token) {
2778                         map_extent_buffer(right, (unsigned long)item,
2779                                         sizeof(struct btrfs_item),
2780                                         &right->map_token, &right->kaddr,
2781                                         &right->map_start, &right->map_len,
2782                                         KM_USER1);
2783                 }
2784
2785                 ioff = btrfs_item_offset(right, item);
2786                 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2787         }
2788
2789         if (right->map_token) {
2790                 unmap_extent_buffer(right, right->map_token, KM_USER1);
2791                 right->map_token = NULL;
2792         }
2793
2794         btrfs_set_header_nritems(l, mid);
2795         ret = 0;
2796         btrfs_item_key(right, &disk_key, 0);
2797         wret = insert_ptr(trans, root, path, &disk_key, right->start,
2798                           path->slots[1] + 1, 1);
2799         if (wret)
2800                 ret = wret;
2801
2802         btrfs_mark_buffer_dirty(right);
2803         btrfs_mark_buffer_dirty(l);
2804         BUG_ON(path->slots[0] != slot);
2805
2806         ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2807         BUG_ON(ret);
2808
2809         if (mid <= slot) {
2810                 btrfs_tree_unlock(path->nodes[0]);
2811                 free_extent_buffer(path->nodes[0]);
2812                 path->nodes[0] = right;
2813                 path->slots[0] -= mid;
2814                 path->slots[1] += 1;
2815         } else {
2816                 btrfs_tree_unlock(right);
2817                 free_extent_buffer(right);
2818         }
2819
2820         BUG_ON(path->slots[0] < 0);
2821
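        /*
         * a single split was not enough to make room, split the leaf
         * the path now points to one more time
         */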
2822         if (double_split) {
2823                 BUG_ON(num_doubles != 0);
2824                 num_doubles++;
2825                 goto again;
2826         }
2827         return ret;
2828 }
2829
2830 /*
2831  * make the item pointed to by the path smaller.  new_size indicates
2832  * how small to make it, and from_end tells us if we just chop bytes
2833  * off the end of the item or if we shift the item to chop bytes off
2834  * the front.
2835  */
2836 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
2837                         struct btrfs_root *root,
2838                         struct btrfs_path *path,
2839                         u32 new_size, int from_end)
2840 {
2841         int ret = 0;
2842         int slot;
2843         int slot_orig;
2844         struct extent_buffer *leaf;
2845         struct btrfs_item *item;
2846         u32 nritems;
2847         unsigned int data_end;
2848         unsigned int old_data_start;
2849         unsigned int old_size;
2850         unsigned int size_diff;
2851         int i;
2852
2853         slot_orig = path->slots[0];
2854         leaf = path->nodes[0];
2855         slot = path->slots[0];
2856
2857         old_size = btrfs_item_size_nr(leaf, slot);
2858         if (old_size == new_size)
2859                 return 0;
2860
2861         nritems = btrfs_header_nritems(leaf);
2862         data_end = leaf_data_end(root, leaf);
2863
2864         old_data_start = btrfs_item_offset_nr(leaf, slot);
2865
2866         size_diff = old_size - new_size;
2867
2868         BUG_ON(slot < 0);
2869         BUG_ON(slot >= nritems);
2870
2871         /*
2872          * item0..itemN ... dataN.offset..dataN.size .. data0.size
2873          */
2874         /* first correct the data pointers */
2875         for (i = slot; i < nritems; i++) {
2876                 u32 ioff;
2877                 item = btrfs_item_nr(leaf, i);
2878
2879                 if (!leaf->map_token) {
2880                         map_extent_buffer(leaf, (unsigned long)item,
2881                                         sizeof(struct btrfs_item),
2882                                         &leaf->map_token, &leaf->kaddr,
2883                                         &leaf->map_start, &leaf->map_len,
2884                                         KM_USER1);
2885                 }
2886
2887                 ioff = btrfs_item_offset(leaf, item);
2888                 btrfs_set_item_offset(leaf, item, ioff + size_diff);
2889         }
2890
2891         if (leaf->map_token) {
2892                 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2893                 leaf->map_token = NULL;
2894         }
2895
2896         /* shift the data */
2897         if (from_end) {
2898                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2899                               data_end + size_diff, btrfs_leaf_data(leaf) +
2900                               data_end, old_data_start + new_size - data_end);
2901         } else {
2902                 struct btrfs_disk_key disk_key;
2903                 u64 offset;
2904
2905                 btrfs_item_key(leaf, &disk_key, slot);
2906
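                /*
                 * inline file extents only keep the fields up to
                 * disk_bytenr as a header, the inline data follows
                 * right after.  Copy the header up to the new start of
                 * the item before the front bytes are dropped.
                 */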
2907                 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
2908                         unsigned long ptr;
2909                         struct btrfs_file_extent_item *fi;
2910
2911                         fi = btrfs_item_ptr(leaf, slot,
2912                                             struct btrfs_file_extent_item);
2913                         fi = (struct btrfs_file_extent_item *)(
2914                              (unsigned long)fi - size_diff);
2915
2916                         if (btrfs_file_extent_type(leaf, fi) ==
2917                             BTRFS_FILE_EXTENT_INLINE) {
2918                                 ptr = btrfs_item_ptr_offset(leaf, slot);
2919                                 memmove_extent_buffer(leaf, ptr,
2920                                         (unsigned long)fi,
2921                                         offsetof(struct btrfs_file_extent_item,
2922                                                  disk_bytenr));
2923                         }
2924                 }
2925
2926                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2927                               data_end + size_diff, btrfs_leaf_data(leaf) +
2928                               data_end, old_data_start - data_end);
2929
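                /*
                 * chopping bytes off the front moves the logical start
                 * of the item forward, so the key offset has to move
                 * with it
                 */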
2930                 offset = btrfs_disk_key_offset(&disk_key);
2931                 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
2932                 btrfs_set_item_key(leaf, &disk_key, slot);
2933                 if (slot == 0)
2934                         fixup_low_keys(trans, root, path, &disk_key, 1);
2935         }
2936
2937         item = btrfs_item_nr(leaf, slot);
2938         btrfs_set_item_size(leaf, item, new_size);
2939         btrfs_mark_buffer_dirty(leaf);
2940
2941         ret = 0;
2942         if (btrfs_leaf_free_space(root, leaf) < 0) {
2943                 btrfs_print_leaf(root, leaf);
2944                 BUG();
2945         }
2946         return ret;
2947 }
2948
2949 /*
2950  * make the item pointed to by the path bigger, data_size is the new size.
2951  */
2952 int btrfs_extend_item(struct btrfs_trans_handle *trans,
2953                       struct btrfs_root *root, struct btrfs_path *path,
2954                       u32 data_size)
2955 {
2956         int ret = 0;
2957         int slot;
2958         int slot_orig;
2959         struct extent_buffer *leaf;
2960         struct btrfs_item *item;
2961         u32 nritems;
2962         unsigned int data_end;
2963         unsigned int old_data;
2964         unsigned int old_size;
2965         int i;
2966
2967         slot_orig = path->slots[0];
2968         leaf = path->nodes[0];
2969
2970         nritems = btrfs_header_nritems(leaf);
2971         data_end = leaf_data_end(root, leaf);
2972
2973         if (btrfs_leaf_free_space(root, leaf) < data_size) {
2974                 btrfs_print_leaf(root, leaf);
2975                 BUG();
2976         }
2977         slot = path->slots[0];
2978         old_data = btrfs_item_end_nr(leaf, slot);
2979
2980         BUG_ON(slot < 0);
2981         if (slot >= nritems) {
2982                 btrfs_print_leaf(root, leaf);
2983                 printk("slot %d too large, nritems %d\n", slot, nritems);
2984                 BUG_ON(1);
2985         }
2986
2987         /*
2988          * item0..itemN ... dataN.offset..dataN.size .. data0.size
2989          */
2990         /* first correct the data pointers */
2991         for (i = slot; i < nritems; i++) {
2992                 u32 ioff;
2993                 item = btrfs_item_nr(leaf, i);
2994
2995                 if (!leaf->map_token) {
2996                         map_extent_buffer(leaf, (unsigned long)item,
2997                                         sizeof(struct btrfs_item),
2998                                         &leaf->map_token, &leaf->kaddr,
2999                                         &leaf->map_start, &leaf->map_len,
3000                                         KM_USER1);
3001                 }
3002                 ioff = btrfs_item_offset(leaf, item);
3003                 btrfs_set_item_offset(leaf, item, ioff - data_size);
3004         }
3005
3006         if (leaf->map_token) {
3007                 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3008                 leaf->map_token = NULL;
3009         }
3010
3011         /* shift the data */
3012         memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3013                       data_end - data_size, btrfs_leaf_data(leaf) +
3014                       data_end, old_data - data_end);
3015
3016         data_end = old_data;
3017         old_size = btrfs_item_size_nr(leaf, slot);
3018         item = btrfs_item_nr(leaf, slot);
3019         btrfs_set_item_size(leaf, item, old_size + data_size);
3020         btrfs_mark_buffer_dirty(leaf);
3021
3022         ret = 0;
3023         if (btrfs_leaf_free_space(root, leaf) < 0) {
3024                 btrfs_print_leaf(root, leaf);
3025                 BUG();
3026         }
3027         return ret;
3028 }
3029
3030 /*
3031  * Given a key and some data, insert items into the tree.
3032  * This does all the path init required, making room in the tree if needed.
3033  * Returns the number of keys that were inserted.
3034  */
3035 int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3036                             struct btrfs_root *root,
3037                             struct btrfs_path *path,
3038                             struct btrfs_key *cpu_key, u32 *data_size,
3039                             int nr)
3040 {
3041         struct extent_buffer *leaf;
3042         struct btrfs_item *item;
3043         int ret = 0;
3044         int slot;
3045         int i;
3046         u32 nritems;
3047         u32 total_data = 0;
3048         u32 total_size = 0;
3049         unsigned int data_end;
3050         struct btrfs_disk_key disk_key;
3051         struct btrfs_key found_key;
3052
3053         found_key.objectid = 0;
3054         nr = min_t(int, nr, BTRFS_NODEPTRS_PER_BLOCK(root));
3055
3056         for (i = 0; i < nr; i++)
3057                 total_data += data_size[i];
3058
3059         total_data = min_t(u32, total_data, BTRFS_LEAF_DATA_SIZE(root));
3060         total_size = total_data + (nr * sizeof(struct btrfs_item));
3061         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3062         if (ret == 0)
3063                 return -EEXIST;
3064         if (ret < 0)
3065                 goto out;
3066
3067         leaf = path->nodes[0];
3068
3069         nritems = btrfs_header_nritems(leaf);
3070         data_end = leaf_data_end(root, leaf);
3071
3072         if (btrfs_leaf_free_space(root, leaf) < total_size) {
3073                 for (i = nr - 1; i >= 0; i--) {
3074                         total_data -= data_size[i];
3075                         total_size -= data_size[i] + sizeof(struct btrfs_item);
3076                         if (total_size < btrfs_leaf_free_space(root, leaf))
3077                                 break;
3078                 }
3079                 nr = i;
3080         }
3081
3082         slot = path->slots[0];
3083         BUG_ON(slot < 0);
3084
3085         if (slot != nritems) {
3086                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3087
3088                 item = btrfs_item_nr(leaf, slot);
3089                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3090
3091                 /* figure out how many keys we can insert in here */
3092                 total_data = data_size[0];
3093                 for (i = 1; i < nr; i++) {
3094                         if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3095                                 break;
3096                         total_data += data_size[i];
3097                 }
3098                 nr = i;
3099
3100                 if (old_data < data_end) {
3101                         btrfs_print_leaf(root, leaf);
3102                         printk("slot %d old_data %d data_end %d\n",
3103                                slot, old_data, data_end);
3104                         BUG_ON(1);
3105                 }
3106                 /*
3107                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
3108                  */
3109                 /* first correct the data pointers */
3110                 WARN_ON(leaf->map_token);
3111                 for (i = slot; i < nritems; i++) {
3112                         u32 ioff;
3113
3114                         item = btrfs_item_nr(leaf, i);
3115                         if (!leaf->map_token) {
3116                                 map_extent_buffer(leaf, (unsigned long)item,
3117                                         sizeof(struct btrfs_item),
3118                                         &leaf->map_token, &leaf->kaddr,
3119                                         &leaf->map_start, &leaf->map_len,
3120                                         KM_USER1);
3121                         }
3122
3123                         ioff = btrfs_item_offset(leaf, item);
3124                         btrfs_set_item_offset(leaf, item, ioff - total_data);
3125                 }
3126                 if (leaf->map_token) {
3127                         unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3128                         leaf->map_token = NULL;
3129                 }
3130
3131                 /* shift the items */
3132                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3133                               btrfs_item_nr_offset(slot),
3134                               (nritems - slot) * sizeof(struct btrfs_item));
3135
3136                 /* shift the data */
3137                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3138                               data_end - total_data, btrfs_leaf_data(leaf) +
3139                               data_end, old_data - data_end);
3140                 data_end = old_data;
3141         } else {
3142                 /*
3143                  * this sucks but it has to be done: if we are inserting at
3144                  * the end of the leaf, only insert 1 of the items, since we
3145                  * have no way of knowing what's on the next leaf and we'd have
3146                  * to drop our current locks to figure it out
3147                  */
3148                 nr = 1;
3149         }
3150
3151         /* setup the item for the new data */
3152         for (i = 0; i < nr; i++) {
3153                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3154                 btrfs_set_item_key(leaf, &disk_key, slot + i);
3155                 item = btrfs_item_nr(leaf, slot + i);
3156                 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3157                 data_end -= data_size[i];
3158                 btrfs_set_item_size(leaf, item, data_size[i]);
3159         }
3160         btrfs_set_header_nritems(leaf, nritems + nr);
3161         btrfs_mark_buffer_dirty(leaf);
3162
3163         ret = 0;
3164         if (slot == 0) {
3165                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3166                 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3167         }
3168
3169         if (btrfs_leaf_free_space(root, leaf) < 0) {
3170                 btrfs_print_leaf(root, leaf);
3171                 BUG();
3172         }
3173 out:
3174         if (!ret)
3175                 ret = nr;
3176         return ret;
3177 }
3178
3179 /*
3180  * Given a key and some data, insert items into the tree.
3181  * This does all the path init required, making room in the tree if needed.
3182  */
3183 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3184                             struct btrfs_root *root,
3185                             struct btrfs_path *path,
3186                             struct btrfs_key *cpu_key, u32 *data_size,
3187                             int nr)
3188 {
3189         struct extent_buffer *leaf;
3190         struct btrfs_item *item;
3191         int ret = 0;
3192         int slot;
3193         int slot_orig;
3194         int i;
3195         u32 nritems;
3196         u32 total_size = 0;
3197         u32 total_data = 0;
3198         unsigned int data_end;
3199         struct btrfs_disk_key disk_key;
3200
3201         for (i = 0; i < nr; i++) {
3202                 total_data += data_size[i];
3203         }
3204
3205         total_size = total_data + (nr * sizeof(struct btrfs_item));
3206         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3207         if (ret == 0)
3208                 return -EEXIST;
3209         if (ret < 0)
3210                 goto out;
3211
3212         slot_orig = path->slots[0];
3213         leaf = path->nodes[0];
3214
3215         nritems = btrfs_header_nritems(leaf);
3216         data_end = leaf_data_end(root, leaf);
3217
3218         if (btrfs_leaf_free_space(root, leaf) < total_size) {
3219                 btrfs_print_leaf(root, leaf);
3220                 printk("not enough free space, need %u have %d\n",
3221                        total_size, btrfs_leaf_free_space(root, leaf));
3222                 BUG();
3223         }
3224
3225         slot = path->slots[0];
3226         BUG_ON(slot < 0);
3227
3228         if (slot != nritems) {
3229                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3230
3231                 if (old_data < data_end) {
3232                         btrfs_print_leaf(root, leaf);
3233                         printk("slot %d old_data %d data_end %d\n",
3234                                slot, old_data, data_end);
3235                         BUG_ON(1);
3236                 }
3237                 /*
3238                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
3239                  */
3240                 /* first correct the data pointers */
3241                 WARN_ON(leaf->map_token);
3242                 for (i = slot; i < nritems; i++) {
3243                         u32 ioff;
3244
3245                         item = btrfs_item_nr(leaf, i);
3246                         if (!leaf->map_token) {
3247                                 map_extent_buffer(leaf, (unsigned long)item,
3248                                         sizeof(struct btrfs_item),
3249                                         &leaf->map_token, &leaf->kaddr,
3250                                         &leaf->map_start, &leaf->map_len,
3251                                         KM_USER1);
3252                         }
3253
3254                         ioff = btrfs_item_offset(leaf, item);
3255                         btrfs_set_item_offset(leaf, item, ioff - total_data);
3256                 }
3257                 if (leaf->map_token) {
3258                         unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3259                         leaf->map_token = NULL;
3260                 }
3261
3262                 /* shift the items */
3263                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3264                               btrfs_item_nr_offset(slot),
3265                               (nritems - slot) * sizeof(struct btrfs_item));
3266
3267                 /* shift the data */
3268                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3269                               data_end - total_data, btrfs_leaf_data(leaf) +
3270                               data_end, old_data - data_end);
3271                 data_end = old_data;
3272         }
3273
3274         /* setup the item for the new data */
3275         for (i = 0; i < nr; i++) {
3276                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3277                 btrfs_set_item_key(leaf, &disk_key, slot + i);
3278                 item = btrfs_item_nr(leaf, slot + i);
3279                 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3280                 data_end -= data_size[i];
3281                 btrfs_set_item_size(leaf, item, data_size[i]);
3282         }
3283         btrfs_set_header_nritems(leaf, nritems + nr);
3284         btrfs_mark_buffer_dirty(leaf);
3285
3286         ret = 0;
3287         if (slot == 0) {
3288                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3289                 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3290         }
3291
3292         if (btrfs_leaf_free_space(root, leaf) < 0) {
3293                 btrfs_print_leaf(root, leaf);
3294                 BUG();
3295         }
3296 out:
3297         return ret;
3298 }
3299
3300 /*
3301  * Given a key and some data, insert an item into the tree.
3302  * This does all the path init required, making room in the tree if needed.
3303  */
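/*
 * A rough usage sketch (illustrative only, the key type and the data
 * variables are made up for the example):
 *
 *      struct btrfs_key key;
 *
 *      key.objectid = objectid;
 *      key.type = BTRFS_XATTR_ITEM_KEY;
 *      key.offset = 0;
 *      ret = btrfs_insert_item(trans, root, &key, data, data_len);
 */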
3304 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3305                       *root, struct btrfs_key *cpu_key, void *data, u32
3306                       data_size)
3307 {
3308         int ret = 0;
3309         struct btrfs_path *path;
3310         struct extent_buffer *leaf;
3311         unsigned long ptr;
3312
3313         path = btrfs_alloc_path();
3314         BUG_ON(!path);
3315         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3316         if (!ret) {
3317                 leaf = path->nodes[0];
3318                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3319                 write_extent_buffer(leaf, data, ptr, data_size);
3320                 btrfs_mark_buffer_dirty(leaf);
3321         }
3322         btrfs_free_path(path);
3323         return ret;
3324 }
3325
3326 /*
3327  * delete the pointer from a given node.
3328  *
3329  * the tree should have been previously balanced so the deletion does not
3330  * empty a node.
3331  */
3332 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3333                    struct btrfs_path *path, int level, int slot)
3334 {
3335         struct extent_buffer *parent = path->nodes[level];
3336         u32 nritems;
3337         int ret = 0;
3338         int wret;
3339
3340         nritems = btrfs_header_nritems(parent);
3341         if (slot != nritems - 1) {
3342                 memmove_extent_buffer(parent,
3343                               btrfs_node_key_ptr_offset(slot),
3344                               btrfs_node_key_ptr_offset(slot + 1),
3345                               sizeof(struct btrfs_key_ptr) *
3346                               (nritems - slot - 1));
3347         }
3348         nritems--;
3349         btrfs_set_header_nritems(parent, nritems);
3350         if (nritems == 0 && parent == root->node) {
3351                 BUG_ON(btrfs_header_level(root->node) != 1);
3352                 /* just turn the root into a leaf and break */
3353                 btrfs_set_header_level(root->node, 0);
3354         } else if (slot == 0) {
3355                 struct btrfs_disk_key disk_key;
3356
3357                 btrfs_node_key(parent, &disk_key, 0);
3358                 wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
3359                 if (wret)
3360                         ret = wret;
3361         }
3362         btrfs_mark_buffer_dirty(parent);
3363         return ret;
3364 }
3365
3366 /*
3367  * a helper function to delete the leaf pointed to by path->slots[1] and
3368  * path->nodes[1].  bytenr is the node block pointer, but since the callers
3369  * already know it, it is faster to have them pass it down than to
3370  * read it out of the node again.
3371  *
3372  * This deletes the pointer in path->nodes[1] and frees the leaf
3373  * block extent.  zero is returned if it all worked out, < 0 otherwise.
3374  *
3375  * The path must have already been setup for deleting the leaf, including
3376  * all the proper balancing.  path->nodes[1] must be locked.
3377  */
3378 noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3379                             struct btrfs_root *root,
3380                             struct btrfs_path *path, u64 bytenr)
3381 {
3382         int ret;
3383         u64 root_gen = btrfs_header_generation(path->nodes[1]);
3384
3385         ret = del_ptr(trans, root, path, 1, path->slots[1]);
3386         if (ret)
3387                 return ret;
3388
3389         ret = btrfs_free_extent(trans, root, bytenr,
3390                                 btrfs_level_size(root, 0),
3391                                 path->nodes[1]->start,
3392                                 btrfs_header_owner(path->nodes[1]),
3393                                 root_gen, 0, 1);
3394         return ret;
3395 }
3396 /*
3397  * delete the item at the leaf level in path.  If that empties
3398  * the leaf, remove it from the tree
3399  */
3400 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3401                     struct btrfs_path *path, int slot, int nr)
3402 {
3403         struct extent_buffer *leaf;
3404         struct btrfs_item *item;
3405         int last_off;
3406         int dsize = 0;
3407         int ret = 0;
3408         int wret;
3409         int i;
3410         u32 nritems;
3411
3412         leaf = path->nodes[0];
3413         last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
3414
3415         for (i = 0; i < nr; i++)
3416                 dsize += btrfs_item_size_nr(leaf, slot + i);
3417
3418         nritems = btrfs_header_nritems(leaf);
3419
3420         if (slot + nr != nritems) {
3421                 int data_end = leaf_data_end(root, leaf);
3422
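                /*
                 * slide the data belonging to the items after the deleted
                 * range over the hole left by the deleted items' data
                 */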
3423                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3424                               data_end + dsize,
3425                               btrfs_leaf_data(leaf) + data_end,
3426                               last_off - data_end);
3427
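                /* and bump those items' data offsets by the bytes removed */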
3428                 for (i = slot + nr; i < nritems; i++) {
3429                         u32 ioff;
3430
3431                         item = btrfs_item_nr(leaf, i);
3432                         if (!leaf->map_token) {
3433                                 map_extent_buffer(leaf, (unsigned long)item,
3434                                         sizeof(struct btrfs_item),
3435                                         &leaf->map_token, &leaf->kaddr,
3436                                         &leaf->map_start, &leaf->map_len,
3437                                         KM_USER1);
3438                         }
3439                         ioff = btrfs_item_offset(leaf, item);
3440                         btrfs_set_item_offset(leaf, item, ioff + dsize);
3441                 }
3442
3443                 if (leaf->map_token) {
3444                         unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3445                         leaf->map_token = NULL;
3446                 }
3447
3448                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3449                               btrfs_item_nr_offset(slot + nr),
3450                               sizeof(struct btrfs_item) *
3451                               (nritems - slot - nr));
3452         }
3453         btrfs_set_header_nritems(leaf, nritems - nr);
3454         nritems -= nr;
3455
3456         /* delete the leaf if we've emptied it */
3457         if (nritems == 0) {
3458                 if (leaf == root->node) {
3459                         btrfs_set_header_level(leaf, 0);
3460                 } else {
3461                         ret = btrfs_del_leaf(trans, root, path, leaf->start);
3462                         BUG_ON(ret);
3463                 }
3464         } else {
3465                 int used = leaf_space_used(leaf, 0, nritems);
3466                 if (slot == 0) {
3467                         struct btrfs_disk_key disk_key;
3468
3469                         btrfs_item_key(leaf, &disk_key, 0);
3470                         wret = fixup_low_keys(trans, root, path,
3471                                               &disk_key, 1);
3472                         if (wret)
3473                                 ret = wret;
3474                 }
3475
3476                 /* delete the leaf if it is mostly empty */
3477                 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
3478                         /* push_leaf_left fixes the path.
3479                          * make sure the path still points to our leaf
3480                          * for possible call to del_ptr below
3481                          */
3482                         slot = path->slots[1];
3483                         extent_buffer_get(leaf);
3484
3485                         wret = push_leaf_left(trans, root, path, 1, 1);
3486                         if (wret < 0 && wret != -ENOSPC)
3487                                 ret = wret;
3488
3489                         if (path->nodes[0] == leaf &&
3490                             btrfs_header_nritems(leaf)) {
3491                                 wret = push_leaf_right(trans, root, path, 1, 1);
3492                                 if (wret < 0 && wret != -ENOSPC)
3493                                         ret = wret;
3494                         }
3495
3496                         if (btrfs_header_nritems(leaf) == 0) {
3497                                 path->slots[1] = slot;
3498                                 ret = btrfs_del_leaf(trans, root, path, leaf->start);
3499                                 BUG_ON(ret);
3500                                 free_extent_buffer(leaf);
3501                         } else {
3502                                 /* if we're still in the path, make sure
3503                                  * we're dirty.  Otherwise, one of the
3504                                  * push_leaf functions must have already
3505                                  * dirtied this buffer
3506                                  */
3507                                 if (path->nodes[0] == leaf)
3508                                         btrfs_mark_buffer_dirty(leaf);
3509                                 free_extent_buffer(leaf);
3510                         }
3511                 } else {
3512                         btrfs_mark_buffer_dirty(leaf);
3513                 }
3514         }
3515         return ret;
3516 }
3517
3518 /*
3519  * search the tree again to find a leaf with lesser keys
3520  * returns 0 if it found something or 1 if there are no lesser leaves.
3521  * returns < 0 on io errors.
3522  *
3523  * This may release the path, and so you may lose any locks held at the
3524  * time you call it.
3525  */
3526 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3527 {
3528         struct btrfs_key key;
3529         struct btrfs_disk_key found_key;
3530         int ret;
3531
3532         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3533
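        /*
         * build a key that sorts before the first key in this leaf.  If
         * that key is already the smallest possible one there is no
         * previous leaf.
         */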
3534         if (key.offset > 0)
3535                 key.offset--;
3536         else if (key.type > 0)
3537                 key.type--;
3538         else if (key.objectid > 0)
3539                 key.objectid--;
3540         else
3541                 return 1;
3542
3543         btrfs_release_path(root, path);
3544         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3545         if (ret < 0)
3546                 return ret;
3547         btrfs_item_key(path->nodes[0], &found_key, 0);
3548         ret = comp_keys(&found_key, &key);
3549         if (ret < 0)
3550                 return 0;
3551         return 1;
3552 }
3553
3554 /*
3555  * A helper function to walk down the tree starting at min_key, looking for
3556  * nodes or leaves that are in cache (when cache_only is set) and newer than
3557  * a minimum transaction id.  This is used by the btree defrag code and tree logging
3558  *
3559  * This does not cow, but it does stuff the starting key it finds back
3560  * into min_key, so you can call btrfs_search_slot with cow=1 on the
3561  * key and get a writable path.
3562  *
3563  * This does lock as it descends, and path->keep_locks should be set
3564  * to 1 by the caller.
3565  *
3566  * This honors path->lowest_level to prevent descent past a given level
3567  * of the tree.
3568  *
3569  * min_trans indicates the oldest transaction that you are interested
3570  * in walking through.  Any nodes or leaves older than min_trans are
3571  * skipped over (without reading them).
3572  *
3573  * returns zero if something useful was found, < 0 on error and 1 if there
3574  * was nothing in the tree that matched the search criteria.
3575  */
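/*
 * A caller typically loops over the tree roughly like this (a sketch
 * only, the processing and key-advancing steps are left out):
 *
 *      path->keep_locks = 1;
 *      while (1) {
 *              ret = btrfs_search_forward(root, &min_key, NULL, path,
 *                                         cache_only, min_trans);
 *              if (ret)
 *                      break;
 *              (process the block at path->nodes[path->lowest_level])
 *              btrfs_release_path(root, path);
 *              (advance min_key past the key that was just processed)
 *      }
 */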
3576 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3577                          struct btrfs_key *max_key,
3578                          struct btrfs_path *path, int cache_only,
3579                          u64 min_trans)
3580 {
3581         struct extent_buffer *cur;
3582         struct btrfs_key found_key;
3583         int slot;
3584         int sret;
3585         u32 nritems;
3586         int level;
3587         int ret = 1;
3588
3589         WARN_ON(!path->keep_locks);
3590 again:
3591         cur = btrfs_lock_root_node(root);
3592         level = btrfs_header_level(cur);
3593         WARN_ON(path->nodes[level]);
3594         path->nodes[level] = cur;
3595         path->locks[level] = 1;
3596
3597         if (btrfs_header_generation(cur) < min_trans) {
3598                 ret = 1;
3599                 goto out;
3600         }
3601         while(1) {
3602                 nritems = btrfs_header_nritems(cur);
3603                 level = btrfs_header_level(cur);
3604                 sret = bin_search(cur, min_key, level, &slot);
3605
3606                 /* at the lowest level, we're done, setup the path and exit */
3607                 if (level == path->lowest_level) {
3608                         if (slot >= nritems)
3609                                 goto find_next_key;
3610                         ret = 0;
3611                         path->slots[level] = slot;
3612                         btrfs_item_key_to_cpu(cur, &found_key, slot);
3613                         goto out;
3614                 }
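                /*
                 * no exact match, the pointer that covers min_key is in
                 * the previous slot
                 */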
3615                 if (sret && slot > 0)
3616                         slot--;
3617                 /*
3618                  * check this node pointer against the cache_only and
3619                  * min_trans parameters.  If it isn't in cache or is too
3620                  * old, skip to the next one.
3621                  */
3622                 while(slot < nritems) {
3623                         u64 blockptr;
3624                         u64 gen;
3625                         struct extent_buffer *tmp;
3626                         struct btrfs_disk_key disk_key;
3627
3628                         blockptr = btrfs_node_blockptr(cur, slot);
3629                         gen = btrfs_node_ptr_generation(cur, slot);
3630                         if (gen < min_trans) {
3631                                 slot++;
3632                                 continue;
3633                         }
3634                         if (!cache_only)
3635                                 break;
3636
3637                         if (max_key) {
3638                                 btrfs_node_key(cur, &disk_key, slot);
3639                                 if (comp_keys(&disk_key, max_key) >= 0) {
3640                                         ret = 1;
3641                                         goto out;
3642                                 }
3643                         }
3644
3645                         tmp = btrfs_find_tree_block(root, blockptr,
3646                                             btrfs_level_size(root, level - 1));
3647
3648                         if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
3649                                 free_extent_buffer(tmp);
3650                                 break;
3651                         }
3652                         if (tmp)
3653                                 free_extent_buffer(tmp);
3654                         slot++;
3655                 }
3656 find_next_key:
3657                 /*
3658                  * we didn't find a candidate key in this node, walk forward
3659                  * and find another one
3660                  */
3661                 if (slot >= nritems) {
3662                         path->slots[level] = slot;
3663                         sret = btrfs_find_next_key(root, path, min_key, level,
3664                                                   cache_only, min_trans);
3665                         if (sret == 0) {
3666                                 btrfs_release_path(root, path);
3667                                 goto again;
3668                         } else {
3669                                 goto out;
3670                         }
3671                 }
3672                 /* save our key for returning back */
3673                 btrfs_node_key_to_cpu(cur, &found_key, slot);
3674                 path->slots[level] = slot;
3675                 if (level == path->lowest_level) {
3676                         ret = 0;
3677                         unlock_up(path, level, 1);
3678                         goto out;
3679                 }
3680                 cur = read_node_slot(root, cur, slot);
3681
3682                 btrfs_tree_lock(cur);
3683                 path->locks[level - 1] = 1;
3684                 path->nodes[level - 1] = cur;
3685                 unlock_up(path, level, 1);
3686         }
3687 out:
3688         if (ret == 0)
3689                 memcpy(min_key, &found_key, sizeof(found_key));
3690         return ret;
3691 }
3692
3693 /*
3694  * this is similar to btrfs_next_leaf, but does not try to preserve
3695  * and fixup the path.  It looks for and returns the next key in the
3696  * tree based on the current path and the cache_only and min_trans
3697  * parameters.
3698  *
3699  * 0 is returned if another key is found, < 0 if there are any errors
3700  * and 1 is returned if there are no higher keys in the tree
3701  *
3702  * path->keep_locks should be set to 1 on the search made before
3703  * calling this function.
3704  */
3705 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
3706                         struct btrfs_key *key, int lowest_level,
3707                         int cache_only, u64 min_trans)
3708 {
3709         int level = lowest_level;
3710         int slot;
3711         struct extent_buffer *c;
3712
3713         WARN_ON(!path->keep_locks);
3714         while(level < BTRFS_MAX_LEVEL) {
3715                 if (!path->nodes[level])
3716                         return 1;
3717
3718                 slot = path->slots[level] + 1;
3719                 c = path->nodes[level];
3720 next:
3721                 if (slot >= btrfs_header_nritems(c)) {
3722                         level++;
3723                         if (level == BTRFS_MAX_LEVEL) {
3724                                 return 1;
3725                         }
3726                         continue;
3727                 }
3728                 if (level == 0)
3729                         btrfs_item_key_to_cpu(c, key, slot);
3730                 else {
3731                         u64 blockptr = btrfs_node_blockptr(c, slot);
3732                         u64 gen = btrfs_node_ptr_generation(c, slot);
3733
3734                         if (cache_only) {
3735                                 struct extent_buffer *cur;
3736                                 cur = btrfs_find_tree_block(root, blockptr,
3737                                             btrfs_level_size(root, level - 1));
3738                                 if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
3739                                         slot++;
3740                                         if (cur)
3741                                                 free_extent_buffer(cur);
3742                                         goto next;
3743                                 }
3744                                 free_extent_buffer(cur);
3745                         }
3746                         if (gen < min_trans) {
3747                                 slot++;
3748                                 goto next;
3749                         }
3750                         btrfs_node_key_to_cpu(c, key, slot);
3751                 }
3752                 return 0;
3753         }
3754         return 1;
3755 }
3756
3757 /*
3758  * search the tree again to find a leaf with greater keys
3759  * returns 0 if it found something or 1 if there are no greater leaves.
3760  * returns < 0 on io errors.
3761  */
3762 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3763 {
3764         int slot;
3765         int level = 1;
3766         struct extent_buffer *c;
3767         struct extent_buffer *next = NULL;
3768         struct btrfs_key key;
3769         u32 nritems;
3770         int ret;
3771
3772         nritems = btrfs_header_nritems(path->nodes[0]);
3773         if (nritems == 0) {
3774                 return 1;
3775         }
3776
3777         btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
3778
3779         btrfs_release_path(root, path);
3780         path->keep_locks = 1;
3781         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3782         path->keep_locks = 0;
3783
3784         if (ret < 0)
3785                 return ret;
3786
3787         nritems = btrfs_header_nritems(path->nodes[0]);
3788         /*
3789          * by releasing the path above we dropped all our locks.  A balance
3790          * could have added more items next to the key that used to be
3791          * at the very end of the block.  So, check again here and
3792          * advance the path if there are now more items available.
3793          */
3794         if (nritems > 0 && path->slots[0] < nritems - 1) {
3795                 path->slots[0]++;
3796                 goto done;
3797         }
3798
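        /*
         * walk up the tree until we find a node with a slot to the right
         * of the one we came from
         */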
3799         while(level < BTRFS_MAX_LEVEL) {
3800                 if (!path->nodes[level])
3801                         return 1;
3802
3803                 slot = path->slots[level] + 1;
3804                 c = path->nodes[level];
3805                 if (slot >= btrfs_header_nritems(c)) {
3806                         level++;
3807                         if (level == BTRFS_MAX_LEVEL) {
3808                                 return 1;
3809                         }
3810                         continue;
3811                 }
3812
3813                 if (next) {
3814                         btrfs_tree_unlock(next);
3815                         free_extent_buffer(next);
3816                 }
3817
3818                 if (level == 1 && (path->locks[1] || path->skip_locking) &&
3819                     path->reada)
3820                         reada_for_search(root, path, level, slot, 0);
3821
3822                 next = read_node_slot(root, c, slot);
3823                 if (!path->skip_locking) {
3824                         WARN_ON(!btrfs_tree_locked(c));
3825                         btrfs_tree_lock(next);
3826                 }
3827                 break;
3828         }
3829         path->slots[level] = slot;
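        /* now walk down the left edge of that subtree to its first leaf */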
3830         while(1) {
3831                 level--;
3832                 c = path->nodes[level];
3833                 if (path->locks[level])
3834                         btrfs_tree_unlock(c);
3835                 free_extent_buffer(c);
3836                 path->nodes[level] = next;
3837                 path->slots[level] = 0;
3838                 if (!path->skip_locking)
3839                         path->locks[level] = 1;
3840                 if (!level)
3841                         break;
3842                 if (level == 1 && path->locks[1] && path->reada)
3843                         reada_for_search(root, path, level, slot, 0);
3844                 next = read_node_slot(root, next, 0);
3845                 if (!path->skip_locking) {
3846                         WARN_ON(!btrfs_tree_locked(path->nodes[level]));
3847                         btrfs_tree_lock(next);
3848                 }
3849         }
3850 done:
3851         unlock_up(path, 0, 1);
3852         return 0;
3853 }
3854
3855 /*
3856  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
3857  * searching until it gets past min_objectid or finds an item of 'type'
3858  *
3859  * returns 0 if something is found, 1 if nothing was found and < 0 on error
3860  */
3861 int btrfs_previous_item(struct btrfs_root *root,
3862                         struct btrfs_path *path, u64 min_objectid,
3863                         int type)
3864 {
3865         struct btrfs_key found_key;
3866         struct extent_buffer *leaf;
3867         u32 nritems;
3868         int ret;
3869
3870         while(1) {
3871                 if (path->slots[0] == 0) {
3872                         ret = btrfs_prev_leaf(root, path);
3873                         if (ret != 0)
3874                                 return ret;
3875                 } else {
3876                         path->slots[0]--;
3877                 }
3878                 leaf = path->nodes[0];
3879                 nritems = btrfs_header_nritems(leaf);
3880                 if (nritems == 0)
3881                         return 1;
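                /*
                 * btrfs_prev_leaf can leave the slot one past the last
                 * item, step back onto a real item
                 */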
3882                 if (path->slots[0] == nritems)
3883                         path->slots[0]--;
3884
3885                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3886                 if (found_key.type == type)
3887                         return 0;
3888                 if (found_key.objectid < min_objectid)
3889                         break;
3890                 if (found_key.objectid == min_objectid &&
3891                     found_key.type < type)
3892                         break;
3893         }
3894         return 1;
3895 }