]> Pileus Git - ~andy/linux/blob - fs/btrfs/disk-io.c
Merge tag 'v3.13-rc6' into for-3.14/core
[~andy/linux] / fs / btrfs / disk-io.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18
19 #include <linux/fs.h>
20 #include <linux/blkdev.h>
21 #include <linux/scatterlist.h>
22 #include <linux/swap.h>
23 #include <linux/radix-tree.h>
24 #include <linux/writeback.h>
25 #include <linux/buffer_head.h>
26 #include <linux/workqueue.h>
27 #include <linux/kthread.h>
28 #include <linux/freezer.h>
29 #include <linux/crc32c.h>
30 #include <linux/slab.h>
31 #include <linux/migrate.h>
32 #include <linux/ratelimit.h>
33 #include <linux/uuid.h>
34 #include <linux/semaphore.h>
35 #include <asm/unaligned.h>
36 #include "ctree.h"
37 #include "disk-io.h"
38 #include "transaction.h"
39 #include "btrfs_inode.h"
40 #include "volumes.h"
41 #include "print-tree.h"
42 #include "async-thread.h"
43 #include "locking.h"
44 #include "tree-log.h"
45 #include "free-space-cache.h"
46 #include "inode-map.h"
47 #include "check-integrity.h"
48 #include "rcu-string.h"
49 #include "dev-replace.h"
50 #include "raid56.h"
51
52 #ifdef CONFIG_X86
53 #include <asm/cpufeature.h>
54 #endif
55
/*
 * File-local object and helper declarations that are referenced before
 * their definitions (e.g. end_workqueue_fn is installed as a work callback
 * by end_workqueue_bio()).
 */
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
				    int read_only);
static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
					     struct btrfs_root *root);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
				      struct btrfs_root *root);
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_root *root,
					struct extent_io_tree *dirty_pages,
					int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
				       struct extent_io_tree *pinned_extents);
static int btrfs_cleanup_transaction(struct btrfs_root *root);
static void btrfs_error_commit_super(struct btrfs_root *root);
74
/*
 * end_io_wq structs are used to do processing in task context when an IO is
 * complete.  This is used during reads to verify checksums, and it is used
 * by writes to insert metadata for new file extents after IO is complete.
 *
 * btrfs_bio_wq_end_io() allocates one of these, saves the bio's original
 * completion callback and private pointer in it, and redirects the bio to
 * end_workqueue_bio(), which queues ->work on one of the endio worker pools.
 */
struct end_io_wq {
	struct bio *bio;		/* bio whose completion is being deferred */
	bio_end_io_t *end_io;		/* bio's bi_end_io as it was before hooking */
	void *private;			/* bio's bi_private as it was before hooking */
	struct btrfs_fs_info *info;	/* filesystem whose worker pools are used */
	int error;			/* status recorded by end_workqueue_bio() */
	int metadata;			/* BTRFS_WQ_ENDIO_* selector, 0 for data */
	struct list_head list;
	struct btrfs_work work;		/* work item; func becomes end_workqueue_fn */
};
90
/*
 * async submit bios are used to offload expensive checksumming
 * onto the worker threads.  They checksum file and metadata bios
 * just before they are sent down the IO stack.
 *
 * One of these is allocated per bio by btrfs_wq_submit_bio() and released
 * by run_one_async_free().
 */
struct async_submit_bio {
	struct inode *inode;		/* inode the bio belongs to */
	struct bio *bio;		/* the bio being submitted */
	struct list_head list;
	/* invoked from run_one_async_start(); its result lands in ->error */
	extent_submit_bio_hook_t *submit_bio_start;
	/* invoked from run_one_async_done() when ->error is zero */
	extent_submit_bio_hook_t *submit_bio_done;
	int rw;				/* rw flags passed through to both hooks */
	int mirror_num;			/* mirror number passed through to both hooks */
	unsigned long bio_flags;	/* bio flags passed through to both hooks */
	/*
	 * bio_offset is optional, can be used if the pages in the bio
	 * can't tell us where in the file the bio should go
	 */
	u64 bio_offset;
	struct btrfs_work work;		/* work item queued on fs_info->workers */
	int error;			/* non-zero: bio is ended with this status */
};
113
114 /*
115  * Lockdep class keys for extent_buffer->lock's in this root.  For a given
116  * eb, the lockdep key is determined by the btrfs_root it belongs to and
117  * the level the eb occupies in the tree.
118  *
119  * Different roots are used for different purposes and may nest inside each
120  * other and they require separate keysets.  As lockdep keys should be
121  * static, assign keysets according to the purpose of the root as indicated
122  * by btrfs_root->objectid.  This ensures that all special purpose roots
123  * have separate keysets.
124  *
125  * Lock-nesting across peer nodes is always done with the immediate parent
126  * node locked thus preventing deadlock.  As lockdep doesn't know this, use
127  * subclass to avoid triggering lockdep warning in such cases.
128  *
129  * The key is set by the readpage_end_io_hook after the buffer has passed
130  * csum validation but before the pages are unlocked.  It is also set by
131  * btrfs_init_new_buffer on freshly allocated blocks.
132  *
133  * We also add a check to make sure the highest level of the tree is the
134  * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
135  * needs update as well.
136  */
137 #ifdef CONFIG_DEBUG_LOCK_ALLOC
138 # if BTRFS_MAX_LEVEL != 8
139 #  error
140 # endif
141
/*
 * One keyset per special-purpose root.  The final entry (id 0) is the
 * default used for every objectid not listed explicitly; the lookup in
 * btrfs_set_buffer_lockdep_class() stops at it, so it must stay last.
 * The names[] strings are left empty here and filled in by
 * btrfs_init_lockdep().
 */
static struct btrfs_lockdep_keyset {
	u64			id;		/* root objectid */
	const char		*name_stem;	/* lock name stem */
	char			names[BTRFS_MAX_LEVEL + 1][20];	/* one lock name per level */
	struct lock_class_key	keys[BTRFS_MAX_LEVEL + 1];	/* one lock class per level */
} btrfs_lockdep_keysets[] = {
	{ .id = BTRFS_ROOT_TREE_OBJECTID,	.name_stem = "root"	},
	{ .id = BTRFS_EXTENT_TREE_OBJECTID,	.name_stem = "extent"	},
	{ .id = BTRFS_CHUNK_TREE_OBJECTID,	.name_stem = "chunk"	},
	{ .id = BTRFS_DEV_TREE_OBJECTID,	.name_stem = "dev"	},
	{ .id = BTRFS_FS_TREE_OBJECTID,		.name_stem = "fs"	},
	{ .id = BTRFS_CSUM_TREE_OBJECTID,	.name_stem = "csum"	},
	{ .id = BTRFS_QUOTA_TREE_OBJECTID,	.name_stem = "quota"	},
	{ .id = BTRFS_TREE_LOG_OBJECTID,	.name_stem = "log"	},
	{ .id = BTRFS_TREE_RELOC_OBJECTID,	.name_stem = "treloc"	},
	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	.name_stem = "dreloc"	},
	{ .id = BTRFS_UUID_TREE_OBJECTID,	.name_stem = "uuid"	},
	{ .id = 0,				.name_stem = "tree"	},
};
161
162 void __init btrfs_init_lockdep(void)
163 {
164         int i, j;
165
166         /* initialize lockdep class names */
167         for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
168                 struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
169
170                 for (j = 0; j < ARRAY_SIZE(ks->names); j++)
171                         snprintf(ks->names[j], sizeof(ks->names[j]),
172                                  "btrfs-%s-%02d", ks->name_stem, j);
173         }
174 }
175
176 void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
177                                     int level)
178 {
179         struct btrfs_lockdep_keyset *ks;
180
181         BUG_ON(level >= ARRAY_SIZE(ks->keys));
182
183         /* find the matching keyset, id 0 is the default entry */
184         for (ks = btrfs_lockdep_keysets; ks->id; ks++)
185                 if (ks->id == objectid)
186                         break;
187
188         lockdep_set_class_and_name(&eb->lock,
189                                    &ks->keys[level], ks->names[level]);
190 }
191
192 #endif
193
194 /*
195  * extents on the btree inode are pretty simple, there's one extent
196  * that covers the entire device
197  */
198 static struct extent_map *btree_get_extent(struct inode *inode,
199                 struct page *page, size_t pg_offset, u64 start, u64 len,
200                 int create)
201 {
202         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
203         struct extent_map *em;
204         int ret;
205
206         read_lock(&em_tree->lock);
207         em = lookup_extent_mapping(em_tree, start, len);
208         if (em) {
209                 em->bdev =
210                         BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
211                 read_unlock(&em_tree->lock);
212                 goto out;
213         }
214         read_unlock(&em_tree->lock);
215
216         em = alloc_extent_map();
217         if (!em) {
218                 em = ERR_PTR(-ENOMEM);
219                 goto out;
220         }
221         em->start = 0;
222         em->len = (u64)-1;
223         em->block_len = (u64)-1;
224         em->block_start = 0;
225         em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
226
227         write_lock(&em_tree->lock);
228         ret = add_extent_mapping(em_tree, em, 0);
229         if (ret == -EEXIST) {
230                 free_extent_map(em);
231                 em = lookup_extent_mapping(em_tree, start, len);
232                 if (!em)
233                         em = ERR_PTR(-EIO);
234         } else if (ret) {
235                 free_extent_map(em);
236                 em = ERR_PTR(ret);
237         }
238         write_unlock(&em_tree->lock);
239
240 out:
241         return em;
242 }
243
244 u32 btrfs_csum_data(char *data, u32 seed, size_t len)
245 {
246         return crc32c(seed, data, len);
247 }
248
249 void btrfs_csum_final(u32 crc, char *result)
250 {
251         put_unaligned_le32(~crc, result);
252 }
253
254 /*
255  * compute the csum for a btree block, and either verify it or write it
256  * into the csum field of the block.
257  */
258 static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
259                            int verify)
260 {
261         u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
262         char *result = NULL;
263         unsigned long len;
264         unsigned long cur_len;
265         unsigned long offset = BTRFS_CSUM_SIZE;
266         char *kaddr;
267         unsigned long map_start;
268         unsigned long map_len;
269         int err;
270         u32 crc = ~(u32)0;
271         unsigned long inline_result;
272
273         len = buf->len - offset;
274         while (len > 0) {
275                 err = map_private_extent_buffer(buf, offset, 32,
276                                         &kaddr, &map_start, &map_len);
277                 if (err)
278                         return 1;
279                 cur_len = min(len, map_len - (offset - map_start));
280                 crc = btrfs_csum_data(kaddr + offset - map_start,
281                                       crc, cur_len);
282                 len -= cur_len;
283                 offset += cur_len;
284         }
285         if (csum_size > sizeof(inline_result)) {
286                 result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
287                 if (!result)
288                         return 1;
289         } else {
290                 result = (char *)&inline_result;
291         }
292
293         btrfs_csum_final(crc, result);
294
295         if (verify) {
296                 if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
297                         u32 val;
298                         u32 found = 0;
299                         memcpy(&found, result, csum_size);
300
301                         read_extent_buffer(buf, &val, 0, csum_size);
302                         printk_ratelimited(KERN_INFO "btrfs: %s checksum verify "
303                                        "failed on %llu wanted %X found %X "
304                                        "level %d\n",
305                                        root->fs_info->sb->s_id, buf->start,
306                                        val, found, btrfs_header_level(buf));
307                         if (result != (char *)&inline_result)
308                                 kfree(result);
309                         return 1;
310                 }
311         } else {
312                 write_extent_buffer(buf, result, 0, csum_size);
313         }
314         if (result != (char *)&inline_result)
315                 kfree(result);
316         return 0;
317 }
318
319 /*
320  * we can't consider a given block up to date unless the transid of the
321  * block matches the transid in the parent node's pointer.  This is how we
322  * detect blocks that either didn't get written at all or got written
323  * in the wrong place.
324  */
325 static int verify_parent_transid(struct extent_io_tree *io_tree,
326                                  struct extent_buffer *eb, u64 parent_transid,
327                                  int atomic)
328 {
329         struct extent_state *cached_state = NULL;
330         int ret;
331
332         if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
333                 return 0;
334
335         if (atomic)
336                 return -EAGAIN;
337
338         lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
339                          0, &cached_state);
340         if (extent_buffer_uptodate(eb) &&
341             btrfs_header_generation(eb) == parent_transid) {
342                 ret = 0;
343                 goto out;
344         }
345         printk_ratelimited("parent transid verify failed on %llu wanted %llu "
346                        "found %llu\n",
347                        eb->start, parent_transid, btrfs_header_generation(eb));
348         ret = 1;
349         clear_extent_buffer_uptodate(eb);
350 out:
351         unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
352                              &cached_state, GFP_NOFS);
353         return ret;
354 }
355
356 /*
357  * Return 0 if the superblock checksum type matches the checksum value of that
358  * algorithm. Pass the raw disk superblock data.
359  */
360 static int btrfs_check_super_csum(char *raw_disk_sb)
361 {
362         struct btrfs_super_block *disk_sb =
363                 (struct btrfs_super_block *)raw_disk_sb;
364         u16 csum_type = btrfs_super_csum_type(disk_sb);
365         int ret = 0;
366
367         if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
368                 u32 crc = ~(u32)0;
369                 const int csum_size = sizeof(crc);
370                 char result[csum_size];
371
372                 /*
373                  * The super_block structure does not span the whole
374                  * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space
375                  * is filled with zeros and is included in the checkum.
376                  */
377                 crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE,
378                                 crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
379                 btrfs_csum_final(crc, result);
380
381                 if (memcmp(raw_disk_sb, result, csum_size))
382                         ret = 1;
383
384                 if (ret && btrfs_super_generation(disk_sb) < 10) {
385                         printk(KERN_WARNING "btrfs: super block crcs don't match, older mkfs detected\n");
386                         ret = 0;
387                 }
388         }
389
390         if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
391                 printk(KERN_ERR "btrfs: unsupported checksum algorithm %u\n",
392                                 csum_type);
393                 ret = 1;
394         }
395
396         return ret;
397 }
398
399 /*
400  * helper to read a given tree block, doing retries as required when
401  * the checksums don't match and we have alternate mirrors to try.
402  */
403 static int btree_read_extent_buffer_pages(struct btrfs_root *root,
404                                           struct extent_buffer *eb,
405                                           u64 start, u64 parent_transid)
406 {
407         struct extent_io_tree *io_tree;
408         int failed = 0;
409         int ret;
410         int num_copies = 0;
411         int mirror_num = 0;
412         int failed_mirror = 0;
413
414         clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
415         io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
416         while (1) {
417                 ret = read_extent_buffer_pages(io_tree, eb, start,
418                                                WAIT_COMPLETE,
419                                                btree_get_extent, mirror_num);
420                 if (!ret) {
421                         if (!verify_parent_transid(io_tree, eb,
422                                                    parent_transid, 0))
423                                 break;
424                         else
425                                 ret = -EIO;
426                 }
427
428                 /*
429                  * This buffer's crc is fine, but its contents are corrupted, so
430                  * there is no reason to read the other copies, they won't be
431                  * any less wrong.
432                  */
433                 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
434                         break;
435
436                 num_copies = btrfs_num_copies(root->fs_info,
437                                               eb->start, eb->len);
438                 if (num_copies == 1)
439                         break;
440
441                 if (!failed_mirror) {
442                         failed = 1;
443                         failed_mirror = eb->read_mirror;
444                 }
445
446                 mirror_num++;
447                 if (mirror_num == failed_mirror)
448                         mirror_num++;
449
450                 if (mirror_num > num_copies)
451                         break;
452         }
453
454         if (failed && !ret && failed_mirror)
455                 repair_eb_io_failure(root, eb, failed_mirror);
456
457         return ret;
458 }
459
460 /*
461  * checksum a dirty tree block before IO.  This has extra checks to make sure
462  * we only fill in the checksum field in the first page of a multi-page block
463  */
464
465 static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
466 {
467         struct extent_io_tree *tree;
468         u64 start = page_offset(page);
469         u64 found_start;
470         struct extent_buffer *eb;
471
472         tree = &BTRFS_I(page->mapping->host)->io_tree;
473
474         eb = (struct extent_buffer *)page->private;
475         if (page != eb->pages[0])
476                 return 0;
477         found_start = btrfs_header_bytenr(eb);
478         if (WARN_ON(found_start != start || !PageUptodate(page)))
479                 return 0;
480         csum_tree_block(root, eb, 0);
481         return 0;
482 }
483
484 static int check_tree_block_fsid(struct btrfs_root *root,
485                                  struct extent_buffer *eb)
486 {
487         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
488         u8 fsid[BTRFS_UUID_SIZE];
489         int ret = 1;
490
491         read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
492         while (fs_devices) {
493                 if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
494                         ret = 0;
495                         break;
496                 }
497                 fs_devices = fs_devices->seed;
498         }
499         return ret;
500 }
501
/*
 * Log a corrupt-leaf message naming the reason, the block's bytenr, the
 * owning root's objectid and the offending slot.  @root is parenthesized
 * before the member access so that any pointer expression is safe to pass.
 */
#define CORRUPT(reason, eb, root, slot)				\
	printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu,"	\
	       "root=%llu, slot=%d\n", (reason),			\
	       btrfs_header_bytenr(eb), (root)->objectid, (slot))
506
507 static noinline int check_leaf(struct btrfs_root *root,
508                                struct extent_buffer *leaf)
509 {
510         struct btrfs_key key;
511         struct btrfs_key leaf_key;
512         u32 nritems = btrfs_header_nritems(leaf);
513         int slot;
514
515         if (nritems == 0)
516                 return 0;
517
518         /* Check the 0 item */
519         if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
520             BTRFS_LEAF_DATA_SIZE(root)) {
521                 CORRUPT("invalid item offset size pair", leaf, root, 0);
522                 return -EIO;
523         }
524
525         /*
526          * Check to make sure each items keys are in the correct order and their
527          * offsets make sense.  We only have to loop through nritems-1 because
528          * we check the current slot against the next slot, which verifies the
529          * next slot's offset+size makes sense and that the current's slot
530          * offset is correct.
531          */
532         for (slot = 0; slot < nritems - 1; slot++) {
533                 btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
534                 btrfs_item_key_to_cpu(leaf, &key, slot + 1);
535
536                 /* Make sure the keys are in the right order */
537                 if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
538                         CORRUPT("bad key order", leaf, root, slot);
539                         return -EIO;
540                 }
541
542                 /*
543                  * Make sure the offset and ends are right, remember that the
544                  * item data starts at the end of the leaf and grows towards the
545                  * front.
546                  */
547                 if (btrfs_item_offset_nr(leaf, slot) !=
548                         btrfs_item_end_nr(leaf, slot + 1)) {
549                         CORRUPT("slot offset bad", leaf, root, slot);
550                         return -EIO;
551                 }
552
553                 /*
554                  * Check to make sure that we don't point outside of the leaf,
555                  * just incase all the items are consistent to eachother, but
556                  * all point outside of the leaf.
557                  */
558                 if (btrfs_item_end_nr(leaf, slot) >
559                     BTRFS_LEAF_DATA_SIZE(root)) {
560                         CORRUPT("slot end outside of leaf", leaf, root, slot);
561                         return -EIO;
562                 }
563         }
564
565         return 0;
566 }
567
568 static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
569                                       u64 phy_offset, struct page *page,
570                                       u64 start, u64 end, int mirror)
571 {
572         struct extent_io_tree *tree;
573         u64 found_start;
574         int found_level;
575         struct extent_buffer *eb;
576         struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
577         int ret = 0;
578         int reads_done;
579
580         if (!page->private)
581                 goto out;
582
583         tree = &BTRFS_I(page->mapping->host)->io_tree;
584         eb = (struct extent_buffer *)page->private;
585
586         /* the pending IO might have been the only thing that kept this buffer
587          * in memory.  Make sure we have a ref for all this other checks
588          */
589         extent_buffer_get(eb);
590
591         reads_done = atomic_dec_and_test(&eb->io_pages);
592         if (!reads_done)
593                 goto err;
594
595         eb->read_mirror = mirror;
596         if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
597                 ret = -EIO;
598                 goto err;
599         }
600
601         found_start = btrfs_header_bytenr(eb);
602         if (found_start != eb->start) {
603                 printk_ratelimited(KERN_INFO "btrfs bad tree block start "
604                                "%llu %llu\n",
605                                found_start, eb->start);
606                 ret = -EIO;
607                 goto err;
608         }
609         if (check_tree_block_fsid(root, eb)) {
610                 printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
611                                eb->start);
612                 ret = -EIO;
613                 goto err;
614         }
615         found_level = btrfs_header_level(eb);
616         if (found_level >= BTRFS_MAX_LEVEL) {
617                 btrfs_info(root->fs_info, "bad tree block level %d\n",
618                            (int)btrfs_header_level(eb));
619                 ret = -EIO;
620                 goto err;
621         }
622
623         btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
624                                        eb, found_level);
625
626         ret = csum_tree_block(root, eb, 1);
627         if (ret) {
628                 ret = -EIO;
629                 goto err;
630         }
631
632         /*
633          * If this is a leaf block and it is corrupt, set the corrupt bit so
634          * that we don't try and read the other copies of this block, just
635          * return -EIO.
636          */
637         if (found_level == 0 && check_leaf(root, eb)) {
638                 set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
639                 ret = -EIO;
640         }
641
642         if (!ret)
643                 set_extent_buffer_uptodate(eb);
644 err:
645         if (reads_done &&
646             test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
647                 btree_readahead_hook(root, eb, eb->start, ret);
648
649         if (ret) {
650                 /*
651                  * our io error hook is going to dec the io pages
652                  * again, we have to make sure it has something
653                  * to decrement
654                  */
655                 atomic_inc(&eb->io_pages);
656                 clear_extent_buffer_uptodate(eb);
657         }
658         free_extent_buffer(eb);
659 out:
660         return ret;
661 }
662
663 static int btree_io_failed_hook(struct page *page, int failed_mirror)
664 {
665         struct extent_buffer *eb;
666         struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
667
668         eb = (struct extent_buffer *)page->private;
669         set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
670         eb->read_mirror = failed_mirror;
671         atomic_dec(&eb->io_pages);
672         if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
673                 btree_readahead_hook(root, eb, eb->start, -EIO);
674         return -EIO;    /* we fixed nothing */
675 }
676
677 static void end_workqueue_bio(struct bio *bio, int err)
678 {
679         struct end_io_wq *end_io_wq = bio->bi_private;
680         struct btrfs_fs_info *fs_info;
681
682         fs_info = end_io_wq->info;
683         end_io_wq->error = err;
684         end_io_wq->work.func = end_workqueue_fn;
685         end_io_wq->work.flags = 0;
686
687         if (bio->bi_rw & REQ_WRITE) {
688                 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
689                         btrfs_queue_worker(&fs_info->endio_meta_write_workers,
690                                            &end_io_wq->work);
691                 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
692                         btrfs_queue_worker(&fs_info->endio_freespace_worker,
693                                            &end_io_wq->work);
694                 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
695                         btrfs_queue_worker(&fs_info->endio_raid56_workers,
696                                            &end_io_wq->work);
697                 else
698                         btrfs_queue_worker(&fs_info->endio_write_workers,
699                                            &end_io_wq->work);
700         } else {
701                 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
702                         btrfs_queue_worker(&fs_info->endio_raid56_workers,
703                                            &end_io_wq->work);
704                 else if (end_io_wq->metadata)
705                         btrfs_queue_worker(&fs_info->endio_meta_workers,
706                                            &end_io_wq->work);
707                 else
708                         btrfs_queue_worker(&fs_info->endio_workers,
709                                            &end_io_wq->work);
710         }
711 }
712
713 /*
714  * For the metadata arg you want
715  *
716  * 0 - if data
717  * 1 - if normal metadta
718  * 2 - if writing to the free space cache area
719  * 3 - raid parity work
720  */
721 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
722                         int metadata)
723 {
724         struct end_io_wq *end_io_wq;
725         end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
726         if (!end_io_wq)
727                 return -ENOMEM;
728
729         end_io_wq->private = bio->bi_private;
730         end_io_wq->end_io = bio->bi_end_io;
731         end_io_wq->info = info;
732         end_io_wq->error = 0;
733         end_io_wq->bio = bio;
734         end_io_wq->metadata = metadata;
735
736         bio->bi_private = end_io_wq;
737         bio->bi_end_io = end_workqueue_bio;
738         return 0;
739 }
740
741 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
742 {
743         unsigned long limit = min_t(unsigned long,
744                                     info->workers.max_workers,
745                                     info->fs_devices->open_devices);
746         return 256 * limit;
747 }
748
749 static void run_one_async_start(struct btrfs_work *work)
750 {
751         struct async_submit_bio *async;
752         int ret;
753
754         async = container_of(work, struct  async_submit_bio, work);
755         ret = async->submit_bio_start(async->inode, async->rw, async->bio,
756                                       async->mirror_num, async->bio_flags,
757                                       async->bio_offset);
758         if (ret)
759                 async->error = ret;
760 }
761
762 static void run_one_async_done(struct btrfs_work *work)
763 {
764         struct btrfs_fs_info *fs_info;
765         struct async_submit_bio *async;
766         int limit;
767
768         async = container_of(work, struct  async_submit_bio, work);
769         fs_info = BTRFS_I(async->inode)->root->fs_info;
770
771         limit = btrfs_async_submit_limit(fs_info);
772         limit = limit * 2 / 3;
773
774         if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
775             waitqueue_active(&fs_info->async_submit_wait))
776                 wake_up(&fs_info->async_submit_wait);
777
778         /* If an error occured we just want to clean up the bio and move on */
779         if (async->error) {
780                 bio_endio(async->bio, async->error);
781                 return;
782         }
783
784         async->submit_bio_done(async->inode, async->rw, async->bio,
785                                async->mirror_num, async->bio_flags,
786                                async->bio_offset);
787 }
788
789 static void run_one_async_free(struct btrfs_work *work)
790 {
791         struct async_submit_bio *async;
792
793         async = container_of(work, struct  async_submit_bio, work);
794         kfree(async);
795 }
796
797 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
798                         int rw, struct bio *bio, int mirror_num,
799                         unsigned long bio_flags,
800                         u64 bio_offset,
801                         extent_submit_bio_hook_t *submit_bio_start,
802                         extent_submit_bio_hook_t *submit_bio_done)
803 {
804         struct async_submit_bio *async;
805
806         async = kmalloc(sizeof(*async), GFP_NOFS);
807         if (!async)
808                 return -ENOMEM;
809
810         async->inode = inode;
811         async->rw = rw;
812         async->bio = bio;
813         async->mirror_num = mirror_num;
814         async->submit_bio_start = submit_bio_start;
815         async->submit_bio_done = submit_bio_done;
816
817         async->work.func = run_one_async_start;
818         async->work.ordered_func = run_one_async_done;
819         async->work.ordered_free = run_one_async_free;
820
821         async->work.flags = 0;
822         async->bio_flags = bio_flags;
823         async->bio_offset = bio_offset;
824
825         async->error = 0;
826
827         atomic_inc(&fs_info->nr_async_submits);
828
829         if (rw & REQ_SYNC)
830                 btrfs_set_work_high_prio(&async->work);
831
832         btrfs_queue_worker(&fs_info->workers, &async->work);
833
834         while (atomic_read(&fs_info->async_submit_draining) &&
835               atomic_read(&fs_info->nr_async_submits)) {
836                 wait_event(fs_info->async_submit_wait,
837                            (atomic_read(&fs_info->nr_async_submits) == 0));
838         }
839
840         return 0;
841 }
842
843 static int btree_csum_one_bio(struct bio *bio)
844 {
845         struct bio_vec *bvec;
846         struct btrfs_root *root;
847         int i, ret = 0;
848
849         bio_for_each_segment_all(bvec, bio, i) {
850                 root = BTRFS_I(bvec->bv_page->mapping->host)->root;
851                 ret = csum_dirty_buffer(root, bvec->bv_page);
852                 if (ret)
853                         break;
854         }
855
856         return ret;
857 }
858
859 static int __btree_submit_bio_start(struct inode *inode, int rw,
860                                     struct bio *bio, int mirror_num,
861                                     unsigned long bio_flags,
862                                     u64 bio_offset)
863 {
864         /*
865          * when we're called for a write, we're already in the async
866          * submission context.  Just jump into btrfs_map_bio
867          */
868         return btree_csum_one_bio(bio);
869 }
870
871 static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
872                                  int mirror_num, unsigned long bio_flags,
873                                  u64 bio_offset)
874 {
875         int ret;
876
877         /*
878          * when we're called for a write, we're already in the async
879          * submission context.  Just jump into btrfs_map_bio
880          */
881         ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
882         if (ret)
883                 bio_endio(bio, ret);
884         return ret;
885 }
886
887 static int check_async_write(struct inode *inode, unsigned long bio_flags)
888 {
889         if (bio_flags & EXTENT_BIO_TREE_LOG)
890                 return 0;
891 #ifdef CONFIG_X86
892         if (cpu_has_xmm4_2)
893                 return 0;
894 #endif
895         return 1;
896 }
897
898 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
899                                  int mirror_num, unsigned long bio_flags,
900                                  u64 bio_offset)
901 {
902         int async = check_async_write(inode, bio_flags);
903         int ret;
904
905         if (!(rw & REQ_WRITE)) {
906                 /*
907                  * called for a read, do the setup so that checksum validation
908                  * can happen in the async kernel threads
909                  */
910                 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
911                                           bio, 1);
912                 if (ret)
913                         goto out_w_error;
914                 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
915                                     mirror_num, 0);
916         } else if (!async) {
917                 ret = btree_csum_one_bio(bio);
918                 if (ret)
919                         goto out_w_error;
920                 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
921                                     mirror_num, 0);
922         } else {
923                 /*
924                  * kthread helpers are used to submit writes so that
925                  * checksumming can happen in parallel across all CPUs
926                  */
927                 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
928                                           inode, rw, bio, mirror_num, 0,
929                                           bio_offset,
930                                           __btree_submit_bio_start,
931                                           __btree_submit_bio_done);
932         }
933
934         if (ret) {
935 out_w_error:
936                 bio_endio(bio, ret);
937         }
938         return ret;
939 }
940
941 #ifdef CONFIG_MIGRATION
942 static int btree_migratepage(struct address_space *mapping,
943                         struct page *newpage, struct page *page,
944                         enum migrate_mode mode)
945 {
946         /*
947          * we can't safely write a btree page from here,
948          * we haven't done the locking hook
949          */
950         if (PageDirty(page))
951                 return -EAGAIN;
952         /*
953          * Buffers may be managed in a filesystem specific way.
954          * We must have no buffers or drop them.
955          */
956         if (page_has_private(page) &&
957             !try_to_release_page(page, GFP_KERNEL))
958                 return -EAGAIN;
959         return migrate_page(mapping, newpage, page, mode);
960 }
961 #endif
962
963
964 static int btree_writepages(struct address_space *mapping,
965                             struct writeback_control *wbc)
966 {
967         struct extent_io_tree *tree;
968         struct btrfs_fs_info *fs_info;
969         int ret;
970
971         tree = &BTRFS_I(mapping->host)->io_tree;
972         if (wbc->sync_mode == WB_SYNC_NONE) {
973
974                 if (wbc->for_kupdate)
975                         return 0;
976
977                 fs_info = BTRFS_I(mapping->host)->root->fs_info;
978                 /* this is a bit racy, but that's ok */
979                 ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
980                                              BTRFS_DIRTY_METADATA_THRESH);
981                 if (ret < 0)
982                         return 0;
983         }
984         return btree_write_cache_pages(mapping, wbc);
985 }
986
987 static int btree_readpage(struct file *file, struct page *page)
988 {
989         struct extent_io_tree *tree;
990         tree = &BTRFS_I(page->mapping->host)->io_tree;
991         return extent_read_full_page(tree, page, btree_get_extent, 0);
992 }
993
994 static int btree_releasepage(struct page *page, gfp_t gfp_flags)
995 {
996         if (PageWriteback(page) || PageDirty(page))
997                 return 0;
998
999         return try_release_extent_buffer(page);
1000 }
1001
1002 static void btree_invalidatepage(struct page *page, unsigned int offset,
1003                                  unsigned int length)
1004 {
1005         struct extent_io_tree *tree;
1006         tree = &BTRFS_I(page->mapping->host)->io_tree;
1007         extent_invalidatepage(tree, page, offset);
1008         btree_releasepage(page, GFP_NOFS);
1009         if (PagePrivate(page)) {
1010                 printk(KERN_WARNING "btrfs warning page private not zero "
1011                        "on page %llu\n", (unsigned long long)page_offset(page));
1012                 ClearPagePrivate(page);
1013                 set_page_private(page, 0);
1014                 page_cache_release(page);
1015         }
1016 }
1017
/*
 * ->set_page_dirty for btree pages.
 *
 * Always ends in __set_page_dirty_nobuffers().  With DEBUG defined it first
 * asserts that the page has an extent buffer attached via page->private,
 * that the buffer is flagged EXTENT_BUFFER_DIRTY, still has references and
 * that its tree lock is held.
 */
static int btree_set_page_dirty(struct page *page)
{
#ifdef DEBUG
        struct extent_buffer *buf;

        BUG_ON(!PagePrivate(page));

        buf = (struct extent_buffer *)page->private;
        BUG_ON(!buf);
        BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
        BUG_ON(!atomic_read(&buf->refs));

        btrfs_assert_tree_locked(buf);
#endif
        return __set_page_dirty_nobuffers(page);
}
1032
1033 static const struct address_space_operations btree_aops = {
1034         .readpage       = btree_readpage,
1035         .writepages     = btree_writepages,
1036         .releasepage    = btree_releasepage,
1037         .invalidatepage = btree_invalidatepage,
1038 #ifdef CONFIG_MIGRATION
1039         .migratepage    = btree_migratepage,
1040 #endif
1041         .set_page_dirty = btree_set_page_dirty,
1042 };
1043
1044 int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1045                          u64 parent_transid)
1046 {
1047         struct extent_buffer *buf = NULL;
1048         struct inode *btree_inode = root->fs_info->btree_inode;
1049         int ret = 0;
1050
1051         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
1052         if (!buf)
1053                 return 0;
1054         read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
1055                                  buf, 0, WAIT_NONE, btree_get_extent, 0);
1056         free_extent_buffer(buf);
1057         return ret;
1058 }
1059
1060 int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1061                          int mirror_num, struct extent_buffer **eb)
1062 {
1063         struct extent_buffer *buf = NULL;
1064         struct inode *btree_inode = root->fs_info->btree_inode;
1065         struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
1066         int ret;
1067
1068         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
1069         if (!buf)
1070                 return 0;
1071
1072         set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
1073
1074         ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
1075                                        btree_get_extent, mirror_num);
1076         if (ret) {
1077                 free_extent_buffer(buf);
1078                 return ret;
1079         }
1080
1081         if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
1082                 free_extent_buffer(buf);
1083                 return -EIO;
1084         } else if (extent_buffer_uptodate(buf)) {
1085                 *eb = buf;
1086         } else {
1087                 free_extent_buffer(buf);
1088         }
1089         return 0;
1090 }
1091
1092 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
1093                                             u64 bytenr, u32 blocksize)
1094 {
1095         struct inode *btree_inode = root->fs_info->btree_inode;
1096         struct extent_buffer *eb;
1097         eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, bytenr);
1098         return eb;
1099 }
1100
1101 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
1102                                                  u64 bytenr, u32 blocksize)
1103 {
1104         struct inode *btree_inode = root->fs_info->btree_inode;
1105         struct extent_buffer *eb;
1106
1107         eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
1108                                  bytenr, blocksize);
1109         return eb;
1110 }
1111
1112
1113 int btrfs_write_tree_block(struct extent_buffer *buf)
1114 {
1115         return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
1116                                         buf->start + buf->len - 1);
1117 }
1118
1119 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
1120 {
1121         return filemap_fdatawait_range(buf->pages[0]->mapping,
1122                                        buf->start, buf->start + buf->len - 1);
1123 }
1124
1125 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
1126                                       u32 blocksize, u64 parent_transid)
1127 {
1128         struct extent_buffer *buf = NULL;
1129         int ret;
1130
1131         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
1132         if (!buf)
1133                 return NULL;
1134
1135         ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
1136         if (ret) {
1137                 free_extent_buffer(buf);
1138                 return NULL;
1139         }
1140         return buf;
1141
1142 }
1143
1144 void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1145                       struct extent_buffer *buf)
1146 {
1147         struct btrfs_fs_info *fs_info = root->fs_info;
1148
1149         if (btrfs_header_generation(buf) ==
1150             fs_info->running_transaction->transid) {
1151                 btrfs_assert_tree_locked(buf);
1152
1153                 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
1154                         __percpu_counter_add(&fs_info->dirty_metadata_bytes,
1155                                              -buf->len,
1156                                              fs_info->dirty_metadata_batch);
1157                         /* ugh, clear_extent_buffer_dirty needs to lock the page */
1158                         btrfs_set_lock_blocking(buf);
1159                         clear_extent_buffer_dirty(buf);
1160                 }
1161         }
1162 }
1163
1164 static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1165                          u32 stripesize, struct btrfs_root *root,
1166                          struct btrfs_fs_info *fs_info,
1167                          u64 objectid)
1168 {
1169         root->node = NULL;
1170         root->commit_root = NULL;
1171         root->sectorsize = sectorsize;
1172         root->nodesize = nodesize;
1173         root->leafsize = leafsize;
1174         root->stripesize = stripesize;
1175         root->ref_cows = 0;
1176         root->track_dirty = 0;
1177         root->in_radix = 0;
1178         root->orphan_item_inserted = 0;
1179         root->orphan_cleanup_state = 0;
1180
1181         root->objectid = objectid;
1182         root->last_trans = 0;
1183         root->highest_objectid = 0;
1184         root->nr_delalloc_inodes = 0;
1185         root->nr_ordered_extents = 0;
1186         root->name = NULL;
1187         root->inode_tree = RB_ROOT;
1188         INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
1189         root->block_rsv = NULL;
1190         root->orphan_block_rsv = NULL;
1191
1192         INIT_LIST_HEAD(&root->dirty_list);
1193         INIT_LIST_HEAD(&root->root_list);
1194         INIT_LIST_HEAD(&root->delalloc_inodes);
1195         INIT_LIST_HEAD(&root->delalloc_root);
1196         INIT_LIST_HEAD(&root->ordered_extents);
1197         INIT_LIST_HEAD(&root->ordered_root);
1198         INIT_LIST_HEAD(&root->logged_list[0]);
1199         INIT_LIST_HEAD(&root->logged_list[1]);
1200         spin_lock_init(&root->orphan_lock);
1201         spin_lock_init(&root->inode_lock);
1202         spin_lock_init(&root->delalloc_lock);
1203         spin_lock_init(&root->ordered_extent_lock);
1204         spin_lock_init(&root->accounting_lock);
1205         spin_lock_init(&root->log_extents_lock[0]);
1206         spin_lock_init(&root->log_extents_lock[1]);
1207         mutex_init(&root->objectid_mutex);
1208         mutex_init(&root->log_mutex);
1209         init_waitqueue_head(&root->log_writer_wait);
1210         init_waitqueue_head(&root->log_commit_wait[0]);
1211         init_waitqueue_head(&root->log_commit_wait[1]);
1212         atomic_set(&root->log_commit[0], 0);
1213         atomic_set(&root->log_commit[1], 0);
1214         atomic_set(&root->log_writers, 0);
1215         atomic_set(&root->log_batch, 0);
1216         atomic_set(&root->orphan_inodes, 0);
1217         atomic_set(&root->refs, 1);
1218         root->log_transid = 0;
1219         root->last_log_commit = 0;
1220         if (fs_info)
1221                 extent_io_tree_init(&root->dirty_log_pages,
1222                                      fs_info->btree_inode->i_mapping);
1223
1224         memset(&root->root_key, 0, sizeof(root->root_key));
1225         memset(&root->root_item, 0, sizeof(root->root_item));
1226         memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
1227         memset(&root->root_kobj, 0, sizeof(root->root_kobj));
1228         if (fs_info)
1229                 root->defrag_trans_start = fs_info->generation;
1230         else
1231                 root->defrag_trans_start = 0;
1232         init_completion(&root->kobj_unregister);
1233         root->defrag_running = 0;
1234         root->root_key.objectid = objectid;
1235         root->anon_dev = 0;
1236
1237         spin_lock_init(&root->root_item_lock);
1238 }
1239
1240 static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
1241 {
1242         struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
1243         if (root)
1244                 root->fs_info = fs_info;
1245         return root;
1246 }
1247
1248 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
1249 /* Should only be used by the testing infrastructure */
1250 struct btrfs_root *btrfs_alloc_dummy_root(void)
1251 {
1252         struct btrfs_root *root;
1253
1254         root = btrfs_alloc_root(NULL);
1255         if (!root)
1256                 return ERR_PTR(-ENOMEM);
1257         __setup_root(4096, 4096, 4096, 4096, root, NULL, 1);
1258         root->dummy_root = 1;
1259
1260         return root;
1261 }
1262 #endif
1263
1264 struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1265                                      struct btrfs_fs_info *fs_info,
1266                                      u64 objectid)
1267 {
1268         struct extent_buffer *leaf;
1269         struct btrfs_root *tree_root = fs_info->tree_root;
1270         struct btrfs_root *root;
1271         struct btrfs_key key;
1272         int ret = 0;
1273         u64 bytenr;
1274         uuid_le uuid;
1275
1276         root = btrfs_alloc_root(fs_info);
1277         if (!root)
1278                 return ERR_PTR(-ENOMEM);
1279
1280         __setup_root(tree_root->nodesize, tree_root->leafsize,
1281                      tree_root->sectorsize, tree_root->stripesize,
1282                      root, fs_info, objectid);
1283         root->root_key.objectid = objectid;
1284         root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1285         root->root_key.offset = 0;
1286
1287         leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
1288                                       0, objectid, NULL, 0, 0, 0);
1289         if (IS_ERR(leaf)) {
1290                 ret = PTR_ERR(leaf);
1291                 leaf = NULL;
1292                 goto fail;
1293         }
1294
1295         bytenr = leaf->start;
1296         memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
1297         btrfs_set_header_bytenr(leaf, leaf->start);
1298         btrfs_set_header_generation(leaf, trans->transid);
1299         btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
1300         btrfs_set_header_owner(leaf, objectid);
1301         root->node = leaf;
1302
1303         write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(),
1304                             BTRFS_FSID_SIZE);
1305         write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
1306                             btrfs_header_chunk_tree_uuid(leaf),
1307                             BTRFS_UUID_SIZE);
1308         btrfs_mark_buffer_dirty(leaf);
1309
1310         root->commit_root = btrfs_root_node(root);
1311         root->track_dirty = 1;
1312
1313
1314         root->root_item.flags = 0;
1315         root->root_item.byte_limit = 0;
1316         btrfs_set_root_bytenr(&root->root_item, leaf->start);
1317         btrfs_set_root_generation(&root->root_item, trans->transid);
1318         btrfs_set_root_level(&root->root_item, 0);
1319         btrfs_set_root_refs(&root->root_item, 1);
1320         btrfs_set_root_used(&root->root_item, leaf->len);
1321         btrfs_set_root_last_snapshot(&root->root_item, 0);
1322         btrfs_set_root_dirid(&root->root_item, 0);
1323         uuid_le_gen(&uuid);
1324         memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
1325         root->root_item.drop_level = 0;
1326
1327         key.objectid = objectid;
1328         key.type = BTRFS_ROOT_ITEM_KEY;
1329         key.offset = 0;
1330         ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
1331         if (ret)
1332                 goto fail;
1333
1334         btrfs_tree_unlock(leaf);
1335
1336         return root;
1337
1338 fail:
1339         if (leaf) {
1340                 btrfs_tree_unlock(leaf);
1341                 free_extent_buffer(leaf);
1342         }
1343         kfree(root);
1344
1345         return ERR_PTR(ret);
1346 }
1347
1348 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1349                                          struct btrfs_fs_info *fs_info)
1350 {
1351         struct btrfs_root *root;
1352         struct btrfs_root *tree_root = fs_info->tree_root;
1353         struct extent_buffer *leaf;
1354
1355         root = btrfs_alloc_root(fs_info);
1356         if (!root)
1357                 return ERR_PTR(-ENOMEM);
1358
1359         __setup_root(tree_root->nodesize, tree_root->leafsize,
1360                      tree_root->sectorsize, tree_root->stripesize,
1361                      root, fs_info, BTRFS_TREE_LOG_OBJECTID);
1362
1363         root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
1364         root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1365         root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
1366         /*
1367          * log trees do not get reference counted because they go away
1368          * before a real commit is actually done.  They do store pointers
1369          * to file data extents, and those reference counts still get
1370          * updated (along with back refs to the log tree).
1371          */
1372         root->ref_cows = 0;
1373
1374         leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
1375                                       BTRFS_TREE_LOG_OBJECTID, NULL,
1376                                       0, 0, 0);
1377         if (IS_ERR(leaf)) {
1378                 kfree(root);
1379                 return ERR_CAST(leaf);
1380         }
1381
1382         memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
1383         btrfs_set_header_bytenr(leaf, leaf->start);
1384         btrfs_set_header_generation(leaf, trans->transid);
1385         btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
1386         btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
1387         root->node = leaf;
1388
1389         write_extent_buffer(root->node, root->fs_info->fsid,
1390                             btrfs_header_fsid(), BTRFS_FSID_SIZE);
1391         btrfs_mark_buffer_dirty(root->node);
1392         btrfs_tree_unlock(root->node);
1393         return root;
1394 }
1395
1396 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
1397                              struct btrfs_fs_info *fs_info)
1398 {
1399         struct btrfs_root *log_root;
1400
1401         log_root = alloc_log_tree(trans, fs_info);
1402         if (IS_ERR(log_root))
1403                 return PTR_ERR(log_root);
1404         WARN_ON(fs_info->log_root_tree);
1405         fs_info->log_root_tree = log_root;
1406         return 0;
1407 }
1408
1409 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1410                        struct btrfs_root *root)
1411 {
1412         struct btrfs_root *log_root;
1413         struct btrfs_inode_item *inode_item;
1414
1415         log_root = alloc_log_tree(trans, root->fs_info);
1416         if (IS_ERR(log_root))
1417                 return PTR_ERR(log_root);
1418
1419         log_root->last_trans = trans->transid;
1420         log_root->root_key.offset = root->root_key.objectid;
1421
1422         inode_item = &log_root->root_item.inode;
1423         btrfs_set_stack_inode_generation(inode_item, 1);
1424         btrfs_set_stack_inode_size(inode_item, 3);
1425         btrfs_set_stack_inode_nlink(inode_item, 1);
1426         btrfs_set_stack_inode_nbytes(inode_item, root->leafsize);
1427         btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
1428
1429         btrfs_set_root_node(&log_root->root_item, log_root->node);
1430
1431         WARN_ON(root->log_root);
1432         root->log_root = log_root;
1433         root->log_transid = 0;
1434         root->last_log_commit = 0;
1435         return 0;
1436 }
1437
1438 static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1439                                                struct btrfs_key *key)
1440 {
1441         struct btrfs_root *root;
1442         struct btrfs_fs_info *fs_info = tree_root->fs_info;
1443         struct btrfs_path *path;
1444         u64 generation;
1445         u32 blocksize;
1446         int ret;
1447
1448         path = btrfs_alloc_path();
1449         if (!path)
1450                 return ERR_PTR(-ENOMEM);
1451
1452         root = btrfs_alloc_root(fs_info);
1453         if (!root) {
1454                 ret = -ENOMEM;
1455                 goto alloc_fail;
1456         }
1457
1458         __setup_root(tree_root->nodesize, tree_root->leafsize,
1459                      tree_root->sectorsize, tree_root->stripesize,
1460                      root, fs_info, key->objectid);
1461
1462         ret = btrfs_find_root(tree_root, key, path,
1463                               &root->root_item, &root->root_key);
1464         if (ret) {
1465                 if (ret > 0)
1466                         ret = -ENOENT;
1467                 goto find_fail;
1468         }
1469
1470         generation = btrfs_root_generation(&root->root_item);
1471         blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1472         root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1473                                      blocksize, generation);
1474         if (!root->node) {
1475                 ret = -ENOMEM;
1476                 goto find_fail;
1477         } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
1478                 ret = -EIO;
1479                 goto read_fail;
1480         }
1481         root->commit_root = btrfs_root_node(root);
1482 out:
1483         btrfs_free_path(path);
1484         return root;
1485
1486 read_fail:
1487         free_extent_buffer(root->node);
1488 find_fail:
1489         kfree(root);
1490 alloc_fail:
1491         root = ERR_PTR(ret);
1492         goto out;
1493 }
1494
1495 struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
1496                                       struct btrfs_key *location)
1497 {
1498         struct btrfs_root *root;
1499
1500         root = btrfs_read_tree_root(tree_root, location);
1501         if (IS_ERR(root))
1502                 return root;
1503
1504         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
1505                 root->ref_cows = 1;
1506                 btrfs_check_and_init_root_item(&root->root_item);
1507         }
1508
1509         return root;
1510 }
1511
1512 int btrfs_init_fs_root(struct btrfs_root *root)
1513 {
1514         int ret;
1515
1516         root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1517         root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
1518                                         GFP_NOFS);
1519         if (!root->free_ino_pinned || !root->free_ino_ctl) {
1520                 ret = -ENOMEM;
1521                 goto fail;
1522         }
1523
1524         btrfs_init_free_ino_ctl(root);
1525         mutex_init(&root->fs_commit_mutex);
1526         spin_lock_init(&root->cache_lock);
1527         init_waitqueue_head(&root->cache_wait);
1528
1529         ret = get_anon_bdev(&root->anon_dev);
1530         if (ret)
1531                 goto fail;
1532         return 0;
1533 fail:
1534         kfree(root->free_ino_ctl);
1535         kfree(root->free_ino_pinned);
1536         return ret;
1537 }
1538
1539 static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1540                                                u64 root_id)
1541 {
1542         struct btrfs_root *root;
1543
1544         spin_lock(&fs_info->fs_roots_radix_lock);
1545         root = radix_tree_lookup(&fs_info->fs_roots_radix,
1546                                  (unsigned long)root_id);
1547         spin_unlock(&fs_info->fs_roots_radix_lock);
1548         return root;
1549 }
1550
1551 int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1552                          struct btrfs_root *root)
1553 {
1554         int ret;
1555
1556         ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1557         if (ret)
1558                 return ret;
1559
1560         spin_lock(&fs_info->fs_roots_radix_lock);
1561         ret = radix_tree_insert(&fs_info->fs_roots_radix,
1562                                 (unsigned long)root->root_key.objectid,
1563                                 root);
1564         if (ret == 0)
1565                 root->in_radix = 1;
1566         spin_unlock(&fs_info->fs_roots_radix_lock);
1567         radix_tree_preload_end();
1568
1569         return ret;
1570 }
1571
1572 struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
1573                                      struct btrfs_key *location,
1574                                      bool check_ref)
1575 {
1576         struct btrfs_root *root;
1577         int ret;
1578
1579         if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
1580                 return fs_info->tree_root;
1581         if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
1582                 return fs_info->extent_root;
1583         if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
1584                 return fs_info->chunk_root;
1585         if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
1586                 return fs_info->dev_root;
1587         if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
1588                 return fs_info->csum_root;
1589         if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
1590                 return fs_info->quota_root ? fs_info->quota_root :
1591                                              ERR_PTR(-ENOENT);
1592         if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
1593                 return fs_info->uuid_root ? fs_info->uuid_root :
1594                                             ERR_PTR(-ENOENT);
1595 again:
1596         root = btrfs_lookup_fs_root(fs_info, location->objectid);
1597         if (root) {
1598                 if (check_ref && btrfs_root_refs(&root->root_item) == 0)
1599                         return ERR_PTR(-ENOENT);
1600                 return root;
1601         }
1602
1603         root = btrfs_read_fs_root(fs_info->tree_root, location);
1604         if (IS_ERR(root))
1605                 return root;
1606
1607         if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1608                 ret = -ENOENT;
1609                 goto fail;
1610         }
1611
1612         ret = btrfs_init_fs_root(root);
1613         if (ret)
1614                 goto fail;
1615
1616         ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1617         if (ret < 0)
1618                 goto fail;
1619         if (ret == 0)
1620                 root->orphan_item_inserted = 1;
1621
1622         ret = btrfs_insert_fs_root(fs_info, root);
1623         if (ret) {
1624                 if (ret == -EEXIST) {
1625                         free_fs_root(root);
1626                         goto again;
1627                 }
1628                 goto fail;
1629         }
1630         return root;
1631 fail:
1632         free_fs_root(root);
1633         return ERR_PTR(ret);
1634 }
1635
1636 static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1637 {
1638         struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1639         int ret = 0;
1640         struct btrfs_device *device;
1641         struct backing_dev_info *bdi;
1642
1643         rcu_read_lock();
1644         list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
1645                 if (!device->bdev)
1646                         continue;
1647                 bdi = blk_get_backing_dev_info(device->bdev);
1648                 if (bdi && bdi_congested(bdi, bdi_bits)) {
1649                         ret = 1;
1650                         break;
1651                 }
1652         }
1653         rcu_read_unlock();
1654         return ret;
1655 }
1656
1657 /*
1658  * If this fails, caller must call bdi_destroy() to get rid of the
1659  * bdi again.
1660  */
1661 static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1662 {
1663         int err;
1664
1665         bdi->capabilities = BDI_CAP_MAP_COPY;
1666         err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
1667         if (err)
1668                 return err;
1669
1670         bdi->ra_pages   = default_backing_dev_info.ra_pages;
1671         bdi->congested_fn       = btrfs_congested_fn;
1672         bdi->congested_data     = info;
1673         return 0;
1674 }
1675
1676 /*
1677  * called by the kthread helper functions to finally call the bio end_io
1678  * functions.  This is where read checksum verification actually happens
1679  */
1680 static void end_workqueue_fn(struct btrfs_work *work)
1681 {
1682         struct bio *bio;
1683         struct end_io_wq *end_io_wq;
1684         struct btrfs_fs_info *fs_info;
1685         int error;
1686
1687         end_io_wq = container_of(work, struct end_io_wq, work);
1688         bio = end_io_wq->bio;
1689         fs_info = end_io_wq->info;
1690
1691         error = end_io_wq->error;
1692         bio->bi_private = end_io_wq->private;
1693         bio->bi_end_io = end_io_wq->end_io;
1694         kfree(end_io_wq);
1695         bio_endio_nodec(bio, error);
1696 }
1697
1698 static int cleaner_kthread(void *arg)
1699 {
1700         struct btrfs_root *root = arg;
1701         int again;
1702
1703         do {
1704                 again = 0;
1705
1706                 /* Make the cleaner go to sleep early. */
1707                 if (btrfs_need_cleaner_sleep(root))
1708                         goto sleep;
1709
1710                 if (!mutex_trylock(&root->fs_info->cleaner_mutex))
1711                         goto sleep;
1712
1713                 /*
1714                  * Avoid the problem that we change the status of the fs
1715                  * during the above check and trylock.
1716                  */
1717                 if (btrfs_need_cleaner_sleep(root)) {
1718                         mutex_unlock(&root->fs_info->cleaner_mutex);
1719                         goto sleep;
1720                 }
1721
1722                 btrfs_run_delayed_iputs(root);
1723                 again = btrfs_clean_one_deleted_snapshot(root);
1724                 mutex_unlock(&root->fs_info->cleaner_mutex);
1725
1726                 /*
1727                  * The defragger has dealt with the R/O remount and umount,
1728                  * needn't do anything special here.
1729                  */
1730                 btrfs_run_defrag_inodes(root->fs_info);
1731 sleep:
1732                 if (!try_to_freeze() && !again) {
1733                         set_current_state(TASK_INTERRUPTIBLE);
1734                         if (!kthread_should_stop())
1735                                 schedule();
1736                         __set_current_state(TASK_RUNNING);
1737                 }
1738         } while (!kthread_should_stop());
1739         return 0;
1740 }
1741
1742 static int transaction_kthread(void *arg)
1743 {
1744         struct btrfs_root *root = arg;
1745         struct btrfs_trans_handle *trans;
1746         struct btrfs_transaction *cur;
1747         u64 transid;
1748         unsigned long now;
1749         unsigned long delay;
1750         bool cannot_commit;
1751
1752         do {
1753                 cannot_commit = false;
1754                 delay = HZ * root->fs_info->commit_interval;
1755                 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1756
1757                 spin_lock(&root->fs_info->trans_lock);
1758                 cur = root->fs_info->running_transaction;
1759                 if (!cur) {
1760                         spin_unlock(&root->fs_info->trans_lock);
1761                         goto sleep;
1762                 }
1763
1764                 now = get_seconds();
1765                 if (cur->state < TRANS_STATE_BLOCKED &&
1766                     (now < cur->start_time ||
1767                      now - cur->start_time < root->fs_info->commit_interval)) {
1768                         spin_unlock(&root->fs_info->trans_lock);
1769                         delay = HZ * 5;
1770                         goto sleep;
1771                 }
1772                 transid = cur->transid;
1773                 spin_unlock(&root->fs_info->trans_lock);
1774
1775                 /* If the file system is aborted, this will always fail. */
1776                 trans = btrfs_attach_transaction(root);
1777                 if (IS_ERR(trans)) {
1778                         if (PTR_ERR(trans) != -ENOENT)
1779                                 cannot_commit = true;
1780                         goto sleep;
1781                 }
1782                 if (transid == trans->transid) {
1783                         btrfs_commit_transaction(trans, root);
1784                 } else {
1785                         btrfs_end_transaction(trans, root);
1786                 }
1787 sleep:
1788                 wake_up_process(root->fs_info->cleaner_kthread);
1789                 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
1790
1791                 if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
1792                                       &root->fs_info->fs_state)))
1793                         btrfs_cleanup_transaction(root);
1794                 if (!try_to_freeze()) {
1795                         set_current_state(TASK_INTERRUPTIBLE);
1796                         if (!kthread_should_stop() &&
1797                             (!btrfs_transaction_blocked(root->fs_info) ||
1798                              cannot_commit))
1799                                 schedule_timeout(delay);
1800                         __set_current_state(TASK_RUNNING);
1801                 }
1802         } while (!kthread_should_stop());
1803         return 0;
1804 }
1805
1806 /*
1807  * this will find the highest generation in the array of
1808  * root backups.  The index of the highest array is returned,
1809  * or -1 if we can't find anything.
1810  *
1811  * We check to make sure the array is valid by comparing the
1812  * generation of the latest  root in the array with the generation
1813  * in the super block.  If they don't match we pitch it.
1814  */
1815 static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
1816 {
1817         u64 cur;
1818         int newest_index = -1;
1819         struct btrfs_root_backup *root_backup;
1820         int i;
1821
1822         for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
1823                 root_backup = info->super_copy->super_roots + i;
1824                 cur = btrfs_backup_tree_root_gen(root_backup);
1825                 if (cur == newest_gen)
1826                         newest_index = i;
1827         }
1828
1829         /* check to see if we actually wrapped around */
1830         if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
1831                 root_backup = info->super_copy->super_roots;
1832                 cur = btrfs_backup_tree_root_gen(root_backup);
1833                 if (cur == newest_gen)
1834                         newest_index = 0;
1835         }
1836         return newest_index;
1837 }
1838
1839
1840 /*
1841  * find the oldest backup so we know where to store new entries
1842  * in the backup array.  This will set the backup_root_index
1843  * field in the fs_info struct
1844  */
1845 static void find_oldest_super_backup(struct btrfs_fs_info *info,
1846                                      u64 newest_gen)
1847 {
1848         int newest_index = -1;
1849
1850         newest_index = find_newest_super_backup(info, newest_gen);
1851         /* if there was garbage in there, just move along */
1852         if (newest_index == -1) {
1853                 info->backup_root_index = 0;
1854         } else {
1855                 info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
1856         }
1857 }
1858
1859 /*
1860  * copy all the root pointers into the super backup array.
1861  * this will bump the backup pointer by one when it is
1862  * done
1863  */
1864 static void backup_super_roots(struct btrfs_fs_info *info)
1865 {
1866         int next_backup;
1867         struct btrfs_root_backup *root_backup;
1868         int last_backup;
1869
1870         next_backup = info->backup_root_index;
1871         last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
1872                 BTRFS_NUM_BACKUP_ROOTS;
1873
1874         /*
1875          * just overwrite the last backup if we're at the same generation
1876          * this happens only at umount
1877          */
1878         root_backup = info->super_for_commit->super_roots + last_backup;
1879         if (btrfs_backup_tree_root_gen(root_backup) ==
1880             btrfs_header_generation(info->tree_root->node))
1881                 next_backup = last_backup;
1882
1883         root_backup = info->super_for_commit->super_roots + next_backup;
1884
1885         /*
1886          * make sure all of our padding and empty slots get zero filled
1887          * regardless of which ones we use today
1888          */
1889         memset(root_backup, 0, sizeof(*root_backup));
1890
1891         info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
1892
1893         btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
1894         btrfs_set_backup_tree_root_gen(root_backup,
1895                                btrfs_header_generation(info->tree_root->node));
1896
1897         btrfs_set_backup_tree_root_level(root_backup,
1898                                btrfs_header_level(info->tree_root->node));
1899
1900         btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
1901         btrfs_set_backup_chunk_root_gen(root_backup,
1902                                btrfs_header_generation(info->chunk_root->node));
1903         btrfs_set_backup_chunk_root_level(root_backup,
1904                                btrfs_header_level(info->chunk_root->node));
1905
1906         btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
1907         btrfs_set_backup_extent_root_gen(root_backup,
1908                                btrfs_header_generation(info->extent_root->node));
1909         btrfs_set_backup_extent_root_level(root_backup,
1910                                btrfs_header_level(info->extent_root->node));
1911
1912         /*
1913          * we might commit during log recovery, which happens before we set
1914          * the fs_root.  Make sure it is valid before we fill it in.
1915          */
1916         if (info->fs_root && info->fs_root->node) {
1917                 btrfs_set_backup_fs_root(root_backup,
1918                                          info->fs_root->node->start);
1919                 btrfs_set_backup_fs_root_gen(root_backup,
1920                                btrfs_header_generation(info->fs_root->node));
1921                 btrfs_set_backup_fs_root_level(root_backup,
1922                                btrfs_header_level(info->fs_root->node));
1923         }
1924
1925         btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
1926         btrfs_set_backup_dev_root_gen(root_backup,
1927                                btrfs_header_generation(info->dev_root->node));
1928         btrfs_set_backup_dev_root_level(root_backup,
1929                                        btrfs_header_level(info->dev_root->node));
1930
1931         btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
1932         btrfs_set_backup_csum_root_gen(root_backup,
1933                                btrfs_header_generation(info->csum_root->node));
1934         btrfs_set_backup_csum_root_level(root_backup,
1935                                btrfs_header_level(info->csum_root->node));
1936
1937         btrfs_set_backup_total_bytes(root_backup,
1938                              btrfs_super_total_bytes(info->super_copy));
1939         btrfs_set_backup_bytes_used(root_backup,
1940                              btrfs_super_bytes_used(info->super_copy));
1941         btrfs_set_backup_num_devices(root_backup,
1942                              btrfs_super_num_devices(info->super_copy));
1943
1944         /*
1945          * if we don't copy this out to the super_copy, it won't get remembered
1946          * for the next commit
1947          */
1948         memcpy(&info->super_copy->super_roots,
1949                &info->super_for_commit->super_roots,
1950                sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
1951 }
1952
1953 /*
1954  * this copies info out of the root backup array and back into
1955  * the in-memory super block.  It is meant to help iterate through
1956  * the array, so you send it the number of backups you've already
1957  * tried and the last backup index you used.
1958  *
1959  * this returns -1 when it has tried all the backups
1960  */
1961 static noinline int next_root_backup(struct btrfs_fs_info *info,
1962                                      struct btrfs_super_block *super,
1963                                      int *num_backups_tried, int *backup_index)
1964 {
1965         struct btrfs_root_backup *root_backup;
1966         int newest = *backup_index;
1967
1968         if (*num_backups_tried == 0) {
1969                 u64 gen = btrfs_super_generation(super);
1970
1971                 newest = find_newest_super_backup(info, gen);
1972                 if (newest == -1)
1973                         return -1;
1974
1975                 *backup_index = newest;
1976                 *num_backups_tried = 1;
1977         } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
1978                 /* we've tried all the backups, all done */
1979                 return -1;
1980         } else {
1981                 /* jump to the next oldest backup */
1982                 newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
1983                         BTRFS_NUM_BACKUP_ROOTS;
1984                 *backup_index = newest;
1985                 *num_backups_tried += 1;
1986         }
1987         root_backup = super->super_roots + newest;
1988
1989         btrfs_set_super_generation(super,
1990                                    btrfs_backup_tree_root_gen(root_backup));
1991         btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
1992         btrfs_set_super_root_level(super,
1993                                    btrfs_backup_tree_root_level(root_backup));
1994         btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
1995
1996         /*
1997          * fixme: the total bytes and num_devices need to match or we should
1998          * need a fsck
1999          */
2000         btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
2001         btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
2002         return 0;
2003 }
2004
2005 /* helper to cleanup workers */
2006 static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
2007 {
2008         btrfs_stop_workers(&fs_info->generic_worker);
2009         btrfs_stop_workers(&fs_info->fixup_workers);
2010         btrfs_stop_workers(&fs_info->delalloc_workers);
2011         btrfs_stop_workers(&fs_info->workers);
2012         btrfs_stop_workers(&fs_info->endio_workers);
2013         btrfs_stop_workers(&fs_info->endio_meta_workers);
2014         btrfs_stop_workers(&fs_info->endio_raid56_workers);
2015         btrfs_stop_workers(&fs_info->rmw_workers);
2016         btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2017         btrfs_stop_workers(&fs_info->endio_write_workers);
2018         btrfs_stop_workers(&fs_info->endio_freespace_worker);
2019         btrfs_stop_workers(&fs_info->submit_workers);
2020         btrfs_stop_workers(&fs_info->delayed_workers);
2021         btrfs_stop_workers(&fs_info->caching_workers);
2022         btrfs_stop_workers(&fs_info->readahead_workers);
2023         btrfs_stop_workers(&fs_info->flush_workers);
2024         btrfs_stop_workers(&fs_info->qgroup_rescan_workers);
2025 }
2026
2027 static void free_root_extent_buffers(struct btrfs_root *root)
2028 {
2029         if (root) {
2030                 free_extent_buffer(root->node);
2031                 free_extent_buffer(root->commit_root);
2032                 root->node = NULL;
2033                 root->commit_root = NULL;
2034         }
2035 }
2036
2037 /* helper to cleanup tree roots */
2038 static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
2039 {
2040         free_root_extent_buffers(info->tree_root);
2041
2042         free_root_extent_buffers(info->dev_root);
2043         free_root_extent_buffers(info->extent_root);
2044         free_root_extent_buffers(info->csum_root);
2045         free_root_extent_buffers(info->quota_root);
2046         free_root_extent_buffers(info->uuid_root);
2047         if (chunk_root)
2048                 free_root_extent_buffers(info->chunk_root);
2049 }
2050
2051 static void del_fs_roots(struct btrfs_fs_info *fs_info)
2052 {
2053         int ret;
2054         struct btrfs_root *gang[8];
2055         int i;
2056
2057         while (!list_empty(&fs_info->dead_roots)) {
2058                 gang[0] = list_entry(fs_info->dead_roots.next,
2059                                      struct btrfs_root, root_list);
2060                 list_del(&gang[0]->root_list);
2061
2062                 if (gang[0]->in_radix) {
2063                         btrfs_drop_and_free_fs_root(fs_info, gang[0]);
2064                 } else {
2065                         free_extent_buffer(gang[0]->node);
2066                         free_extent_buffer(gang[0]->commit_root);
2067                         btrfs_put_fs_root(gang[0]);
2068                 }
2069         }
2070
2071         while (1) {
2072                 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2073                                              (void **)gang, 0,
2074                                              ARRAY_SIZE(gang));
2075                 if (!ret)
2076                         break;
2077                 for (i = 0; i < ret; i++)
2078                         btrfs_drop_and_free_fs_root(fs_info, gang[i]);
2079         }
2080 }
2081
2082 int open_ctree(struct super_block *sb,
2083                struct btrfs_fs_devices *fs_devices,
2084                char *options)
2085 {
2086         u32 sectorsize;
2087         u32 nodesize;
2088         u32 leafsize;
2089         u32 blocksize;
2090         u32 stripesize;
2091         u64 generation;
2092         u64 features;
2093         struct btrfs_key location;
2094         struct buffer_head *bh;
2095         struct btrfs_super_block *disk_super;
2096         struct btrfs_fs_info *fs_info = btrfs_sb(sb);
2097         struct btrfs_root *tree_root;
2098         struct btrfs_root *extent_root;
2099         struct btrfs_root *csum_root;
2100         struct btrfs_root *chunk_root;
2101         struct btrfs_root *dev_root;
2102         struct btrfs_root *quota_root;
2103         struct btrfs_root *uuid_root;
2104         struct btrfs_root *log_tree_root;
2105         int ret;
2106         int err = -EINVAL;
2107         int num_backups_tried = 0;
2108         int backup_index = 0;
2109         bool create_uuid_tree;
2110         bool check_uuid_tree;
2111
2112         tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
2113         chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
2114         if (!tree_root || !chunk_root) {
2115                 err = -ENOMEM;
2116                 goto fail;
2117         }
2118
2119         ret = init_srcu_struct(&fs_info->subvol_srcu);
2120         if (ret) {
2121                 err = ret;
2122                 goto fail;
2123         }
2124
2125         ret = setup_bdi(fs_info, &fs_info->bdi);
2126         if (ret) {
2127                 err = ret;
2128                 goto fail_srcu;
2129         }
2130
2131         ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
2132         if (ret) {
2133                 err = ret;
2134                 goto fail_bdi;
2135         }
2136         fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
2137                                         (1 + ilog2(nr_cpu_ids));
2138
2139         ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
2140         if (ret) {
2141                 err = ret;
2142                 goto fail_dirty_metadata_bytes;
2143         }
2144
2145         fs_info->btree_inode = new_inode(sb);
2146         if (!fs_info->btree_inode) {
2147                 err = -ENOMEM;
2148                 goto fail_delalloc_bytes;
2149         }
2150
2151         mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
2152
2153         INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
2154         INIT_LIST_HEAD(&fs_info->trans_list);
2155         INIT_LIST_HEAD(&fs_info->dead_roots);
2156         INIT_LIST_HEAD(&fs_info->delayed_iputs);
2157         INIT_LIST_HEAD(&fs_info->delalloc_roots);
2158         INIT_LIST_HEAD(&fs_info->caching_block_groups);
2159         spin_lock_init(&fs_info->delalloc_root_lock);
2160         spin_lock_init(&fs_info->trans_lock);
2161         spin_lock_init(&fs_info->fs_roots_radix_lock);
2162         spin_lock_init(&fs_info->delayed_iput_lock);
2163         spin_lock_init(&fs_info->defrag_inodes_lock);
2164         spin_lock_init(&fs_info->free_chunk_lock);
2165         spin_lock_init(&fs_info->tree_mod_seq_lock);
2166         spin_lock_init(&fs_info->super_lock);
2167         rwlock_init(&fs_info->tree_mod_log_lock);
2168         mutex_init(&fs_info->reloc_mutex);
2169         seqlock_init(&fs_info->profiles_lock);
2170
2171         init_completion(&fs_info->kobj_unregister);
2172         INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
2173         INIT_LIST_HEAD(&fs_info->space_info);
2174         INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
2175         btrfs_mapping_init(&fs_info->mapping_tree);
2176         btrfs_init_block_rsv(&fs_info->global_block_rsv,
2177                              BTRFS_BLOCK_RSV_GLOBAL);
2178         btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
2179                              BTRFS_BLOCK_RSV_DELALLOC);
2180         btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
2181         btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
2182         btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
2183         btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
2184                              BTRFS_BLOCK_RSV_DELOPS);
2185         atomic_set(&fs_info->nr_async_submits, 0);
2186         atomic_set(&fs_info->async_delalloc_pages, 0);
2187         atomic_set(&fs_info->async_submit_draining, 0);
2188         atomic_set(&fs_info->nr_async_bios, 0);
2189         atomic_set(&fs_info->defrag_running, 0);
2190         atomic64_set(&fs_info->tree_mod_seq, 0);
2191         fs_info->sb = sb;
2192         fs_info->max_inline = 8192 * 1024;
2193         fs_info->metadata_ratio = 0;
2194         fs_info->defrag_inodes = RB_ROOT;
2195         fs_info->free_chunk_space = 0;
2196         fs_info->tree_mod_log = RB_ROOT;
2197         fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
2198
2199         /* readahead state */
2200         INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
2201         spin_lock_init(&fs_info->reada_lock);
2202
2203         fs_info->thread_pool_size = min_t(unsigned long,
2204                                           num_online_cpus() + 2, 8);
2205
2206         INIT_LIST_HEAD(&fs_info->ordered_roots);
2207         spin_lock_init(&fs_info->ordered_root_lock);
2208         fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2209                                         GFP_NOFS);
2210         if (!fs_info->delayed_root) {
2211                 err = -ENOMEM;
2212                 goto fail_iput;
2213         }
2214         btrfs_init_delayed_root(fs_info->delayed_root);
2215
2216         mutex_init(&fs_info->scrub_lock);
2217         atomic_set(&fs_info->scrubs_running, 0);
2218         atomic_set(&fs_info->scrub_pause_req, 0);
2219         atomic_set(&fs_info->scrubs_paused, 0);
2220         atomic_set(&fs_info->scrub_cancel_req, 0);
2221         init_waitqueue_head(&fs_info->scrub_pause_wait);
2222         fs_info->scrub_workers_refcnt = 0;
2223 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2224         fs_info->check_integrity_print_mask = 0;
2225 #endif
2226
2227         spin_lock_init(&fs_info->balance_lock);
2228         mutex_init(&fs_info->balance_mutex);
2229         atomic_set(&fs_info->balance_running, 0);
2230         atomic_set(&fs_info->balance_pause_req, 0);
2231         atomic_set(&fs_info->balance_cancel_req, 0);
2232         fs_info->balance_ctl = NULL;
2233         init_waitqueue_head(&fs_info->balance_wait_q);
2234
2235         sb->s_blocksize = 4096;
2236         sb->s_blocksize_bits = blksize_bits(4096);
2237         sb->s_bdi = &fs_info->bdi;
2238
2239         fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
2240         set_nlink(fs_info->btree_inode, 1);
2241         /*
2242          * we set the i_size on the btree inode to the max possible int.
2243          * the real end of the address space is determined by all of
2244          * the devices in the system
2245          */
2246         fs_info->btree_inode->i_size = OFFSET_MAX;
2247         fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
2248         fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
2249
2250         RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
2251         extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
2252                              fs_info->btree_inode->i_mapping);
2253         BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
2254         extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
2255
2256         BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
2257
2258         BTRFS_I(fs_info->btree_inode)->root = tree_root;
2259         memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
2260                sizeof(struct btrfs_key));
2261         set_bit(BTRFS_INODE_DUMMY,
2262                 &BTRFS_I(fs_info->btree_inode)->runtime_flags);
2263         btrfs_insert_inode_hash(fs_info->btree_inode);
2264
2265         spin_lock_init(&fs_info->block_group_cache_lock);
2266         fs_info->block_group_cache_tree = RB_ROOT;
2267         fs_info->first_logical_byte = (u64)-1;
2268
2269         extent_io_tree_init(&fs_info->freed_extents[0],
2270                              fs_info->btree_inode->i_mapping);
2271         extent_io_tree_init(&fs_info->freed_extents[1],
2272                              fs_info->btree_inode->i_mapping);
2273         fs_info->pinned_extents = &fs_info->freed_extents[0];
2274         fs_info->do_barriers = 1;
2275
2276
2277         mutex_init(&fs_info->ordered_operations_mutex);
2278         mutex_init(&fs_info->ordered_extent_flush_mutex);
2279         mutex_init(&fs_info->tree_log_mutex);
2280         mutex_init(&fs_info->chunk_mutex);
2281         mutex_init(&fs_info->transaction_kthread_mutex);
2282         mutex_init(&fs_info->cleaner_mutex);
2283         mutex_init(&fs_info->volume_mutex);
2284         init_rwsem(&fs_info->extent_commit_sem);
2285         init_rwsem(&fs_info->cleanup_work_sem);
2286         init_rwsem(&fs_info->subvol_sem);
2287         sema_init(&fs_info->uuid_tree_rescan_sem, 1);
2288         fs_info->dev_replace.lock_owner = 0;
2289         atomic_set(&fs_info->dev_replace.nesting_level, 0);
2290         mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2291         mutex_init(&fs_info->dev_replace.lock_management_lock);
2292         mutex_init(&fs_info->dev_replace.lock);
2293
2294         spin_lock_init(&fs_info->qgroup_lock);
2295         mutex_init(&fs_info->qgroup_ioctl_lock);
2296         fs_info->qgroup_tree = RB_ROOT;
2297         INIT_LIST_HEAD(&fs_info->dirty_qgroups);
2298         fs_info->qgroup_seq = 1;
2299         fs_info->quota_enabled = 0;
2300         fs_info->pending_quota_state = 0;
2301         fs_info->qgroup_ulist = NULL;
2302         mutex_init(&fs_info->qgroup_rescan_lock);
2303
2304         btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
2305         btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
2306
2307         init_waitqueue_head(&fs_info->transaction_throttle);
2308         init_waitqueue_head(&fs_info->transaction_wait);
2309         init_waitqueue_head(&fs_info->transaction_blocked_wait);
2310         init_waitqueue_head(&fs_info->async_submit_wait);
2311
2312         ret = btrfs_alloc_stripe_hash_table(fs_info);
2313         if (ret) {
2314                 err = ret;
2315                 goto fail_alloc;
2316         }
2317
2318         __setup_root(4096, 4096, 4096, 4096, tree_root,
2319                      fs_info, BTRFS_ROOT_TREE_OBJECTID);
2320
2321         invalidate_bdev(fs_devices->latest_bdev);
2322
2323         /*
2324          * Read super block and check the signature bytes only
2325          */
2326         bh = btrfs_read_dev_super(fs_devices->latest_bdev);
2327         if (!bh) {
2328                 err = -EINVAL;
2329                 goto fail_alloc;
2330         }
2331
2332         /*
2333          * We want to check superblock checksum, the type is stored inside.
2334          * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
2335          */
2336         if (btrfs_check_super_csum(bh->b_data)) {
2337                 printk(KERN_ERR "btrfs: superblock checksum mismatch\n");
2338                 err = -EINVAL;
2339                 goto fail_alloc;
2340         }
2341
2342         /*
2343          * super_copy is zeroed at allocation time and we never touch the
2344          * following bytes up to INFO_SIZE, the checksum is calculated from
2345          * the whole block of INFO_SIZE
2346          */
2347         memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
2348         memcpy(fs_info->super_for_commit, fs_info->super_copy,
2349                sizeof(*fs_info->super_for_commit));
2350         brelse(bh);
2351
2352         memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
2353
2354         ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
2355         if (ret) {
2356                 printk(KERN_ERR "btrfs: superblock contains fatal errors\n");
2357                 err = -EINVAL;
2358                 goto fail_alloc;
2359         }
2360
2361         disk_super = fs_info->super_copy;
2362         if (!btrfs_super_root(disk_super))
2363                 goto fail_alloc;
2364
2365         /* check FS state, whether FS is broken. */
2366         if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
2367                 set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
2368
2369         /*
2370          * run through our array of backup supers and setup
2371          * our ring pointer to the oldest one
2372          */
2373         generation = btrfs_super_generation(disk_super);
2374         find_oldest_super_backup(fs_info, generation);
2375
2376         /*
2377          * In the long term, we'll store the compression type in the super
2378          * block, and it'll be used for per file compression control.
2379          */
2380         fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
2381
2382         ret = btrfs_parse_options(tree_root, options);
2383         if (ret) {
2384                 err = ret;
2385                 goto fail_alloc;
2386         }
2387
2388         features = btrfs_super_incompat_flags(disk_super) &
2389                 ~BTRFS_FEATURE_INCOMPAT_SUPP;
2390         if (features) {
2391                 printk(KERN_ERR "BTRFS: couldn't mount because of "
2392                        "unsupported optional features (%Lx).\n",
2393                        features);
2394                 err = -EINVAL;
2395                 goto fail_alloc;
2396         }
2397
2398         if (btrfs_super_leafsize(disk_super) !=
2399             btrfs_super_nodesize(disk_super)) {
2400                 printk(KERN_ERR "BTRFS: couldn't mount because metadata "
2401                        "blocksizes don't match.  node %d leaf %d\n",
2402                        btrfs_super_nodesize(disk_super),
2403                        btrfs_super_leafsize(disk_super));
2404                 err = -EINVAL;
2405                 goto fail_alloc;
2406         }
2407         if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
2408                 printk(KERN_ERR "BTRFS: couldn't mount because metadata "
2409                        "blocksize (%d) was too large\n",
2410                        btrfs_super_leafsize(disk_super));
2411                 err = -EINVAL;
2412                 goto fail_alloc;
2413         }
2414
2415         features = btrfs_super_incompat_flags(disk_super);
2416         features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
2417         if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
2418                 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
2419
2420         if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
2421                 printk(KERN_ERR "btrfs: has skinny extents\n");
2422
2423         /*
2424          * flag our filesystem as having big metadata blocks if
2425          * they are bigger than the page size
2426          */
2427         if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
2428                 if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
2429                         printk(KERN_INFO "btrfs flagging fs with big metadata feature\n");
2430                 features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
2431         }
2432
2433         nodesize = btrfs_super_nodesize(disk_super);
2434         leafsize = btrfs_super_leafsize(disk_super);
2435         sectorsize = btrfs_super_sectorsize(disk_super);
2436         stripesize = btrfs_super_stripesize(disk_super);
2437         fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
2438         fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
2439
2440         /*
2441          * mixed block groups end up with duplicate but slightly offset
2442          * extent buffers for the same range.  It leads to corruptions
2443          */
2444         if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
2445             (sectorsize != leafsize)) {
2446                 printk(KERN_WARNING "btrfs: unequal leaf/node/sector sizes "
2447                                 "are not allowed for mixed block groups on %s\n",
2448                                 sb->s_id);
2449                 goto fail_alloc;
2450         }
2451
2452         /*
2453          * Needn't use the lock because there is no other task which will
2454          * update the flag.
2455          */
2456         btrfs_set_super_incompat_flags(disk_super, features);
2457
2458         features = btrfs_super_compat_ro_flags(disk_super) &
2459                 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
2460         if (!(sb->s_flags & MS_RDONLY) && features) {
2461                 printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
2462                        "unsupported option features (%Lx).\n",
2463                        features);
2464                 err = -EINVAL;
2465                 goto fail_alloc;
2466         }
2467
2468         btrfs_init_workers(&fs_info->generic_worker,
2469                            "genwork", 1, NULL);
2470
2471         btrfs_init_workers(&fs_info->workers, "worker",
2472                            fs_info->thread_pool_size,
2473                            &fs_info->generic_worker);
2474
2475         btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
2476                            fs_info->thread_pool_size, NULL);
2477
2478         btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
2479                            fs_info->thread_pool_size, NULL);
2480
2481         btrfs_init_workers(&fs_info->submit_workers, "submit",
2482                            min_t(u64, fs_devices->num_devices,
2483                            fs_info->thread_pool_size), NULL);
2484
2485         btrfs_init_workers(&fs_info->caching_workers, "cache",
2486                            fs_info->thread_pool_size, NULL);
2487
2488         /* a higher idle thresh on the submit workers makes it much more
2489          * likely that bios will be send down in a sane order to the
2490          * devices
2491          */
2492         fs_info->submit_workers.idle_thresh = 64;
2493
2494         fs_info->workers.idle_thresh = 16;
2495         fs_info->workers.ordered = 1;
2496
2497         fs_info->delalloc_workers.idle_thresh = 2;
2498         fs_info->delalloc_workers.ordered = 1;
2499
2500         btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
2501                            &fs_info->generic_worker);
2502         btrfs_init_workers(&fs_info->endio_workers, "endio",
2503                            fs_info->thread_pool_size,
2504                            &fs_info->generic_worker);
2505         btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
2506                            fs_info->thread_pool_size,
2507                            &fs_info->generic_worker);
2508         btrfs_init_workers(&fs_info->endio_meta_write_workers,
2509                            "endio-meta-write", fs_info->thread_pool_size,
2510                            &fs_info->generic_worker);
2511         btrfs_init_workers(&fs_info->endio_raid56_workers,
2512                            "endio-raid56", fs_info->thread_pool_size,
2513                            &fs_info->generic_worker);
2514         btrfs_init_workers(&fs_info->rmw_workers,
2515                            "rmw", fs_info->thread_pool_size,
2516                            &fs_info->generic_worker);
2517         btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
2518                            fs_info->thread_pool_size,
2519                            &fs_info->generic_worker);
2520         btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
2521                            1, &fs_info->generic_worker);
2522         btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
2523                            fs_info->thread_pool_size,
2524                            &fs_info->generic_worker);
2525         btrfs_init_workers(&fs_info->readahead_workers, "readahead",
2526                            fs_info->thread_pool_size,
2527                            &fs_info->generic_worker);
2528         btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1,
2529                            &fs_info->generic_worker);
2530
2531         /*
2532          * endios are largely parallel and should have a very
2533          * low idle thresh
2534          */
2535         fs_info->endio_workers.idle_thresh = 4;
2536         fs_info->endio_meta_workers.idle_thresh = 4;
2537         fs_info->endio_raid56_workers.idle_thresh = 4;
2538         fs_info->rmw_workers.idle_thresh = 2;
2539
2540         fs_info->endio_write_workers.idle_thresh = 2;
2541         fs_info->endio_meta_write_workers.idle_thresh = 2;
2542         fs_info->readahead_workers.idle_thresh = 2;
2543
2544         /*
2545          * btrfs_start_workers can really only fail because of ENOMEM so just
2546          * return -ENOMEM if any of these fail.
2547          */
2548         ret = btrfs_start_workers(&fs_info->workers);
2549         ret |= btrfs_start_workers(&fs_info->generic_worker);
2550         ret |= btrfs_start_workers(&fs_info->submit_workers);
2551         ret |= btrfs_start_workers(&fs_info->delalloc_workers);
2552         ret |= btrfs_start_workers(&fs_info->fixup_workers);
2553         ret |= btrfs_start_workers(&fs_info->endio_workers);
2554         ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
2555         ret |= btrfs_start_workers(&fs_info->rmw_workers);
2556         ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
2557         ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
2558         ret |= btrfs_start_workers(&fs_info->endio_write_workers);
2559         ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
2560         ret |= btrfs_start_workers(&fs_info->delayed_workers);
2561         ret |= btrfs_start_workers(&fs_info->caching_workers);
2562         ret |= btrfs_start_workers(&fs_info->readahead_workers);
2563         ret |= btrfs_start_workers(&fs_info->flush_workers);
2564         ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers);
2565         if (ret) {
2566                 err = -ENOMEM;
2567                 goto fail_sb_buffer;
2568         }
2569
2570         fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
2571         fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
2572                                     4 * 1024 * 1024 / PAGE_CACHE_SIZE);
2573
2574         tree_root->nodesize = nodesize;
2575         tree_root->leafsize = leafsize;
2576         tree_root->sectorsize = sectorsize;
2577         tree_root->stripesize = stripesize;
2578
2579         sb->s_blocksize = sectorsize;
2580         sb->s_blocksize_bits = blksize_bits(sectorsize);
2581
2582         if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
2583                 printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
2584                 goto fail_sb_buffer;
2585         }
2586
2587         if (sectorsize != PAGE_SIZE) {
2588                 printk(KERN_WARNING "btrfs: Incompatible sector size(%lu) "
2589                        "found on %s\n", (unsigned long)sectorsize, sb->s_id);
2590                 goto fail_sb_buffer;
2591         }
2592
2593         mutex_lock(&fs_info->chunk_mutex);
2594         ret = btrfs_read_sys_array(tree_root);
2595         mutex_unlock(&fs_info->chunk_mutex);
2596         if (ret) {
2597                 printk(KERN_WARNING "btrfs: failed to read the system "
2598                        "array on %s\n", sb->s_id);
2599                 goto fail_sb_buffer;
2600         }
2601
2602         blocksize = btrfs_level_size(tree_root,
2603                                      btrfs_super_chunk_root_level(disk_super));
2604         generation = btrfs_super_chunk_root_generation(disk_super);
2605
2606         __setup_root(nodesize, leafsize, sectorsize, stripesize,
2607                      chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
2608
2609         chunk_root->node = read_tree_block(chunk_root,
2610                                            btrfs_super_chunk_root(disk_super),
2611                                            blocksize, generation);
2612         if (!chunk_root->node ||
2613             !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
2614                 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
2615                        sb->s_id);
2616                 goto fail_tree_roots;
2617         }
2618         btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
2619         chunk_root->commit_root = btrfs_root_node(chunk_root);
2620
2621         read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
2622            btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE);
2623
2624         ret = btrfs_read_chunk_tree(chunk_root);
2625         if (ret) {
2626                 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
2627                        sb->s_id);
2628                 goto fail_tree_roots;
2629         }
2630
2631         /*
2632          * keep the device that is marked to be the target device for the
2633          * dev_replace procedure
2634          */
2635         btrfs_close_extra_devices(fs_info, fs_devices, 0);
2636
2637         if (!fs_devices->latest_bdev) {
2638                 printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
2639                        sb->s_id);
2640                 goto fail_tree_roots;
2641         }
2642
2643 retry_root_backup:
2644         blocksize = btrfs_level_size(tree_root,
2645                                      btrfs_super_root_level(disk_super));
2646         generation = btrfs_super_generation(disk_super);
2647
2648         tree_root->node = read_tree_block(tree_root,
2649                                           btrfs_super_root(disk_super),
2650                                           blocksize, generation);
2651         if (!tree_root->node ||
2652             !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
2653                 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
2654                        sb->s_id);
2655
2656                 goto recovery_tree_root;
2657         }
2658
2659         btrfs_set_root_node(&tree_root->root_item, tree_root->node);
2660         tree_root->commit_root = btrfs_root_node(tree_root);
2661         btrfs_set_root_refs(&tree_root->root_item, 1);
2662
2663         location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
2664         location.type = BTRFS_ROOT_ITEM_KEY;
2665         location.offset = 0;
2666
2667         extent_root = btrfs_read_tree_root(tree_root, &location);
2668         if (IS_ERR(extent_root)) {
2669                 ret = PTR_ERR(extent_root);
2670                 goto recovery_tree_root;
2671         }
2672         extent_root->track_dirty = 1;
2673         fs_info->extent_root = extent_root;
2674
2675         location.objectid = BTRFS_DEV_TREE_OBJECTID;
2676         dev_root = btrfs_read_tree_root(tree_root, &location);
2677         if (IS_ERR(dev_root)) {
2678                 ret = PTR_ERR(dev_root);
2679                 goto recovery_tree_root;
2680         }
2681         dev_root->track_dirty = 1;
2682         fs_info->dev_root = dev_root;
2683         btrfs_init_devices_late(fs_info);
2684
2685         location.objectid = BTRFS_CSUM_TREE_OBJECTID;
2686         csum_root = btrfs_read_tree_root(tree_root, &location);
2687         if (IS_ERR(csum_root)) {
2688                 ret = PTR_ERR(csum_root);
2689                 goto recovery_tree_root;
2690         }
2691         csum_root->track_dirty = 1;
2692         fs_info->csum_root = csum_root;
2693
2694         location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2695         quota_root = btrfs_read_tree_root(tree_root, &location);
2696         if (!IS_ERR(quota_root)) {
2697                 quota_root->track_dirty = 1;
2698                 fs_info->quota_enabled = 1;
2699                 fs_info->pending_quota_state = 1;
2700                 fs_info->quota_root = quota_root;
2701         }
2702
2703         location.objectid = BTRFS_UUID_TREE_OBJECTID;
2704         uuid_root = btrfs_read_tree_root(tree_root, &location);
2705         if (IS_ERR(uuid_root)) {
2706                 ret = PTR_ERR(uuid_root);
2707                 if (ret != -ENOENT)
2708                         goto recovery_tree_root;
2709                 create_uuid_tree = true;
2710                 check_uuid_tree = false;
2711         } else {
2712                 uuid_root->track_dirty = 1;
2713                 fs_info->uuid_root = uuid_root;
2714                 create_uuid_tree = false;
2715                 check_uuid_tree =
2716                     generation != btrfs_super_uuid_tree_generation(disk_super);
2717         }
2718
2719         fs_info->generation = generation;
2720         fs_info->last_trans_committed = generation;
2721
2722         ret = btrfs_recover_balance(fs_info);
2723         if (ret) {
2724                 printk(KERN_WARNING "btrfs: failed to recover balance\n");
2725                 goto fail_block_groups;
2726         }
2727
2728         ret = btrfs_init_dev_stats(fs_info);
2729         if (ret) {
2730                 printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n",
2731                        ret);
2732                 goto fail_block_groups;
2733         }
2734
2735         ret = btrfs_init_dev_replace(fs_info);
2736         if (ret) {
2737                 pr_err("btrfs: failed to init dev_replace: %d\n", ret);
2738                 goto fail_block_groups;
2739         }
2740
2741         btrfs_close_extra_devices(fs_info, fs_devices, 1);
2742
2743         ret = btrfs_init_space_info(fs_info);
2744         if (ret) {
2745                 printk(KERN_ERR "Failed to initial space info: %d\n", ret);
2746                 goto fail_block_groups;
2747         }
2748
2749         ret = btrfs_read_block_groups(extent_root);
2750         if (ret) {
2751                 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
2752                 goto fail_block_groups;
2753         }
2754         fs_info->num_tolerated_disk_barrier_failures =
2755                 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2756         if (fs_info->fs_devices->missing_devices >
2757              fs_info->num_tolerated_disk_barrier_failures &&
2758             !(sb->s_flags & MS_RDONLY)) {
2759                 printk(KERN_WARNING
2760                        "Btrfs: too many missing devices, writeable mount is not allowed\n");
2761                 goto fail_block_groups;
2762         }
2763
2764         fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
2765                                                "btrfs-cleaner");
2766         if (IS_ERR(fs_info->cleaner_kthread))
2767                 goto fail_block_groups;
2768
2769         fs_info->transaction_kthread = kthread_run(transaction_kthread,
2770                                                    tree_root,
2771                                                    "btrfs-transaction");
2772         if (IS_ERR(fs_info->transaction_kthread))
2773                 goto fail_cleaner;
2774
2775         if (!btrfs_test_opt(tree_root, SSD) &&
2776             !btrfs_test_opt(tree_root, NOSSD) &&
2777             !fs_info->fs_devices->rotating) {
2778                 printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD "
2779                        "mode\n");
2780                 btrfs_set_opt(fs_info->mount_opt, SSD);
2781         }
2782
2783 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2784         if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
2785                 ret = btrfsic_mount(tree_root, fs_devices,
2786                                     btrfs_test_opt(tree_root,
2787                                         CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
2788                                     1 : 0,
2789                                     fs_info->check_integrity_print_mask);
2790                 if (ret)
2791                         printk(KERN_WARNING "btrfs: failed to initialize"
2792                                " integrity check module %s\n", sb->s_id);
2793         }
2794 #endif
2795         ret = btrfs_read_qgroup_config(fs_info);
2796         if (ret)
2797                 goto fail_trans_kthread;
2798
2799         /* do not make disk changes in broken FS */
2800         if (btrfs_super_log_root(disk_super) != 0) {
2801                 u64 bytenr = btrfs_super_log_root(disk_super);
2802
2803                 if (fs_devices->rw_devices == 0) {
2804                         printk(KERN_WARNING "Btrfs log replay required "
2805                                "on RO media\n");
2806                         err = -EIO;
2807                         goto fail_qgroup;
2808                 }
2809                 blocksize =
2810                      btrfs_level_size(tree_root,
2811                                       btrfs_super_log_root_level(disk_super));
2812
2813                 log_tree_root = btrfs_alloc_root(fs_info);
2814                 if (!log_tree_root) {
2815                         err = -ENOMEM;
2816                         goto fail_qgroup;
2817                 }
2818
2819                 __setup_root(nodesize, leafsize, sectorsize, stripesize,
2820                              log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
2821
2822                 log_tree_root->node = read_tree_block(tree_root, bytenr,
2823                                                       blocksize,
2824                                                       generation + 1);
2825                 if (!log_tree_root->node ||
2826                     !extent_buffer_uptodate(log_tree_root->node)) {
2827                         printk(KERN_ERR "btrfs: failed to read log tree\n");
2828                         free_extent_buffer(log_tree_root->node);
2829                         kfree(log_tree_root);
2830                         goto fail_trans_kthread;
2831                 }
2832                 /* returns with log_tree_root freed on success */
2833                 ret = btrfs_recover_log_trees(log_tree_root);
2834                 if (ret) {
2835                         btrfs_error(tree_root->fs_info, ret,
2836                                     "Failed to recover log tree");
2837                         free_extent_buffer(log_tree_root->node);
2838                         kfree(log_tree_root);
2839                         goto fail_trans_kthread;
2840                 }
2841
2842                 if (sb->s_flags & MS_RDONLY) {
2843                         ret = btrfs_commit_super(tree_root);
2844                         if (ret)
2845                                 goto fail_trans_kthread;
2846                 }
2847         }
2848
2849         ret = btrfs_find_orphan_roots(tree_root);
2850         if (ret)
2851                 goto fail_trans_kthread;
2852
2853         if (!(sb->s_flags & MS_RDONLY)) {
2854                 ret = btrfs_cleanup_fs_roots(fs_info);
2855                 if (ret)
2856                         goto fail_trans_kthread;
2857
2858                 ret = btrfs_recover_relocation(tree_root);
2859                 if (ret < 0) {
2860                         printk(KERN_WARNING
2861                                "btrfs: failed to recover relocation\n");
2862                         err = -EINVAL;
2863                         goto fail_qgroup;
2864                 }
2865         }
2866
2867         location.objectid = BTRFS_FS_TREE_OBJECTID;
2868         location.type = BTRFS_ROOT_ITEM_KEY;
2869         location.offset = 0;
2870
2871         fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
2872         if (IS_ERR(fs_info->fs_root)) {
2873                 err = PTR_ERR(fs_info->fs_root);
2874                 goto fail_qgroup;
2875         }
2876
2877         if (sb->s_flags & MS_RDONLY)
2878                 return 0;
2879
2880         down_read(&fs_info->cleanup_work_sem);
2881         if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
2882             (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
2883                 up_read(&fs_info->cleanup_work_sem);
2884                 close_ctree(tree_root);
2885                 return ret;
2886         }
2887         up_read(&fs_info->cleanup_work_sem);
2888
2889         ret = btrfs_resume_balance_async(fs_info);
2890         if (ret) {
2891                 printk(KERN_WARNING "btrfs: failed to resume balance\n");
2892                 close_ctree(tree_root);
2893                 return ret;
2894         }
2895
2896         ret = btrfs_resume_dev_replace_async(fs_info);
2897         if (ret) {
2898                 pr_warn("btrfs: failed to resume dev_replace\n");
2899                 close_ctree(tree_root);
2900                 return ret;
2901         }
2902
2903         btrfs_qgroup_rescan_resume(fs_info);
2904
2905         if (create_uuid_tree) {
2906                 pr_info("btrfs: creating UUID tree\n");
2907                 ret = btrfs_create_uuid_tree(fs_info);
2908                 if (ret) {
2909                         pr_warn("btrfs: failed to create the UUID tree %d\n",
2910                                 ret);
2911                         close_ctree(tree_root);
2912                         return ret;
2913                 }
2914         } else if (check_uuid_tree ||
2915                    btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) {
2916                 pr_info("btrfs: checking UUID tree\n");
2917                 ret = btrfs_check_uuid_tree(fs_info);
2918                 if (ret) {
2919                         pr_warn("btrfs: failed to check the UUID tree %d\n",
2920                                 ret);
2921                         close_ctree(tree_root);
2922                         return ret;
2923                 }
2924         } else {
2925                 fs_info->update_uuid_tree_gen = 1;
2926         }
2927
2928         return 0;
2929
2930 fail_qgroup:
2931         btrfs_free_qgroup_config(fs_info);
2932 fail_trans_kthread:
2933         kthread_stop(fs_info->transaction_kthread);
2934         btrfs_cleanup_transaction(fs_info->tree_root);
2935         del_fs_roots(fs_info);
2936 fail_cleaner:
2937         kthread_stop(fs_info->cleaner_kthread);
2938
2939         /*
2940          * make sure we're done with the btree inode before we stop our
2941          * kthreads
2942          */
2943         filemap_write_and_wait(fs_info->btree_inode->i_mapping);
2944
2945 fail_block_groups:
2946         btrfs_put_block_group_cache(fs_info);
2947         btrfs_free_block_groups(fs_info);
2948
2949 fail_tree_roots:
2950         free_root_pointers(fs_info, 1);
2951         invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2952
2953 fail_sb_buffer:
2954         btrfs_stop_all_workers(fs_info);
2955 fail_alloc:
2956 fail_iput:
2957         btrfs_mapping_tree_free(&fs_info->mapping_tree);
2958
2959         iput(fs_info->btree_inode);
2960 fail_delalloc_bytes:
2961         percpu_counter_destroy(&fs_info->delalloc_bytes);
2962 fail_dirty_metadata_bytes:
2963         percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
2964 fail_bdi:
2965         bdi_destroy(&fs_info->bdi);
2966 fail_srcu:
2967         cleanup_srcu_struct(&fs_info->subvol_srcu);
2968 fail:
2969         btrfs_free_stripe_hash_table(fs_info);
2970         btrfs_close_devices(fs_info->fs_devices);
2971         return err;
2972
2973 recovery_tree_root:
2974         if (!btrfs_test_opt(tree_root, RECOVERY))
2975                 goto fail_tree_roots;
2976
2977         free_root_pointers(fs_info, 0);
2978
2979         /* don't use the log in recovery mode, it won't be valid */
2980         btrfs_set_super_log_root(disk_super, 0);
2981
2982         /* we can't trust the free space cache either */
2983         btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2984
2985         ret = next_root_backup(fs_info, fs_info->super_copy,
2986                                &num_backups_tried, &backup_index);
2987         if (ret == -1)
2988                 goto fail_block_groups;
2989         goto retry_root_backup;
2990 }
2991
2992 static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
2993 {
2994         if (uptodate) {
2995                 set_buffer_uptodate(bh);
2996         } else {
2997                 struct btrfs_device *device = (struct btrfs_device *)
2998                         bh->b_private;
2999
3000                 printk_ratelimited_in_rcu(KERN_WARNING "lost page write due to "
3001                                           "I/O error on %s\n",
3002                                           rcu_str_deref(device->name));
3003                 /* note, we dont' set_buffer_write_io_error because we have
3004                  * our own ways of dealing with the IO errors
3005                  */
3006                 clear_buffer_uptodate(bh);
3007                 btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
3008         }
3009         unlock_buffer(bh);
3010         put_bh(bh);
3011 }
3012
3013 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
3014 {
3015         struct buffer_head *bh;
3016         struct buffer_head *latest = NULL;
3017         struct btrfs_super_block *super;
3018         int i;
3019         u64 transid = 0;
3020         u64 bytenr;
3021
3022         /* we would like to check all the supers, but that would make
3023          * a btrfs mount succeed after a mkfs from a different FS.
3024          * So, we need to add a special mount option to scan for
3025          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
3026          */
3027         for (i = 0; i < 1; i++) {
3028                 bytenr = btrfs_sb_offset(i);
3029                 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
3030                                         i_size_read(bdev->bd_inode))
3031                         break;
3032                 bh = __bread(bdev, bytenr / 4096,
3033                                         BTRFS_SUPER_INFO_SIZE);
3034                 if (!bh)
3035                         continue;
3036
3037                 super = (struct btrfs_super_block *)bh->b_data;
3038                 if (btrfs_super_bytenr(super) != bytenr ||
3039                     btrfs_super_magic(super) != BTRFS_MAGIC) {
3040                         brelse(bh);
3041                         continue;
3042                 }
3043
3044                 if (!latest || btrfs_super_generation(super) > transid) {
3045                         brelse(latest);
3046                         latest = bh;
3047                         transid = btrfs_super_generation(super);
3048                 } else {
3049                         brelse(bh);
3050                 }
3051         }
3052         return latest;
3053 }
3054
3055 /*
3056  * this should be called twice, once with wait == 0 and
3057  * once with wait == 1.  When wait == 0 is done, all the buffer heads
3058  * we write are pinned.
3059  *
3060  * They are released when wait == 1 is done.
3061  * max_mirrors must be the same for both runs, and it indicates how
3062  * many supers on this one device should be written.
3063  *
3064  * max_mirrors == 0 means to write them all.
3065  */
3066 static int write_dev_supers(struct btrfs_device *device,
3067                             struct btrfs_super_block *sb,
3068                             int do_barriers, int wait, int max_mirrors)
3069 {
3070         struct buffer_head *bh;
3071         int i;
3072         int ret;
3073         int errors = 0;
3074         u32 crc;
3075         u64 bytenr;
3076
3077         if (max_mirrors == 0)
3078                 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
3079
3080         for (i = 0; i < max_mirrors; i++) {
3081                 bytenr = btrfs_sb_offset(i);
3082                 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
3083                         break;
3084
3085                 if (wait) {
3086                         bh = __find_get_block(device->bdev, bytenr / 4096,
3087                                               BTRFS_SUPER_INFO_SIZE);
3088                         if (!bh) {
3089                                 errors++;
3090                                 continue;
3091                         }
3092                         wait_on_buffer(bh);
3093                         if (!buffer_uptodate(bh))
3094                                 errors++;
3095
3096                         /* drop our reference */
3097                         brelse(bh);
3098
3099                         /* drop the reference from the wait == 0 run */
3100                         brelse(bh);
3101                         continue;
3102                 } else {
3103                         btrfs_set_super_bytenr(sb, bytenr);
3104
3105                         crc = ~(u32)0;
3106                         crc = btrfs_csum_data((char *)sb +
3107                                               BTRFS_CSUM_SIZE, crc,
3108                                               BTRFS_SUPER_INFO_SIZE -
3109                                               BTRFS_CSUM_SIZE);
3110                         btrfs_csum_final(crc, sb->csum);
3111
3112                         /*
3113                          * one reference for us, and we leave it for the
3114                          * caller
3115                          */
3116                         bh = __getblk(device->bdev, bytenr / 4096,
3117                                       BTRFS_SUPER_INFO_SIZE);
3118                         if (!bh) {
3119                                 printk(KERN_ERR "btrfs: couldn't get super "
3120                                        "buffer head for bytenr %Lu\n", bytenr);
3121                                 errors++;
3122                                 continue;
3123                         }
3124
3125                         memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
3126
3127                         /* one reference for submit_bh */
3128                         get_bh(bh);
3129
3130                         set_buffer_uptodate(bh);
3131                         lock_buffer(bh);
3132                         bh->b_end_io = btrfs_end_buffer_write_sync;
3133                         bh->b_private = device;
3134                 }
3135
3136                 /*
3137                  * we fua the first super.  The others we allow
3138                  * to go down lazy.
3139                  */
3140                 ret = btrfsic_submit_bh(WRITE_FUA, bh);
3141                 if (ret)
3142                         errors++;
3143         }
3144         return errors < i ? 0 : -1;
3145 }
3146
3147 /*
3148  * endio for the write_dev_flush, this will wake anyone waiting
3149  * for the barrier when it is done
3150  */
3151 static void btrfs_end_empty_barrier(struct bio *bio, int err)
3152 {
3153         if (err) {
3154                 if (err == -EOPNOTSUPP)
3155                         set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
3156                 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3157         }
3158         if (bio->bi_private)
3159                 complete(bio->bi_private);
3160         bio_put(bio);
3161 }
3162
3163 /*
3164  * trigger flushes for one the devices.  If you pass wait == 0, the flushes are
3165  * sent down.  With wait == 1, it waits for the previous flush.
3166  *
3167  * any device where the flush fails with eopnotsupp are flagged as not-barrier
3168  * capable
3169  */
3170 static int write_dev_flush(struct btrfs_device *device, int wait)
3171 {
3172         struct bio *bio;
3173         int ret = 0;
3174
3175         if (device->nobarriers)
3176                 return 0;
3177
3178         if (wait) {
3179                 bio = device->flush_bio;
3180                 if (!bio)
3181                         return 0;
3182
3183                 wait_for_completion(&device->flush_wait);
3184
3185                 if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
3186                         printk_in_rcu("btrfs: disabling barriers on dev %s\n",
3187                                       rcu_str_deref(device->name));
3188                         device->nobarriers = 1;
3189                 } else if (!bio_flagged(bio, BIO_UPTODATE)) {
3190                         ret = -EIO;
3191                         btrfs_dev_stat_inc_and_print(device,
3192                                 BTRFS_DEV_STAT_FLUSH_ERRS);
3193                 }
3194
3195                 /* drop the reference from the wait == 0 run */
3196                 bio_put(bio);
3197                 device->flush_bio = NULL;
3198
3199                 return ret;
3200         }
3201
3202         /*
3203          * one reference for us, and we leave it for the
3204          * caller
3205          */
3206         device->flush_bio = NULL;
3207         bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
3208         if (!bio)
3209                 return -ENOMEM;
3210
3211         bio->bi_end_io = btrfs_end_empty_barrier;
3212         bio->bi_bdev = device->bdev;
3213         init_completion(&device->flush_wait);
3214         bio->bi_private = &device->flush_wait;
3215         device->flush_bio = bio;
3216
3217         bio_get(bio);
3218         btrfsic_submit_bio(WRITE_FLUSH, bio);
3219
3220         return 0;
3221 }
3222
3223 /*
3224  * send an empty flush down to each device in parallel,
3225  * then wait for them
3226  */
3227 static int barrier_all_devices(struct btrfs_fs_info *info)
3228 {
3229         struct list_head *head;
3230         struct btrfs_device *dev;
3231         int errors_send = 0;
3232         int errors_wait = 0;
3233         int ret;
3234
3235         /* send down all the barriers */
3236         head = &info->fs_devices->devices;
3237         list_for_each_entry_rcu(dev, head, dev_list) {
3238                 if (!dev->bdev) {
3239                         errors_send++;
3240                         continue;
3241                 }
3242                 if (!dev->in_fs_metadata || !dev->writeable)
3243                         continue;
3244
3245                 ret = write_dev_flush(dev, 0);
3246                 if (ret)
3247                         errors_send++;
3248         }
3249
3250         /* wait for all the barriers */
3251         list_for_each_entry_rcu(dev, head, dev_list) {
3252                 if (!dev->bdev) {
3253                         errors_wait++;
3254                         continue;
3255                 }
3256                 if (!dev->in_fs_metadata || !dev->writeable)
3257                         continue;
3258
3259                 ret = write_dev_flush(dev, 1);
3260                 if (ret)
3261                         errors_wait++;
3262         }
3263         if (errors_send > info->num_tolerated_disk_barrier_failures ||
3264             errors_wait > info->num_tolerated_disk_barrier_failures)
3265                 return -EIO;
3266         return 0;
3267 }
3268
3269 int btrfs_calc_num_tolerated_disk_barrier_failures(
3270         struct btrfs_fs_info *fs_info)
3271 {
3272         struct btrfs_ioctl_space_info space;
3273         struct btrfs_space_info *sinfo;
3274         u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
3275                        BTRFS_BLOCK_GROUP_SYSTEM,
3276                        BTRFS_BLOCK_GROUP_METADATA,
3277                        BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
3278         int num_types = 4;
3279         int i;
3280         int c;
3281         int num_tolerated_disk_barrier_failures =
3282                 (int)fs_info->fs_devices->num_devices;
3283
3284         for (i = 0; i < num_types; i++) {
3285                 struct btrfs_space_info *tmp;
3286
3287                 sinfo = NULL;
3288                 rcu_read_lock();
3289                 list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
3290                         if (tmp->flags == types[i]) {
3291                                 sinfo = tmp;
3292                                 break;
3293                         }
3294                 }
3295                 rcu_read_unlock();
3296
3297                 if (!sinfo)
3298                         continue;
3299
3300                 down_read(&sinfo->groups_sem);
3301                 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
3302                         if (!list_empty(&sinfo->block_groups[c])) {
3303                                 u64 flags;
3304
3305                                 btrfs_get_block_group_info(
3306                                         &sinfo->block_groups[c], &space);
3307                                 if (space.total_bytes == 0 ||
3308                                     space.used_bytes == 0)
3309                                         continue;
3310                                 flags = space.flags;
3311                                 /*
3312                                  * return
3313                                  * 0: if dup, single or RAID0 is configured for
3314                                  *    any of metadata, system or data, else
3315                                  * 1: if RAID5 is configured, or if RAID1 or
3316                                  *    RAID10 is configured and only two mirrors
3317                                  *    are used, else
3318                                  * 2: if RAID6 is configured, else
3319                                  * num_mirrors - 1: if RAID1 or RAID10 is
3320                                  *                  configured and more than
3321                                  *                  2 mirrors are used.
3322                                  */
3323                                 if (num_tolerated_disk_barrier_failures > 0 &&
3324                                     ((flags & (BTRFS_BLOCK_GROUP_DUP |
3325                                                BTRFS_BLOCK_GROUP_RAID0)) ||
3326                                      ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
3327                                       == 0)))
3328                                         num_tolerated_disk_barrier_failures = 0;
3329                                 else if (num_tolerated_disk_barrier_failures > 1) {
3330                                         if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3331                                             BTRFS_BLOCK_GROUP_RAID5 |
3332                                             BTRFS_BLOCK_GROUP_RAID10)) {
3333                                                 num_tolerated_disk_barrier_failures = 1;
3334                                         } else if (flags &
3335                                                    BTRFS_BLOCK_GROUP_RAID6) {
3336                                                 num_tolerated_disk_barrier_failures = 2;
3337                                         }
3338                                 }
3339                         }
3340                 }
3341                 up_read(&sinfo->groups_sem);
3342         }
3343
3344         return num_tolerated_disk_barrier_failures;
3345 }
3346
3347 static int write_all_supers(struct btrfs_root *root, int max_mirrors)
3348 {
3349         struct list_head *head;
3350         struct btrfs_device *dev;
3351         struct btrfs_super_block *sb;
3352         struct btrfs_dev_item *dev_item;
3353         int ret;
3354         int do_barriers;
3355         int max_errors;
3356         int total_errors = 0;
3357         u64 flags;
3358
3359         do_barriers = !btrfs_test_opt(root, NOBARRIER);
3360         backup_super_roots(root->fs_info);
3361
3362         sb = root->fs_info->super_for_commit;
3363         dev_item = &sb->dev_item;
3364
3365         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
3366         head = &root->fs_info->fs_devices->devices;
3367         max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
3368
3369         if (do_barriers) {
3370                 ret = barrier_all_devices(root->fs_info);
3371                 if (ret) {
3372                         mutex_unlock(
3373                                 &root->fs_info->fs_devices->device_list_mutex);
3374                         btrfs_error(root->fs_info, ret,
3375                                     "errors while submitting device barriers.");
3376                         return ret;
3377                 }
3378         }
3379
3380         list_for_each_entry_rcu(dev, head, dev_list) {
3381                 if (!dev->bdev) {
3382                         total_errors++;
3383                         continue;
3384                 }
3385                 if (!dev->in_fs_metadata || !dev->writeable)
3386                         continue;
3387
3388                 btrfs_set_stack_device_generation(dev_item, 0);
3389                 btrfs_set_stack_device_type(dev_item, dev->type);
3390                 btrfs_set_stack_device_id(dev_item, dev->devid);
3391                 btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
3392                 btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
3393                 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
3394                 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
3395                 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
3396                 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
3397                 memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
3398
3399                 flags = btrfs_super_flags(sb);
3400                 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
3401
3402                 ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
3403                 if (ret)
3404                         total_errors++;
3405         }
3406         if (total_errors > max_errors) {
3407                 printk(KERN_ERR "btrfs: %d errors while writing supers\n",
3408                        total_errors);
3409                 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3410
3411                 /* FUA is masked off if unsupported and can't be the reason */
3412                 btrfs_error(root->fs_info, -EIO,
3413                             "%d errors while writing supers", total_errors);
3414                 return -EIO;
3415         }
3416
3417         total_errors = 0;
3418         list_for_each_entry_rcu(dev, head, dev_list) {
3419                 if (!dev->bdev)
3420                         continue;
3421                 if (!dev->in_fs_metadata || !dev->writeable)
3422                         continue;
3423
3424                 ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
3425                 if (ret)
3426                         total_errors++;
3427         }
3428         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3429         if (total_errors > max_errors) {
3430                 btrfs_error(root->fs_info, -EIO,
3431                             "%d errors while writing supers", total_errors);
3432                 return -EIO;
3433         }
3434         return 0;
3435 }
3436
/*
 * Write the super blocks of the filesystem that root belongs to.  The
 * transaction handle is accepted but not used; the work is done by
 * write_all_supers(), whose result is returned unchanged.
 */
int write_ctree_super(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root, int max_mirrors)
{
	int ret;

	ret = write_all_supers(root, max_mirrors);
	return ret;
}
3442
3443 /* Drop a fs root from the radix tree and free it. */
3444 void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
3445                                   struct btrfs_root *root)
3446 {
3447         spin_lock(&fs_info->fs_roots_radix_lock);
3448         radix_tree_delete(&fs_info->fs_roots_radix,
3449                           (unsigned long)root->root_key.objectid);
3450         spin_unlock(&fs_info->fs_roots_radix_lock);
3451
3452         if (btrfs_root_refs(&root->root_item) == 0)
3453                 synchronize_srcu(&fs_info->subvol_srcu);
3454
3455         if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
3456                 btrfs_free_log(NULL, root);
3457                 btrfs_free_log_root_tree(NULL, fs_info);
3458         }
3459
3460         __btrfs_remove_free_space_cache(root->free_ino_pinned);
3461         __btrfs_remove_free_space_cache(root->free_ino_ctl);
3462         free_fs_root(root);
3463 }
3464
3465 static void free_fs_root(struct btrfs_root *root)
3466 {
3467         iput(root->cache_inode);
3468         WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
3469         btrfs_free_block_rsv(root, root->orphan_block_rsv);
3470         root->orphan_block_rsv = NULL;
3471         if (root->anon_dev)
3472                 free_anon_bdev(root->anon_dev);
3473         free_extent_buffer(root->node);
3474         free_extent_buffer(root->commit_root);
3475         kfree(root->free_ino_ctl);
3476         kfree(root->free_ino_pinned);
3477         kfree(root->name);
3478         btrfs_put_fs_root(root);
3479 }
3480
/* Non-static entry point that simply forwards to free_fs_root(). */
void btrfs_free_fs_root(struct btrfs_root *root)
{
	free_fs_root(root);
}
3485
3486 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
3487 {
3488         u64 root_objectid = 0;
3489         struct btrfs_root *gang[8];
3490         int i;
3491         int ret;
3492
3493         while (1) {
3494                 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
3495                                              (void **)gang, root_objectid,
3496                                              ARRAY_SIZE(gang));
3497                 if (!ret)
3498                         break;
3499
3500                 root_objectid = gang[ret - 1]->root_key.objectid + 1;
3501                 for (i = 0; i < ret; i++) {
3502                         int err;
3503
3504                         root_objectid = gang[i]->root_key.objectid;
3505                         err = btrfs_orphan_cleanup(gang[i]);
3506                         if (err)
3507                                 return err;
3508                 }
3509                 root_objectid++;
3510         }
3511         return 0;
3512 }
3513
3514 int btrfs_commit_super(struct btrfs_root *root)
3515 {
3516         struct btrfs_trans_handle *trans;
3517
3518         mutex_lock(&root->fs_info->cleaner_mutex);
3519         btrfs_run_delayed_iputs(root);
3520         mutex_unlock(&root->fs_info->cleaner_mutex);
3521         wake_up_process(root->fs_info->cleaner_kthread);
3522
3523         /* wait until ongoing cleanup work done */
3524         down_write(&root->fs_info->cleanup_work_sem);
3525         up_write(&root->fs_info->cleanup_work_sem);
3526
3527         trans = btrfs_join_transaction(root);
3528         if (IS_ERR(trans))
3529                 return PTR_ERR(trans);
3530         return btrfs_commit_transaction(trans, root);
3531 }
3532
3533 int close_ctree(struct btrfs_root *root)
3534 {
3535         struct btrfs_fs_info *fs_info = root->fs_info;
3536         int ret;
3537
3538         fs_info->closing = 1;
3539         smp_mb();
3540
3541         /* wait for the uuid_scan task to finish */
3542         down(&fs_info->uuid_tree_rescan_sem);
3543         /* avoid complains from lockdep et al., set sem back to initial state */
3544         up(&fs_info->uuid_tree_rescan_sem);
3545
3546         /* pause restriper - we want to resume on mount */
3547         btrfs_pause_balance(fs_info);
3548
3549         btrfs_dev_replace_suspend_for_unmount(fs_info);
3550
3551         btrfs_scrub_cancel(fs_info);
3552
3553         /* wait for any defraggers to finish */
3554         wait_event(fs_info->transaction_wait,
3555                    (atomic_read(&fs_info->defrag_running) == 0));
3556
3557         /* clear out the rbtree of defraggable inodes */
3558         btrfs_cleanup_defrag_inodes(fs_info);
3559
3560         if (!(fs_info->sb->s_flags & MS_RDONLY)) {
3561                 ret = btrfs_commit_super(root);
3562                 if (ret)
3563                         printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
3564         }
3565
3566         if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3567                 btrfs_error_commit_super(root);
3568
3569         btrfs_put_block_group_cache(fs_info);
3570
3571         kthread_stop(fs_info->transaction_kthread);
3572         kthread_stop(fs_info->cleaner_kthread);
3573
3574         fs_info->closing = 2;
3575         smp_mb();
3576
3577         btrfs_free_qgroup_config(root->fs_info);
3578
3579         if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
3580                 printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n",
3581                        percpu_counter_sum(&fs_info->delalloc_bytes));
3582         }
3583
3584         del_fs_roots(fs_info);
3585
3586         btrfs_free_block_groups(fs_info);
3587
3588         btrfs_stop_all_workers(fs_info);
3589
3590         free_root_pointers(fs_info, 1);
3591
3592         iput(fs_info->btree_inode);
3593
3594 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3595         if (btrfs_test_opt(root, CHECK_INTEGRITY))
3596                 btrfsic_unmount(root, fs_info->fs_devices);
3597 #endif
3598
3599         btrfs_close_devices(fs_info->fs_devices);
3600         btrfs_mapping_tree_free(&fs_info->mapping_tree);
3601
3602         percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
3603         percpu_counter_destroy(&fs_info->delalloc_bytes);
3604         bdi_destroy(&fs_info->bdi);
3605         cleanup_srcu_struct(&fs_info->subvol_srcu);
3606
3607         btrfs_free_stripe_hash_table(fs_info);
3608
3609         btrfs_free_block_rsv(root, root->orphan_block_rsv);
3610         root->orphan_block_rsv = NULL;
3611
3612         return 0;
3613 }
3614
3615 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
3616                           int atomic)
3617 {
3618         int ret;
3619         struct inode *btree_inode = buf->pages[0]->mapping->host;
3620
3621         ret = extent_buffer_uptodate(buf);
3622         if (!ret)
3623                 return ret;
3624
3625         ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
3626                                     parent_transid, atomic);
3627         if (ret == -EAGAIN)
3628                 return ret;
3629         return !ret;
3630 }
3631
/*
 * Mark an extent buffer up to date; forwards to
 * set_extent_buffer_uptodate() and returns its result.
 */
int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
{
	return set_extent_buffer_uptodate(buf);
}
3636
3637 void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3638 {
3639         struct btrfs_root *root;
3640         u64 transid = btrfs_header_generation(buf);
3641         int was_dirty;
3642
3643 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
3644         /*
3645          * This is a fast path so only do this check if we have sanity tests
3646          * enabled.  Normal people shouldn't be marking dummy buffers as dirty
3647          * outside of the sanity tests.
3648          */
3649         if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags)))
3650                 return;
3651 #endif
3652         root = BTRFS_I(buf->pages[0]->mapping->host)->root;
3653         btrfs_assert_tree_locked(buf);
3654         if (transid != root->fs_info->generation)
3655                 WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
3656                        "found %llu running %llu\n",
3657                         buf->start, transid, root->fs_info->generation);
3658         was_dirty = set_extent_buffer_dirty(buf);
3659         if (!was_dirty)
3660                 __percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
3661                                      buf->len,
3662                                      root->fs_info->dirty_metadata_batch);
3663 }
3664
3665 static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
3666                                         int flush_delayed)
3667 {
3668         /*
3669          * looks as though older kernels can get into trouble with
3670          * this code, they end up stuck in balance_dirty_pages forever
3671          */
3672         int ret;
3673
3674         if (current->flags & PF_MEMALLOC)
3675                 return;
3676
3677         if (flush_delayed)
3678                 btrfs_balance_delayed_items(root);
3679
3680         ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
3681                                      BTRFS_DIRTY_METADATA_THRESH);
3682         if (ret > 0) {
3683                 balance_dirty_pages_ratelimited(
3684                                    root->fs_info->btree_inode->i_mapping);
3685         }
3686         return;
3687 }
3688
/* Balance dirty btree pages, balancing the delayed items first. */
void btrfs_btree_balance_dirty(struct btrfs_root *root)
{
	__btrfs_btree_balance_dirty(root, 1);
}
3693
/* Balance dirty btree pages without touching the delayed items. */
void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
{
	__btrfs_btree_balance_dirty(root, 0);
}
3698
3699 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
3700 {
3701         struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
3702         return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
3703 }
3704
/*
 * Validate the super block.  No checks are implemented yet, so both
 * arguments are ignored and 0 is always returned.
 */
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
			      int read_only)
{
	/* placeholder: nothing is verified so far */
	return 0;
}
3713
3714 static void btrfs_error_commit_super(struct btrfs_root *root)
3715 {
3716         mutex_lock(&root->fs_info->cleaner_mutex);
3717         btrfs_run_delayed_iputs(root);
3718         mutex_unlock(&root->fs_info->cleaner_mutex);
3719
3720         down_write(&root->fs_info->cleanup_work_sem);
3721         up_write(&root->fs_info->cleanup_work_sem);
3722
3723         /* cleanup FS via transaction */
3724         btrfs_cleanup_transaction(root);
3725 }
3726
3727 static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3728                                              struct btrfs_root *root)
3729 {
3730         struct btrfs_inode *btrfs_inode;
3731         struct list_head splice;
3732
3733         INIT_LIST_HEAD(&splice);
3734
3735         mutex_lock(&root->fs_info->ordered_operations_mutex);
3736         spin_lock(&root->fs_info->ordered_root_lock);
3737
3738         list_splice_init(&t->ordered_operations, &splice);
3739         while (!list_empty(&splice)) {
3740                 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
3741                                          ordered_operations);
3742
3743                 list_del_init(&btrfs_inode->ordered_operations);
3744                 spin_unlock(&root->fs_info->ordered_root_lock);
3745
3746                 btrfs_invalidate_inodes(btrfs_inode->root);
3747
3748                 spin_lock(&root->fs_info->ordered_root_lock);
3749         }
3750
3751         spin_unlock(&root->fs_info->ordered_root_lock);
3752         mutex_unlock(&root->fs_info->ordered_operations_mutex);
3753 }
3754
3755 static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
3756 {
3757         struct btrfs_ordered_extent *ordered;
3758
3759         spin_lock(&root->ordered_extent_lock);
3760         /*
3761          * This will just short circuit the ordered completion stuff which will
3762          * make sure the ordered extent gets properly cleaned up.
3763          */
3764         list_for_each_entry(ordered, &root->ordered_extents,
3765                             root_extent_list)
3766                 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
3767         spin_unlock(&root->ordered_extent_lock);
3768 }
3769
3770 static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
3771 {
3772         struct btrfs_root *root;
3773         struct list_head splice;
3774
3775         INIT_LIST_HEAD(&splice);
3776
3777         spin_lock(&fs_info->ordered_root_lock);
3778         list_splice_init(&fs_info->ordered_roots, &splice);
3779         while (!list_empty(&splice)) {
3780                 root = list_first_entry(&splice, struct btrfs_root,
3781                                         ordered_root);
3782                 list_move_tail(&root->ordered_root,
3783                                &fs_info->ordered_roots);
3784
3785                 btrfs_destroy_ordered_extents(root);
3786
3787                 cond_resched_lock(&fs_info->ordered_root_lock);
3788         }
3789         spin_unlock(&fs_info->ordered_root_lock);
3790 }
3791
3792 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3793                                       struct btrfs_root *root)
3794 {
3795         struct rb_node *node;
3796         struct btrfs_delayed_ref_root *delayed_refs;
3797         struct btrfs_delayed_ref_node *ref;
3798         int ret = 0;
3799
3800         delayed_refs = &trans->delayed_refs;
3801
3802         spin_lock(&delayed_refs->lock);
3803         if (delayed_refs->num_entries == 0) {
3804                 spin_unlock(&delayed_refs->lock);
3805                 printk(KERN_INFO "delayed_refs has NO entry\n");
3806                 return ret;
3807         }
3808
3809         while ((node = rb_first(&delayed_refs->root)) != NULL) {
3810                 struct btrfs_delayed_ref_head *head = NULL;
3811                 bool pin_bytes = false;
3812
3813                 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3814                 atomic_set(&ref->refs, 1);
3815                 if (btrfs_delayed_ref_is_head(ref)) {
3816
3817                         head = btrfs_delayed_node_to_head(ref);
3818                         if (!mutex_trylock(&head->mutex)) {
3819                                 atomic_inc(&ref->refs);
3820                                 spin_unlock(&delayed_refs->lock);
3821
3822                                 /* Need to wait for the delayed ref to run */
3823                                 mutex_lock(&head->mutex);
3824                                 mutex_unlock(&head->mutex);
3825                                 btrfs_put_delayed_ref(ref);
3826
3827                                 spin_lock(&delayed_refs->lock);
3828                                 continue;
3829                         }
3830
3831                         if (head->must_insert_reserved)
3832                                 pin_bytes = true;
3833                         btrfs_free_delayed_extent_op(head->extent_op);
3834                         delayed_refs->num_heads--;
3835                         if (list_empty(&head->cluster))
3836                                 delayed_refs->num_heads_ready--;
3837                         list_del_init(&head->cluster);
3838                 }
3839
3840                 ref->in_tree = 0;
3841                 rb_erase(&ref->rb_node, &delayed_refs->root);
3842                 delayed_refs->num_entries--;
3843                 spin_unlock(&delayed_refs->lock);
3844                 if (head) {
3845                         if (pin_bytes)
3846                                 btrfs_pin_extent(root, ref->bytenr,
3847                                                  ref->num_bytes, 1);
3848                         mutex_unlock(&head->mutex);
3849                 }
3850                 btrfs_put_delayed_ref(ref);
3851
3852                 cond_resched();
3853                 spin_lock(&delayed_refs->lock);
3854         }
3855
3856         spin_unlock(&delayed_refs->lock);
3857
3858         return ret;
3859 }
3860
3861 static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
3862 {
3863         struct btrfs_inode *btrfs_inode;
3864         struct list_head splice;
3865
3866         INIT_LIST_HEAD(&splice);
3867
3868         spin_lock(&root->delalloc_lock);
3869         list_splice_init(&root->delalloc_inodes, &splice);
3870
3871         while (!list_empty(&splice)) {
3872                 btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
3873                                                delalloc_inodes);
3874
3875                 list_del_init(&btrfs_inode->delalloc_inodes);
3876                 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
3877                           &btrfs_inode->runtime_flags);
3878                 spin_unlock(&root->delalloc_lock);
3879
3880                 btrfs_invalidate_inodes(btrfs_inode->root);
3881
3882                 spin_lock(&root->delalloc_lock);
3883         }
3884
3885         spin_unlock(&root->delalloc_lock);
3886 }
3887
3888 static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
3889 {
3890         struct btrfs_root *root;
3891         struct list_head splice;
3892
3893         INIT_LIST_HEAD(&splice);
3894
3895         spin_lock(&fs_info->delalloc_root_lock);
3896         list_splice_init(&fs_info->delalloc_roots, &splice);
3897         while (!list_empty(&splice)) {
3898                 root = list_first_entry(&splice, struct btrfs_root,
3899                                          delalloc_root);
3900                 list_del_init(&root->delalloc_root);
3901                 root = btrfs_grab_fs_root(root);
3902                 BUG_ON(!root);
3903                 spin_unlock(&fs_info->delalloc_root_lock);
3904
3905                 btrfs_destroy_delalloc_inodes(root);
3906                 btrfs_put_fs_root(root);
3907
3908                 spin_lock(&fs_info->delalloc_root_lock);
3909         }
3910         spin_unlock(&fs_info->delalloc_root_lock);
3911 }
3912
3913 static int btrfs_destroy_marked_extents(struct btrfs_root *root,
3914                                         struct extent_io_tree *dirty_pages,
3915                                         int mark)
3916 {
3917         int ret;
3918         struct extent_buffer *eb;
3919         u64 start = 0;
3920         u64 end;
3921
3922         while (1) {
3923                 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
3924                                             mark, NULL);
3925                 if (ret)
3926                         break;
3927
3928                 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
3929                 while (start <= end) {
3930                         eb = btrfs_find_tree_block(root, start,
3931                                                    root->leafsize);
3932                         start += root->leafsize;
3933                         if (!eb)
3934                                 continue;
3935                         wait_on_extent_buffer_writeback(eb);
3936
3937                         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
3938                                                &eb->bflags))
3939                                 clear_extent_buffer_dirty(eb);
3940                         free_extent_buffer_stale(eb);
3941                 }
3942         }
3943
3944         return ret;
3945 }
3946
3947 static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
3948                                        struct extent_io_tree *pinned_extents)
3949 {
3950         struct extent_io_tree *unpin;
3951         u64 start;
3952         u64 end;
3953         int ret;
3954         bool loop = true;
3955
3956         unpin = pinned_extents;
3957 again:
3958         while (1) {
3959                 ret = find_first_extent_bit(unpin, 0, &start, &end,
3960                                             EXTENT_DIRTY, NULL);
3961                 if (ret)
3962                         break;
3963
3964                 /* opt_discard */
3965                 if (btrfs_test_opt(root, DISCARD))
3966                         ret = btrfs_error_discard_extent(root, start,
3967                                                          end + 1 - start,
3968                                                          NULL);
3969
3970                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
3971                 btrfs_error_unpin_extent_range(root, start, end);
3972                 cond_resched();
3973         }
3974
3975         if (loop) {
3976                 if (unpin == &root->fs_info->freed_extents[0])
3977                         unpin = &root->fs_info->freed_extents[1];
3978                 else
3979                         unpin = &root->fs_info->freed_extents[0];
3980                 loop = false;
3981                 goto again;
3982         }
3983
3984         return 0;
3985 }
3986
3987 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
3988                                    struct btrfs_root *root)
3989 {
3990         btrfs_destroy_ordered_operations(cur_trans, root);
3991
3992         btrfs_destroy_delayed_refs(cur_trans, root);
3993
3994         cur_trans->state = TRANS_STATE_COMMIT_START;
3995         wake_up(&root->fs_info->transaction_blocked_wait);
3996
3997         cur_trans->state = TRANS_STATE_UNBLOCKED;
3998         wake_up(&root->fs_info->transaction_wait);
3999
4000         btrfs_destroy_delayed_inodes(root);
4001         btrfs_assert_delayed_root_empty(root);
4002
4003         btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages,
4004                                      EXTENT_DIRTY);
4005         btrfs_destroy_pinned_extent(root,
4006                                     root->fs_info->pinned_extents);
4007
4008         cur_trans->state =TRANS_STATE_COMPLETED;
4009         wake_up(&cur_trans->commit_wait);
4010
4011         /*
4012         memset(cur_trans, 0, sizeof(*cur_trans));
4013         kmem_cache_free(btrfs_transaction_cachep, cur_trans);
4014         */
4015 }
4016
4017 static int btrfs_cleanup_transaction(struct btrfs_root *root)
4018 {
4019         struct btrfs_transaction *t;
4020
4021         mutex_lock(&root->fs_info->transaction_kthread_mutex);
4022
4023         spin_lock(&root->fs_info->trans_lock);
4024         while (!list_empty(&root->fs_info->trans_list)) {
4025                 t = list_first_entry(&root->fs_info->trans_list,
4026                                      struct btrfs_transaction, list);
4027                 if (t->state >= TRANS_STATE_COMMIT_START) {
4028                         atomic_inc(&t->use_count);
4029                         spin_unlock(&root->fs_info->trans_lock);
4030                         btrfs_wait_for_commit(root, t->transid);
4031                         btrfs_put_transaction(t);
4032                         spin_lock(&root->fs_info->trans_lock);
4033                         continue;
4034                 }
4035                 if (t == root->fs_info->running_transaction) {
4036                         t->state = TRANS_STATE_COMMIT_DOING;
4037                         spin_unlock(&root->fs_info->trans_lock);
4038                         /*
4039                          * We wait for 0 num_writers since we don't hold a trans
4040                          * handle open currently for this transaction.
4041                          */
4042                         wait_event(t->writer_wait,
4043                                    atomic_read(&t->num_writers) == 0);
4044                 } else {
4045                         spin_unlock(&root->fs_info->trans_lock);
4046                 }
4047                 btrfs_cleanup_one_transaction(t, root);
4048
4049                 spin_lock(&root->fs_info->trans_lock);
4050                 if (t == root->fs_info->running_transaction)
4051                         root->fs_info->running_transaction = NULL;
4052                 list_del_init(&t->list);
4053                 spin_unlock(&root->fs_info->trans_lock);
4054
4055                 btrfs_put_transaction(t);
4056                 trace_btrfs_transaction_commit(root);
4057                 spin_lock(&root->fs_info->trans_lock);
4058         }
4059         spin_unlock(&root->fs_info->trans_lock);
4060         btrfs_destroy_all_ordered_extents(root->fs_info);
4061         btrfs_destroy_delayed_inodes(root);
4062         btrfs_assert_delayed_root_empty(root);
4063         btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents);
4064         btrfs_destroy_all_delalloc_inodes(root->fs_info);
4065         mutex_unlock(&root->fs_info->transaction_kthread_mutex);
4066
4067         return 0;
4068 }
4069
4070 static struct extent_io_ops btree_extent_io_ops = {
4071         .readpage_end_io_hook = btree_readpage_end_io_hook,
4072         .readpage_io_failed_hook = btree_io_failed_hook,
4073         .submit_bio_hook = btree_submit_bio_hook,
4074         /* note we're sharing with inode.c for the merge bio hook */
4075         .merge_bio_hook = btrfs_merge_bio_hook,
4076 };