]> Pileus Git - ~andy/linux/blob - fs/btrfs/inode.c
Btrfs: join the transaction in __btrfs_setxattr
[~andy/linux] / fs / btrfs / inode.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/buffer_head.h>
22 #include <linux/file.h>
23 #include <linux/fs.h>
24 #include <linux/pagemap.h>
25 #include <linux/highmem.h>
26 #include <linux/time.h>
27 #include <linux/init.h>
28 #include <linux/string.h>
29 #include <linux/smp_lock.h>
30 #include <linux/backing-dev.h>
31 #include <linux/mpage.h>
32 #include <linux/swap.h>
33 #include <linux/writeback.h>
34 #include <linux/statfs.h>
35 #include <linux/compat.h>
36 #include <linux/bit_spinlock.h>
37 #include <linux/xattr.h>
38 #include <linux/posix_acl.h>
39 #include <linux/falloc.h>
40 #include "compat.h"
41 #include "ctree.h"
42 #include "disk-io.h"
43 #include "transaction.h"
44 #include "btrfs_inode.h"
45 #include "ioctl.h"
46 #include "print-tree.h"
47 #include "volumes.h"
48 #include "ordered-data.h"
49 #include "xattr.h"
50 #include "tree-log.h"
51 #include "ref-cache.h"
52 #include "compression.h"
53 #include "locking.h"
54
/*
 * An inode number paired with the btrfs root it is looked up in.
 * NOTE(review): presumably the argument block handed to the inode lookup
 * (iget) helpers later in this file -- confirm against its users.
 */
struct btrfs_iget_args {
        /* inode number */
        u64 ino;
        /* root paired with that inode number */
        struct btrfs_root *root;
};
59
/*
 * Forward declarations of the operation tables used by this file.
 * NOTE(review): their definitions are not in this part of the file;
 * presumably they appear further down.
 */
static struct inode_operations btrfs_dir_inode_operations;
static struct inode_operations btrfs_symlink_inode_operations;
static struct inode_operations btrfs_dir_ro_inode_operations;
static struct inode_operations btrfs_special_inode_operations;
static struct inode_operations btrfs_file_inode_operations;
static struct address_space_operations btrfs_aops;
static struct address_space_operations btrfs_symlink_aops;
static struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;

/*
 * Slab cache pointers.  Only btrfs_inode_cachep is private to this file;
 * the others have external linkage.
 */
static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_bit_radix_cachep;
struct kmem_cache *btrfs_path_cachep;
75
/*
 * Table translating the file-type bits of an inode mode into the
 * matching BTRFS_FT_* directory entry type.  Each slot is addressed by
 * the S_IF* constant shifted right by S_SHIFT; slots without an explicit
 * initializer are zero.
 */
#define S_SHIFT 12
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
        [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
        [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
        [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
        [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
        [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
        [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
        [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
};
86
/*
 * Prototypes for static functions that are referenced before their
 * definitions.  cow_file_range is defined further down in this file;
 * NOTE(review): the other two are presumably defined later as well.
 */
static void btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, int *page_started,
                                   unsigned long *nr_written, int unlock);
93
/*
 * Set up the security related attributes of a new inode: first the ACLs
 * via btrfs_init_acl, then, only if that succeeded, the security xattrs
 * via btrfs_xattr_security_init.
 *
 * Returns the first non-zero result of the two helpers, or 0.
 */
static int btrfs_init_inode_security(struct inode *inode,  struct inode *dir)
{
        int acl_ret = btrfs_init_acl(inode, dir);

        if (acl_ret)
                return acl_ret;

        return btrfs_xattr_security_init(inode, dir);
}
103
104 /*
105  * a very lame attempt at stopping writes when the FS is 85% full.  There
106  * are countless ways this is incorrect, but it is better than nothing.
107  */
108 int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
109                            int for_del)
110 {
111         u64 total;
112         u64 used;
113         u64 thresh;
114         int ret = 0;
115
116         spin_lock(&root->fs_info->delalloc_lock);
117         total = btrfs_super_total_bytes(&root->fs_info->super_copy);
118         used = btrfs_super_bytes_used(&root->fs_info->super_copy);
119         if (for_del)
120                 thresh = total * 90;
121         else
122                 thresh = total * 85;
123
124         do_div(thresh, 100);
125
126         if (used + root->fs_info->delalloc_bytes + num_required > thresh)
127                 ret = -ENOSPC;
128         spin_unlock(&root->fs_info->delalloc_lock);
129         return ret;
130 }
131
132 /*
133  * this does all the hard work for inserting an inline extent into
134  * the btree.  The caller should have done a btrfs_drop_extents so that
135  * no overlapping inline items exist in the btree
136  */
137 static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
138                                 struct btrfs_root *root, struct inode *inode,
139                                 u64 start, size_t size, size_t compressed_size,
140                                 struct page **compressed_pages)
141 {
142         struct btrfs_key key;
143         struct btrfs_path *path;
144         struct extent_buffer *leaf;
145         struct page *page = NULL;
146         char *kaddr;
147         unsigned long ptr;
148         struct btrfs_file_extent_item *ei;
149         int err = 0;
150         int ret;
151         size_t cur_size = size;
152         size_t datasize;
153         unsigned long offset;
154         int use_compress = 0;
155
156         if (compressed_size && compressed_pages) {
157                 use_compress = 1;
158                 cur_size = compressed_size;
159         }
160
161         path = btrfs_alloc_path();
162         if (!path)
163                 return -ENOMEM;
164
165         btrfs_set_trans_block_group(trans, inode);
166
167         key.objectid = inode->i_ino;
168         key.offset = start;
169         btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
170         datasize = btrfs_file_extent_calc_inline_size(cur_size);
171
172         inode_add_bytes(inode, size);
173         ret = btrfs_insert_empty_item(trans, root, path, &key,
174                                       datasize);
175         BUG_ON(ret);
176         if (ret) {
177                 err = ret;
178                 goto fail;
179         }
180         leaf = path->nodes[0];
181         ei = btrfs_item_ptr(leaf, path->slots[0],
182                             struct btrfs_file_extent_item);
183         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
184         btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
185         btrfs_set_file_extent_encryption(leaf, ei, 0);
186         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
187         btrfs_set_file_extent_ram_bytes(leaf, ei, size);
188         ptr = btrfs_file_extent_inline_start(ei);
189
190         if (use_compress) {
191                 struct page *cpage;
192                 int i = 0;
193                 while (compressed_size > 0) {
194                         cpage = compressed_pages[i];
195                         cur_size = min_t(unsigned long, compressed_size,
196                                        PAGE_CACHE_SIZE);
197
198                         kaddr = kmap(cpage);
199                         write_extent_buffer(leaf, kaddr, ptr, cur_size);
200                         kunmap(cpage);
201
202                         i++;
203                         ptr += cur_size;
204                         compressed_size -= cur_size;
205                 }
206                 btrfs_set_file_extent_compression(leaf, ei,
207                                                   BTRFS_COMPRESS_ZLIB);
208         } else {
209                 page = find_get_page(inode->i_mapping,
210                                      start >> PAGE_CACHE_SHIFT);
211                 btrfs_set_file_extent_compression(leaf, ei, 0);
212                 kaddr = kmap_atomic(page, KM_USER0);
213                 offset = start & (PAGE_CACHE_SIZE - 1);
214                 write_extent_buffer(leaf, kaddr + offset, ptr, size);
215                 kunmap_atomic(kaddr, KM_USER0);
216                 page_cache_release(page);
217         }
218         btrfs_mark_buffer_dirty(leaf);
219         btrfs_free_path(path);
220
221         BTRFS_I(inode)->disk_i_size = inode->i_size;
222         btrfs_update_inode(trans, root, inode);
223         return 0;
224 fail:
225         btrfs_free_path(path);
226         return err;
227 }
228
229
230 /*
231  * conditionally insert an inline extent into the file.  This
232  * does the checks required to make sure the data is small enough
233  * to fit as an inline extent.
234  */
235 static int cow_file_range_inline(struct btrfs_trans_handle *trans,
236                                  struct btrfs_root *root,
237                                  struct inode *inode, u64 start, u64 end,
238                                  size_t compressed_size,
239                                  struct page **compressed_pages)
240 {
241         u64 isize = i_size_read(inode);
242         u64 actual_end = min(end + 1, isize);
243         u64 inline_len = actual_end - start;
244         u64 aligned_end = (end + root->sectorsize - 1) &
245                         ~((u64)root->sectorsize - 1);
246         u64 hint_byte;
247         u64 data_len = inline_len;
248         int ret;
249
250         if (compressed_size)
251                 data_len = compressed_size;
252
253         if (start > 0 ||
254             actual_end >= PAGE_CACHE_SIZE ||
255             data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
256             (!compressed_size &&
257             (actual_end & (root->sectorsize - 1)) == 0) ||
258             end + 1 < isize ||
259             data_len > root->fs_info->max_inline) {
260                 return 1;
261         }
262
263         ret = btrfs_drop_extents(trans, root, inode, start,
264                                  aligned_end, start, &hint_byte);
265         BUG_ON(ret);
266
267         if (isize > actual_end)
268                 inline_len = min_t(u64, isize, actual_end);
269         ret = insert_inline_extent(trans, root, inode, start,
270                                    inline_len, compressed_size,
271                                    compressed_pages);
272         BUG_ON(ret);
273         btrfs_drop_extent_cache(inode, start, aligned_end, 0);
274         return 0;
275 }
276
/*
 * One piece of work queued by compress_file_range for
 * submit_compressed_extents.  An entry with pages == NULL means the
 * range is written uncompressed through cow_file_range.
 */
struct async_extent {
        /* file offset where the range begins */
        u64 start;
        /* length of the range in the file, in bytes */
        u64 ram_size;
        /* bytes to reserve on disk for the compressed data, 0 if none */
        u64 compressed_size;
        /* array of pages holding the compressed data, or NULL */
        struct page **pages;
        /* number of entries in pages */
        unsigned long nr_pages;
        /* link in async_cow->extents */
        struct list_head list;
};
285
/*
 * State shared between the compression phase and the submission phase
 * of a delalloc range.
 * NOTE(review): only extents and locked_page are used by the code
 * visible here; the remaining fields are presumably filled in and
 * consumed by the work queue setup elsewhere in the file.
 */
struct async_cow {
        struct inode *inode;
        struct btrfs_root *root;
        /* page handed on to cow_file_range for uncompressed ranges */
        struct page *locked_page;
        u64 start;
        u64 end;
        /* list of struct async_extent, in queueing order */
        struct list_head extents;
        struct btrfs_work work;
};
295
296 static noinline int add_async_extent(struct async_cow *cow,
297                                      u64 start, u64 ram_size,
298                                      u64 compressed_size,
299                                      struct page **pages,
300                                      unsigned long nr_pages)
301 {
302         struct async_extent *async_extent;
303
304         async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
305         async_extent->start = start;
306         async_extent->ram_size = ram_size;
307         async_extent->compressed_size = compressed_size;
308         async_extent->pages = pages;
309         async_extent->nr_pages = nr_pages;
310         list_add_tail(&async_extent->list, &cow->extents);
311         return 0;
312 }
313
314 /*
315  * we create compressed extents in two phases.  The first
316  * phase compresses a range of pages that have already been
317  * locked (both pages and state bits are locked).
318  *
319  * This is done inside an ordered work queue, and the compression
320  * is spread across many cpus.  The actual IO submission is step
321  * two, and the ordered work queue takes care of making sure that
322  * happens in the same order things were put onto the queue by
323  * writepages and friends.
324  *
325  * If this code finds it can't get good compression, it puts an
326  * entry onto the work queue to write the uncompressed bytes.  This
327  * makes sure that both compressed inodes and uncompressed inodes
328  * are written in the same order that pdflush sent them down.
329  */
330 static noinline int compress_file_range(struct inode *inode,
331                                         struct page *locked_page,
332                                         u64 start, u64 end,
333                                         struct async_cow *async_cow,
334                                         int *num_added)
335 {
336         struct btrfs_root *root = BTRFS_I(inode)->root;
337         struct btrfs_trans_handle *trans;
338         u64 num_bytes;
339         u64 orig_start;
340         u64 disk_num_bytes;
341         u64 blocksize = root->sectorsize;
342         u64 actual_end;
343         u64 isize = i_size_read(inode);
344         int ret = 0;
345         struct page **pages = NULL;
346         unsigned long nr_pages;
347         unsigned long nr_pages_ret = 0;
348         unsigned long total_compressed = 0;
349         unsigned long total_in = 0;
350         unsigned long max_compressed = 128 * 1024;
351         unsigned long max_uncompressed = 128 * 1024;
352         int i;
353         int will_compress;
354
355         orig_start = start;
356
357         actual_end = min_t(u64, isize, end + 1);
358 again:
359         will_compress = 0;
360         nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
361         nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
362
363         total_compressed = actual_end - start;
364
365         /* we want to make sure that amount of ram required to uncompress
366          * an extent is reasonable, so we limit the total size in ram
367          * of a compressed extent to 128k.  This is a crucial number
368          * because it also controls how easily we can spread reads across
369          * cpus for decompression.
370          *
371          * We also want to make sure the amount of IO required to do
372          * a random read is reasonably small, so we limit the size of
373          * a compressed extent to 128k.
374          */
375         total_compressed = min(total_compressed, max_uncompressed);
376         num_bytes = (end - start + blocksize) & ~(blocksize - 1);
377         num_bytes = max(blocksize,  num_bytes);
378         disk_num_bytes = num_bytes;
379         total_in = 0;
380         ret = 0;
381
382         /*
383          * we do compression for mount -o compress and when the
384          * inode has not been flagged as nocompress.  This flag can
385          * change at any time if we discover bad compression ratios.
386          */
387         if (!btrfs_test_flag(inode, NOCOMPRESS) &&
388             btrfs_test_opt(root, COMPRESS)) {
389                 WARN_ON(pages);
390                 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
391
392                 ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
393                                                 total_compressed, pages,
394                                                 nr_pages, &nr_pages_ret,
395                                                 &total_in,
396                                                 &total_compressed,
397                                                 max_compressed);
398
399                 if (!ret) {
400                         unsigned long offset = total_compressed &
401                                 (PAGE_CACHE_SIZE - 1);
402                         struct page *page = pages[nr_pages_ret - 1];
403                         char *kaddr;
404
405                         /* zero the tail end of the last page, we might be
406                          * sending it down to disk
407                          */
408                         if (offset) {
409                                 kaddr = kmap_atomic(page, KM_USER0);
410                                 memset(kaddr + offset, 0,
411                                        PAGE_CACHE_SIZE - offset);
412                                 kunmap_atomic(kaddr, KM_USER0);
413                         }
414                         will_compress = 1;
415                 }
416         }
417         if (start == 0) {
418                 trans = btrfs_join_transaction(root, 1);
419                 BUG_ON(!trans);
420                 btrfs_set_trans_block_group(trans, inode);
421
422                 /* lets try to make an inline extent */
423                 if (ret || total_in < (actual_end - start)) {
424                         /* we didn't compress the entire range, try
425                          * to make an uncompressed inline extent.
426                          */
427                         ret = cow_file_range_inline(trans, root, inode,
428                                                     start, end, 0, NULL);
429                 } else {
430                         /* try making a compressed inline extent */
431                         ret = cow_file_range_inline(trans, root, inode,
432                                                     start, end,
433                                                     total_compressed, pages);
434                 }
435                 btrfs_end_transaction(trans, root);
436                 if (ret == 0) {
437                         /*
438                          * inline extent creation worked, we don't need
439                          * to create any more async work items.  Unlock
440                          * and free up our temp pages.
441                          */
442                         extent_clear_unlock_delalloc(inode,
443                                                      &BTRFS_I(inode)->io_tree,
444                                                      start, end, NULL, 1, 0,
445                                                      0, 1, 1, 1);
446                         ret = 0;
447                         goto free_pages_out;
448                 }
449         }
450
451         if (will_compress) {
452                 /*
453                  * we aren't doing an inline extent round the compressed size
454                  * up to a block size boundary so the allocator does sane
455                  * things
456                  */
457                 total_compressed = (total_compressed + blocksize - 1) &
458                         ~(blocksize - 1);
459
460                 /*
461                  * one last check to make sure the compression is really a
462                  * win, compare the page count read with the blocks on disk
463                  */
464                 total_in = (total_in + PAGE_CACHE_SIZE - 1) &
465                         ~(PAGE_CACHE_SIZE - 1);
466                 if (total_compressed >= total_in) {
467                         will_compress = 0;
468                 } else {
469                         disk_num_bytes = total_compressed;
470                         num_bytes = total_in;
471                 }
472         }
473         if (!will_compress && pages) {
474                 /*
475                  * the compression code ran but failed to make things smaller,
476                  * free any pages it allocated and our page pointer array
477                  */
478                 for (i = 0; i < nr_pages_ret; i++) {
479                         WARN_ON(pages[i]->mapping);
480                         page_cache_release(pages[i]);
481                 }
482                 kfree(pages);
483                 pages = NULL;
484                 total_compressed = 0;
485                 nr_pages_ret = 0;
486
487                 /* flag the file so we don't compress in the future */
488                 btrfs_set_flag(inode, NOCOMPRESS);
489         }
490         if (will_compress) {
491                 *num_added += 1;
492
493                 /* the async work queues will take care of doing actual
494                  * allocation on disk for these compressed pages,
495                  * and will submit them to the elevator.
496                  */
497                 add_async_extent(async_cow, start, num_bytes,
498                                  total_compressed, pages, nr_pages_ret);
499
500                 if (start + num_bytes < end && start + num_bytes < actual_end) {
501                         start += num_bytes;
502                         pages = NULL;
503                         cond_resched();
504                         goto again;
505                 }
506         } else {
507                 /*
508                  * No compression, but we still need to write the pages in
509                  * the file we've been given so far.  redirty the locked
510                  * page if it corresponds to our extent and set things up
511                  * for the async work queue to run cow_file_range to do
512                  * the normal delalloc dance
513                  */
514                 if (page_offset(locked_page) >= start &&
515                     page_offset(locked_page) <= end) {
516                         __set_page_dirty_nobuffers(locked_page);
517                         /* unlocked later on in the async handlers */
518                 }
519                 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
520                 *num_added += 1;
521         }
522
523 out:
524         return 0;
525
526 free_pages_out:
527         for (i = 0; i < nr_pages_ret; i++) {
528                 WARN_ON(pages[i]->mapping);
529                 page_cache_release(pages[i]);
530         }
531         kfree(pages);
532
533         goto out;
534 }
535
536 /*
537  * phase two of compressed writeback.  This is the ordered portion
538  * of the code, which only gets called in the order the work was
539  * queued.  We walk all the async extents created by compress_file_range
540  * and send them down to the disk.
541  */
542 static noinline int submit_compressed_extents(struct inode *inode,
543                                               struct async_cow *async_cow)
544 {
545         struct async_extent *async_extent;
546         u64 alloc_hint = 0;
547         struct btrfs_trans_handle *trans;
548         struct btrfs_key ins;
549         struct extent_map *em;
550         struct btrfs_root *root = BTRFS_I(inode)->root;
551         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
552         struct extent_io_tree *io_tree;
553         int ret;
554
555         if (list_empty(&async_cow->extents))
556                 return 0;
557
558         trans = btrfs_join_transaction(root, 1);
559
560         while (!list_empty(&async_cow->extents)) {
561                 async_extent = list_entry(async_cow->extents.next,
562                                           struct async_extent, list);
563                 list_del(&async_extent->list);
564
565                 io_tree = &BTRFS_I(inode)->io_tree;
566
567                 /* did the compression code fall back to uncompressed IO? */
568                 if (!async_extent->pages) {
569                         int page_started = 0;
570                         unsigned long nr_written = 0;
571
572                         lock_extent(io_tree, async_extent->start,
573                                     async_extent->start +
574                                     async_extent->ram_size - 1, GFP_NOFS);
575
576                         /* allocate blocks */
577                         cow_file_range(inode, async_cow->locked_page,
578                                        async_extent->start,
579                                        async_extent->start +
580                                        async_extent->ram_size - 1,
581                                        &page_started, &nr_written, 0);
582
583                         /*
584                          * if page_started, cow_file_range inserted an
585                          * inline extent and took care of all the unlocking
586                          * and IO for us.  Otherwise, we need to submit
587                          * all those pages down to the drive.
588                          */
589                         if (!page_started)
590                                 extent_write_locked_range(io_tree,
591                                                   inode, async_extent->start,
592                                                   async_extent->start +
593                                                   async_extent->ram_size - 1,
594                                                   btrfs_get_extent,
595                                                   WB_SYNC_ALL);
596                         kfree(async_extent);
597                         cond_resched();
598                         continue;
599                 }
600
601                 lock_extent(io_tree, async_extent->start,
602                             async_extent->start + async_extent->ram_size - 1,
603                             GFP_NOFS);
604                 /*
605                  * here we're doing allocation and writeback of the
606                  * compressed pages
607                  */
608                 btrfs_drop_extent_cache(inode, async_extent->start,
609                                         async_extent->start +
610                                         async_extent->ram_size - 1, 0);
611
612                 ret = btrfs_reserve_extent(trans, root,
613                                            async_extent->compressed_size,
614                                            async_extent->compressed_size,
615                                            0, alloc_hint,
616                                            (u64)-1, &ins, 1);
617                 BUG_ON(ret);
618                 em = alloc_extent_map(GFP_NOFS);
619                 em->start = async_extent->start;
620                 em->len = async_extent->ram_size;
621                 em->orig_start = em->start;
622
623                 em->block_start = ins.objectid;
624                 em->block_len = ins.offset;
625                 em->bdev = root->fs_info->fs_devices->latest_bdev;
626                 set_bit(EXTENT_FLAG_PINNED, &em->flags);
627                 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
628
629                 while (1) {
630                         spin_lock(&em_tree->lock);
631                         ret = add_extent_mapping(em_tree, em);
632                         spin_unlock(&em_tree->lock);
633                         if (ret != -EEXIST) {
634                                 free_extent_map(em);
635                                 break;
636                         }
637                         btrfs_drop_extent_cache(inode, async_extent->start,
638                                                 async_extent->start +
639                                                 async_extent->ram_size - 1, 0);
640                 }
641
642                 ret = btrfs_add_ordered_extent(inode, async_extent->start,
643                                                ins.objectid,
644                                                async_extent->ram_size,
645                                                ins.offset,
646                                                BTRFS_ORDERED_COMPRESSED);
647                 BUG_ON(ret);
648
649                 btrfs_end_transaction(trans, root);
650
651                 /*
652                  * clear dirty, set writeback and unlock the pages.
653                  */
654                 extent_clear_unlock_delalloc(inode,
655                                              &BTRFS_I(inode)->io_tree,
656                                              async_extent->start,
657                                              async_extent->start +
658                                              async_extent->ram_size - 1,
659                                              NULL, 1, 1, 0, 1, 1, 0);
660
661                 ret = btrfs_submit_compressed_write(inode,
662                                     async_extent->start,
663                                     async_extent->ram_size,
664                                     ins.objectid,
665                                     ins.offset, async_extent->pages,
666                                     async_extent->nr_pages);
667
668                 BUG_ON(ret);
669                 trans = btrfs_join_transaction(root, 1);
670                 alloc_hint = ins.objectid + ins.offset;
671                 kfree(async_extent);
672                 cond_resched();
673         }
674
675         btrfs_end_transaction(trans, root);
676         return 0;
677 }
678
679 /*
680  * when extent_io.c finds a delayed allocation range in the file,
681  * the call backs end up in this code.  The basic idea is to
682  * allocate extents on disk for the range, and create ordered data structs
683  * in ram to track those extents.
684  *
685  * locked_page is the page that writepage had locked already.  We use
686  * it to make sure we don't do extra locks or unlocks.
687  *
688  * *page_started is set to one if we unlock locked_page and do everything
689  * required to start IO on it.  It may be clean and already done with
690  * IO when we return.
691  */
692 static noinline int cow_file_range(struct inode *inode,
693                                    struct page *locked_page,
694                                    u64 start, u64 end, int *page_started,
695                                    unsigned long *nr_written,
696                                    int unlock)
697 {
698         struct btrfs_root *root = BTRFS_I(inode)->root;
699         struct btrfs_trans_handle *trans;
700         u64 alloc_hint = 0;
701         u64 num_bytes;
702         unsigned long ram_size;
703         u64 disk_num_bytes;
704         u64 cur_alloc_size;
705         u64 blocksize = root->sectorsize;
706         u64 actual_end;
707         u64 isize = i_size_read(inode);
708         struct btrfs_key ins;
709         struct extent_map *em;
710         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
711         int ret = 0;
712
713         trans = btrfs_join_transaction(root, 1);
714         BUG_ON(!trans);
715         btrfs_set_trans_block_group(trans, inode);
716
717         actual_end = min_t(u64, isize, end + 1);
718
719         num_bytes = (end - start + blocksize) & ~(blocksize - 1);
720         num_bytes = max(blocksize,  num_bytes);
721         disk_num_bytes = num_bytes;
722         ret = 0;
723
724         if (start == 0) {
725                 /* lets try to make an inline extent */
726                 ret = cow_file_range_inline(trans, root, inode,
727                                             start, end, 0, NULL);
728                 if (ret == 0) {
729                         extent_clear_unlock_delalloc(inode,
730                                                      &BTRFS_I(inode)->io_tree,
731                                                      start, end, NULL, 1, 1,
732                                                      1, 1, 1, 1);
733                         *nr_written = *nr_written +
734                              (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
735                         *page_started = 1;
736                         ret = 0;
737                         goto out;
738                 }
739         }
740
741         BUG_ON(disk_num_bytes >
742                btrfs_super_total_bytes(&root->fs_info->super_copy));
743
744         btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
745
746         while (disk_num_bytes > 0) {
747                 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
748                 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
749                                            root->sectorsize, 0, alloc_hint,
750                                            (u64)-1, &ins, 1);
751                 BUG_ON(ret);
752
753                 em = alloc_extent_map(GFP_NOFS);
754                 em->start = start;
755                 em->orig_start = em->start;
756
757                 ram_size = ins.offset;
758                 em->len = ins.offset;
759
760                 em->block_start = ins.objectid;
761                 em->block_len = ins.offset;
762                 em->bdev = root->fs_info->fs_devices->latest_bdev;
763                 set_bit(EXTENT_FLAG_PINNED, &em->flags);
764
765                 while (1) {
766                         spin_lock(&em_tree->lock);
767                         ret = add_extent_mapping(em_tree, em);
768                         spin_unlock(&em_tree->lock);
769                         if (ret != -EEXIST) {
770                                 free_extent_map(em);
771                                 break;
772                         }
773                         btrfs_drop_extent_cache(inode, start,
774                                                 start + ram_size - 1, 0);
775                 }
776
777                 cur_alloc_size = ins.offset;
778                 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
779                                                ram_size, cur_alloc_size, 0);
780                 BUG_ON(ret);
781
782                 if (root->root_key.objectid ==
783                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
784                         ret = btrfs_reloc_clone_csums(inode, start,
785                                                       cur_alloc_size);
786                         BUG_ON(ret);
787                 }
788
789                 if (disk_num_bytes < cur_alloc_size)
790                         break;
791
792                 /* we're not doing compressed IO, don't unlock the first
793                  * page (which the caller expects to stay locked), don't
794                  * clear any dirty bits and don't set any writeback bits
795                  */
796                 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
797                                              start, start + ram_size - 1,
798                                              locked_page, unlock, 1,
799                                              1, 0, 0, 0);
800                 disk_num_bytes -= cur_alloc_size;
801                 num_bytes -= cur_alloc_size;
802                 alloc_hint = ins.objectid + ins.offset;
803                 start += cur_alloc_size;
804         }
805 out:
806         ret = 0;
807         btrfs_end_transaction(trans, root);
808
809         return ret;
810 }
811
812 /*
813  * work queue call back to started compression on a file and pages
814  */
815 static noinline void async_cow_start(struct btrfs_work *work)
816 {
817         struct async_cow *async_cow;
818         int num_added = 0;
819         async_cow = container_of(work, struct async_cow, work);
820
821         compress_file_range(async_cow->inode, async_cow->locked_page,
822                             async_cow->start, async_cow->end, async_cow,
823                             &num_added);
824         if (num_added == 0)
825                 async_cow->inode = NULL;
826 }
827
828 /*
829  * work queue call back to submit previously compressed pages
830  */
831 static noinline void async_cow_submit(struct btrfs_work *work)
832 {
833         struct async_cow *async_cow;
834         struct btrfs_root *root;
835         unsigned long nr_pages;
836
837         async_cow = container_of(work, struct async_cow, work);
838
839         root = async_cow->root;
840         nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
841                 PAGE_CACHE_SHIFT;
842
843         atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
844
845         if (atomic_read(&root->fs_info->async_delalloc_pages) <
846             5 * 1042 * 1024 &&
847             waitqueue_active(&root->fs_info->async_submit_wait))
848                 wake_up(&root->fs_info->async_submit_wait);
849
850         if (async_cow->inode)
851                 submit_compressed_extents(async_cow->inode, async_cow);
852 }
853
854 static noinline void async_cow_free(struct btrfs_work *work)
855 {
856         struct async_cow *async_cow;
857         async_cow = container_of(work, struct async_cow, work);
858         kfree(async_cow);
859 }
860
861 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
862                                 u64 start, u64 end, int *page_started,
863                                 unsigned long *nr_written)
864 {
865         struct async_cow *async_cow;
866         struct btrfs_root *root = BTRFS_I(inode)->root;
867         unsigned long nr_pages;
868         u64 cur_end;
869         int limit = 10 * 1024 * 1042;
870
871         if (!btrfs_test_opt(root, COMPRESS)) {
872                 return cow_file_range(inode, locked_page, start, end,
873                                       page_started, nr_written, 1);
874         }
875
876         clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
877                          EXTENT_DELALLOC, 1, 0, GFP_NOFS);
878         while (start < end) {
879                 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
880                 async_cow->inode = inode;
881                 async_cow->root = root;
882                 async_cow->locked_page = locked_page;
883                 async_cow->start = start;
884
885                 if (btrfs_test_flag(inode, NOCOMPRESS))
886                         cur_end = end;
887                 else
888                         cur_end = min(end, start + 512 * 1024 - 1);
889
890                 async_cow->end = cur_end;
891                 INIT_LIST_HEAD(&async_cow->extents);
892
893                 async_cow->work.func = async_cow_start;
894                 async_cow->work.ordered_func = async_cow_submit;
895                 async_cow->work.ordered_free = async_cow_free;
896                 async_cow->work.flags = 0;
897
898                 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
899                         PAGE_CACHE_SHIFT;
900                 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
901
902                 btrfs_queue_worker(&root->fs_info->delalloc_workers,
903                                    &async_cow->work);
904
905                 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
906                         wait_event(root->fs_info->async_submit_wait,
907                            (atomic_read(&root->fs_info->async_delalloc_pages) <
908                             limit));
909                 }
910
911                 while (atomic_read(&root->fs_info->async_submit_draining) &&
912                       atomic_read(&root->fs_info->async_delalloc_pages)) {
913                         wait_event(root->fs_info->async_submit_wait,
914                           (atomic_read(&root->fs_info->async_delalloc_pages) ==
915                            0));
916                 }
917
918                 *nr_written += nr_pages;
919                 start = cur_end + 1;
920         }
921         *page_started = 1;
922         return 0;
923 }
924
925 static noinline int csum_exist_in_range(struct btrfs_root *root,
926                                         u64 bytenr, u64 num_bytes)
927 {
928         int ret;
929         struct btrfs_ordered_sum *sums;
930         LIST_HEAD(list);
931
932         ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
933                                        bytenr + num_bytes - 1, &list);
934         if (ret == 0 && list_empty(&list))
935                 return 0;
936
937         while (!list_empty(&list)) {
938                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
939                 list_del(&sums->list);
940                 kfree(sums);
941         }
942         return 1;
943 }
944
945 /*
946  * when nowcow writeback call back.  This checks for snapshots or COW copies
947  * of the extents that exist in the file, and COWs the file as required.
948  *
949  * If no cow copies or snapshots exist, we write directly to the existing
950  * blocks on disk
951  */
952 static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
953                               u64 start, u64 end, int *page_started, int force,
954                               unsigned long *nr_written)
955 {
956         struct btrfs_root *root = BTRFS_I(inode)->root;
957         struct btrfs_trans_handle *trans;
958         struct extent_buffer *leaf;
959         struct btrfs_path *path;
960         struct btrfs_file_extent_item *fi;
961         struct btrfs_key found_key;
962         u64 cow_start;
963         u64 cur_offset;
964         u64 extent_end;
965         u64 disk_bytenr;
966         u64 num_bytes;
967         int extent_type;
968         int ret;
969         int type;
970         int nocow;
971         int check_prev = 1;
972
973         path = btrfs_alloc_path();
974         BUG_ON(!path);
975         trans = btrfs_join_transaction(root, 1);
976         BUG_ON(!trans);
977
978         cow_start = (u64)-1;
979         cur_offset = start;
980         while (1) {
981                 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
982                                                cur_offset, 0);
983                 BUG_ON(ret < 0);
984                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
985                         leaf = path->nodes[0];
986                         btrfs_item_key_to_cpu(leaf, &found_key,
987                                               path->slots[0] - 1);
988                         if (found_key.objectid == inode->i_ino &&
989                             found_key.type == BTRFS_EXTENT_DATA_KEY)
990                                 path->slots[0]--;
991                 }
992                 check_prev = 0;
993 next_slot:
994                 leaf = path->nodes[0];
995                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
996                         ret = btrfs_next_leaf(root, path);
997                         if (ret < 0)
998                                 BUG_ON(1);
999                         if (ret > 0)
1000                                 break;
1001                         leaf = path->nodes[0];
1002                 }
1003
1004                 nocow = 0;
1005                 disk_bytenr = 0;
1006                 num_bytes = 0;
1007                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1008
1009                 if (found_key.objectid > inode->i_ino ||
1010                     found_key.type > BTRFS_EXTENT_DATA_KEY ||
1011                     found_key.offset > end)
1012                         break;
1013
1014                 if (found_key.offset > cur_offset) {
1015                         extent_end = found_key.offset;
1016                         goto out_check;
1017                 }
1018
1019                 fi = btrfs_item_ptr(leaf, path->slots[0],
1020                                     struct btrfs_file_extent_item);
1021                 extent_type = btrfs_file_extent_type(leaf, fi);
1022
1023                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1024                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1025                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1026                         extent_end = found_key.offset +
1027                                 btrfs_file_extent_num_bytes(leaf, fi);
1028                         if (extent_end <= start) {
1029                                 path->slots[0]++;
1030                                 goto next_slot;
1031                         }
1032                         if (disk_bytenr == 0)
1033                                 goto out_check;
1034                         if (btrfs_file_extent_compression(leaf, fi) ||
1035                             btrfs_file_extent_encryption(leaf, fi) ||
1036                             btrfs_file_extent_other_encoding(leaf, fi))
1037                                 goto out_check;
1038                         if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1039                                 goto out_check;
1040                         if (btrfs_extent_readonly(root, disk_bytenr))
1041                                 goto out_check;
1042                         if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
1043                                                   disk_bytenr))
1044                                 goto out_check;
1045                         disk_bytenr += btrfs_file_extent_offset(leaf, fi);
1046                         disk_bytenr += cur_offset - found_key.offset;
1047                         num_bytes = min(end + 1, extent_end) - cur_offset;
1048                         /*
1049                          * force cow if csum exists in the range.
1050                          * this ensure that csum for a given extent are
1051                          * either valid or do not exist.
1052                          */
1053                         if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1054                                 goto out_check;
1055                         nocow = 1;
1056                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1057                         extent_end = found_key.offset +
1058                                 btrfs_file_extent_inline_len(leaf, fi);
1059                         extent_end = ALIGN(extent_end, root->sectorsize);
1060                 } else {
1061                         BUG_ON(1);
1062                 }
1063 out_check:
1064                 if (extent_end <= start) {
1065                         path->slots[0]++;
1066                         goto next_slot;
1067                 }
1068                 if (!nocow) {
1069                         if (cow_start == (u64)-1)
1070                                 cow_start = cur_offset;
1071                         cur_offset = extent_end;
1072                         if (cur_offset > end)
1073                                 break;
1074                         path->slots[0]++;
1075                         goto next_slot;
1076                 }
1077
1078                 btrfs_release_path(root, path);
1079                 if (cow_start != (u64)-1) {
1080                         ret = cow_file_range(inode, locked_page, cow_start,
1081                                         found_key.offset - 1, page_started,
1082                                         nr_written, 1);
1083                         BUG_ON(ret);
1084                         cow_start = (u64)-1;
1085                 }
1086
1087                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1088                         struct extent_map *em;
1089                         struct extent_map_tree *em_tree;
1090                         em_tree = &BTRFS_I(inode)->extent_tree;
1091                         em = alloc_extent_map(GFP_NOFS);
1092                         em->start = cur_offset;
1093                         em->orig_start = em->start;
1094                         em->len = num_bytes;
1095                         em->block_len = num_bytes;
1096                         em->block_start = disk_bytenr;
1097                         em->bdev = root->fs_info->fs_devices->latest_bdev;
1098                         set_bit(EXTENT_FLAG_PINNED, &em->flags);
1099                         while (1) {
1100                                 spin_lock(&em_tree->lock);
1101                                 ret = add_extent_mapping(em_tree, em);
1102                                 spin_unlock(&em_tree->lock);
1103                                 if (ret != -EEXIST) {
1104                                         free_extent_map(em);
1105                                         break;
1106                                 }
1107                                 btrfs_drop_extent_cache(inode, em->start,
1108                                                 em->start + em->len - 1, 0);
1109                         }
1110                         type = BTRFS_ORDERED_PREALLOC;
1111                 } else {
1112                         type = BTRFS_ORDERED_NOCOW;
1113                 }
1114
1115                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1116                                                num_bytes, num_bytes, type);
1117                 BUG_ON(ret);
1118
1119                 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1120                                         cur_offset, cur_offset + num_bytes - 1,
1121                                         locked_page, 1, 1, 1, 0, 0, 0);
1122                 cur_offset = extent_end;
1123                 if (cur_offset > end)
1124                         break;
1125         }
1126         btrfs_release_path(root, path);
1127
1128         if (cur_offset <= end && cow_start == (u64)-1)
1129                 cow_start = cur_offset;
1130         if (cow_start != (u64)-1) {
1131                 ret = cow_file_range(inode, locked_page, cow_start, end,
1132                                      page_started, nr_written, 1);
1133                 BUG_ON(ret);
1134         }
1135
1136         ret = btrfs_end_transaction(trans, root);
1137         BUG_ON(ret);
1138         btrfs_free_path(path);
1139         return 0;
1140 }
1141
1142 /*
1143  * extent_io.c call back to do delayed allocation processing
1144  */
1145 static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1146                               u64 start, u64 end, int *page_started,
1147                               unsigned long *nr_written)
1148 {
1149         int ret;
1150
1151         if (btrfs_test_flag(inode, NODATACOW))
1152                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1153                                          page_started, 1, nr_written);
1154         else if (btrfs_test_flag(inode, PREALLOC))
1155                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1156                                          page_started, 0, nr_written);
1157         else
1158                 ret = cow_file_range_async(inode, locked_page, start, end,
1159                                            page_started, nr_written);
1160
1161         return ret;
1162 }
1163
1164 /*
1165  * extent_io.c set_bit_hook, used to track delayed allocation
1166  * bytes in this file, and to maintain the list of inodes that
1167  * have pending delalloc work to be done.
1168  */
1169 static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1170                        unsigned long old, unsigned long bits)
1171 {
1172         /*
1173          * set_bit and clear bit hooks normally require _irqsave/restore
1174          * but in this case, we are only testeing for the DELALLOC
1175          * bit, which is only set or cleared with irqs on
1176          */
1177         if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1178                 struct btrfs_root *root = BTRFS_I(inode)->root;
1179                 spin_lock(&root->fs_info->delalloc_lock);
1180                 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
1181                 root->fs_info->delalloc_bytes += end - start + 1;
1182                 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1183                         list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1184                                       &root->fs_info->delalloc_inodes);
1185                 }
1186                 spin_unlock(&root->fs_info->delalloc_lock);
1187         }
1188         return 0;
1189 }
1190
1191 /*
1192  * extent_io.c clear_bit_hook, see set_bit_hook for why
1193  */
1194 static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1195                          unsigned long old, unsigned long bits)
1196 {
1197         /*
1198          * set_bit and clear bit hooks normally require _irqsave/restore
1199          * but in this case, we are only testeing for the DELALLOC
1200          * bit, which is only set or cleared with irqs on
1201          */
1202         if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1203                 struct btrfs_root *root = BTRFS_I(inode)->root;
1204
1205                 spin_lock(&root->fs_info->delalloc_lock);
1206                 if (end - start + 1 > root->fs_info->delalloc_bytes) {
1207                         printk(KERN_INFO "btrfs warning: delalloc account "
1208                                "%llu %llu\n",
1209                                (unsigned long long)end - start + 1,
1210                                (unsigned long long)
1211                                root->fs_info->delalloc_bytes);
1212                         root->fs_info->delalloc_bytes = 0;
1213                         BTRFS_I(inode)->delalloc_bytes = 0;
1214                 } else {
1215                         root->fs_info->delalloc_bytes -= end - start + 1;
1216                         BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
1217                 }
1218                 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1219                     !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1220                         list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1221                 }
1222                 spin_unlock(&root->fs_info->delalloc_lock);
1223         }
1224         return 0;
1225 }
1226
1227 /*
1228  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1229  * we don't create bios that span stripes or chunks
1230  */
1231 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1232                          size_t size, struct bio *bio,
1233                          unsigned long bio_flags)
1234 {
1235         struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1236         struct btrfs_mapping_tree *map_tree;
1237         u64 logical = (u64)bio->bi_sector << 9;
1238         u64 length = 0;
1239         u64 map_length;
1240         int ret;
1241
1242         if (bio_flags & EXTENT_BIO_COMPRESSED)
1243                 return 0;
1244
1245         length = bio->bi_size;
1246         map_tree = &root->fs_info->mapping_tree;
1247         map_length = length;
1248         ret = btrfs_map_block(map_tree, READ, logical,
1249                               &map_length, NULL, 0);
1250
1251         if (map_length < length + size)
1252                 return 1;
1253         return 0;
1254 }
1255
1256 /*
1257  * in order to insert checksums into the metadata in large chunks,
1258  * we wait until bio submission time.   All the pages in the bio are
1259  * checksummed and sums are attached onto the ordered extent record.
1260  *
1261  * At IO completion time the cums attached on the ordered extent record
1262  * are inserted into the btree
1263  */
1264 static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1265                                     struct bio *bio, int mirror_num,
1266                                     unsigned long bio_flags)
1267 {
1268         struct btrfs_root *root = BTRFS_I(inode)->root;
1269         int ret = 0;
1270
1271         ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1272         BUG_ON(ret);
1273         return 0;
1274 }
1275
1276 /*
1277  * in order to insert checksums into the metadata in large chunks,
1278  * we wait until bio submission time.   All the pages in the bio are
1279  * checksummed and sums are attached onto the ordered extent record.
1280  *
1281  * At IO completion time the cums attached on the ordered extent record
1282  * are inserted into the btree
1283  */
1284 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1285                           int mirror_num, unsigned long bio_flags)
1286 {
1287         struct btrfs_root *root = BTRFS_I(inode)->root;
1288         return btrfs_map_bio(root, rw, bio, mirror_num, 1);
1289 }
1290
1291 /*
1292  * extent_io.c submission hook. This does the right thing for csum calculation
1293  * on write, or reading the csums from the tree before a read
1294  */
1295 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1296                           int mirror_num, unsigned long bio_flags)
1297 {
1298         struct btrfs_root *root = BTRFS_I(inode)->root;
1299         int ret = 0;
1300         int skip_sum;
1301
1302         skip_sum = btrfs_test_flag(inode, NODATASUM);
1303
1304         ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1305         BUG_ON(ret);
1306
1307         if (!(rw & (1 << BIO_RW))) {
1308                 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1309                         return btrfs_submit_compressed_read(inode, bio,
1310                                                     mirror_num, bio_flags);
1311                 } else if (!skip_sum)
1312                         btrfs_lookup_bio_sums(root, inode, bio, NULL);
1313                 goto mapit;
1314         } else if (!skip_sum) {
1315                 /* csum items have already been cloned */
1316                 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1317                         goto mapit;
1318                 /* we're doing a write, do the async checksumming */
1319                 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1320                                    inode, rw, bio, mirror_num,
1321                                    bio_flags, __btrfs_submit_bio_start,
1322                                    __btrfs_submit_bio_done);
1323         }
1324
1325 mapit:
1326         return btrfs_map_bio(root, rw, bio, mirror_num, 0);
1327 }
1328
1329 /*
1330  * given a list of ordered sums record them in the inode.  This happens
1331  * at IO completion time based on sums calculated at bio submission time.
1332  */
1333 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1334                              struct inode *inode, u64 file_offset,
1335                              struct list_head *list)
1336 {
1337         struct btrfs_ordered_sum *sum;
1338
1339         btrfs_set_trans_block_group(trans, inode);
1340
1341         list_for_each_entry(sum, list, list) {
1342                 btrfs_csum_file_blocks(trans,
1343                        BTRFS_I(inode)->root->fs_info->csum_root, sum);
1344         }
1345         return 0;
1346 }
1347
1348 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
1349 {
1350         if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
1351                 WARN_ON(1);
1352         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1353                                    GFP_NOFS);
1354 }
1355
1356 /* see btrfs_writepage_start_hook for details on why this is required */
1357 struct btrfs_writepage_fixup {
1358         struct page *page;        /* page to fix up; the start hook takes a reference before queueing */
1359         struct btrfs_work work;   /* work item queued on fs_info->fixup_workers */
1360 };
1361
1362 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1363 {
1364         struct btrfs_writepage_fixup *fixup;
1365         struct btrfs_ordered_extent *ordered;
1366         struct page *page;
1367         struct inode *inode;
1368         u64 page_start;
1369         u64 page_end;
1370
1371         fixup = container_of(work, struct btrfs_writepage_fixup, work);
1372         page = fixup->page;
1373 again:
1374         lock_page(page);
1375         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1376                 ClearPageChecked(page);
1377                 goto out_page;
1378         }
1379
1380         inode = page->mapping->host;
1381         page_start = page_offset(page);
1382         page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1383
1384         lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1385
1386         /* already ordered? We're done */
1387         if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
1388                              EXTENT_ORDERED, 0)) {
1389                 goto out;
1390         }
1391
1392         ordered = btrfs_lookup_ordered_extent(inode, page_start);
1393         if (ordered) {
1394                 unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
1395                               page_end, GFP_NOFS);
1396                 unlock_page(page);
1397                 btrfs_start_ordered_extent(inode, ordered, 1);
1398                 goto again;
1399         }
1400
1401         btrfs_set_extent_delalloc(inode, page_start, page_end);
1402         ClearPageChecked(page);
1403 out:
1404         unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1405 out_page:
1406         unlock_page(page);
1407         page_cache_release(page);
1408 }
1409
1410 /*
1411  * There are a few paths in the higher layers of the kernel that directly
1412  * set the page dirty bit without asking the filesystem if it is a
1413  * good idea.  This causes problems because we want to make sure COW
1414  * properly happens and the data=ordered rules are followed.
1415  *
1416  * In our case any range that doesn't have the ORDERED bit set
1417  * hasn't been properly setup for IO.  We kick off an async process
1418  * to fix it up.  The async helper will wait for ordered extents, set
1419  * the delalloc bit and make it safe to write the page.
1420  */
1421 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1422 {
1423         struct inode *inode = page->mapping->host;
1424         struct btrfs_writepage_fixup *fixup;
1425         struct btrfs_root *root = BTRFS_I(inode)->root;
1426         int ret;
1427
1428         ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1429                              EXTENT_ORDERED, 0);
1430         if (ret)
1431                 return 0;
1432
1433         if (PageChecked(page))
1434                 return -EAGAIN;
1435
1436         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1437         if (!fixup)
1438                 return -EAGAIN;
1439
1440         SetPageChecked(page);
1441         page_cache_get(page);
1442         fixup->work.func = btrfs_writepage_fixup_worker;
1443         fixup->page = page;
1444         btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
1445         return -EAGAIN;
1446 }
1447
1448 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1449                                        struct inode *inode, u64 file_pos,
1450                                        u64 disk_bytenr, u64 disk_num_bytes,
1451                                        u64 num_bytes, u64 ram_bytes,
1452                                        u8 compression, u8 encryption,
1453                                        u16 other_encoding, int extent_type)
1454 {
1455         struct btrfs_root *root = BTRFS_I(inode)->root;
1456         struct btrfs_file_extent_item *fi;
1457         struct btrfs_path *path;
1458         struct extent_buffer *leaf;
1459         struct btrfs_key ins;
1460         u64 hint;
1461         int ret;
1462
1463         path = btrfs_alloc_path();
1464         BUG_ON(!path);
1465
1466         ret = btrfs_drop_extents(trans, root, inode, file_pos,
1467                                  file_pos + num_bytes, file_pos, &hint);
1468         BUG_ON(ret);
1469
1470         ins.objectid = inode->i_ino;
1471         ins.offset = file_pos;
1472         ins.type = BTRFS_EXTENT_DATA_KEY;
1473         ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
1474         BUG_ON(ret);
1475         leaf = path->nodes[0];
1476         fi = btrfs_item_ptr(leaf, path->slots[0],
1477                             struct btrfs_file_extent_item);
1478         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1479         btrfs_set_file_extent_type(leaf, fi, extent_type);
1480         btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1481         btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1482         btrfs_set_file_extent_offset(leaf, fi, 0);
1483         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1484         btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1485         btrfs_set_file_extent_compression(leaf, fi, compression);
1486         btrfs_set_file_extent_encryption(leaf, fi, encryption);
1487         btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1488         btrfs_mark_buffer_dirty(leaf);
1489
1490         inode_add_bytes(inode, num_bytes);
1491         btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
1492
1493         ins.objectid = disk_bytenr;
1494         ins.offset = disk_num_bytes;
1495         ins.type = BTRFS_EXTENT_ITEM_KEY;
1496         ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
1497                                           root->root_key.objectid,
1498                                           trans->transid, inode->i_ino, &ins);
1499         BUG_ON(ret);
1500
1501         btrfs_free_path(path);
1502         return 0;
1503 }
1504
1505 /* as ordered data IO finishes, this gets called so we can finish
1506  * an ordered extent if the range of bytes in the file it covers are
1507  * fully written.
1508  */
1509 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1510 {
1511         struct btrfs_root *root = BTRFS_I(inode)->root;
1512         struct btrfs_trans_handle *trans;
1513         struct btrfs_ordered_extent *ordered_extent;
1514         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1515         int compressed = 0;
1516         int ret;
1517
1518         ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
1519         if (!ret)
1520                 return 0;
1521
1522         trans = btrfs_join_transaction(root, 1);
1523
1524         ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1525         BUG_ON(!ordered_extent);
1526         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
1527                 goto nocow;
1528
1529         lock_extent(io_tree, ordered_extent->file_offset,
1530                     ordered_extent->file_offset + ordered_extent->len - 1,
1531                     GFP_NOFS);
1532
1533         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1534                 compressed = 1;
1535         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1536                 BUG_ON(compressed);
1537                 ret = btrfs_mark_extent_written(trans, root, inode,
1538                                                 ordered_extent->file_offset,
1539                                                 ordered_extent->file_offset +
1540                                                 ordered_extent->len);
1541                 BUG_ON(ret);
1542         } else {
1543                 ret = insert_reserved_file_extent(trans, inode,
1544                                                 ordered_extent->file_offset,
1545                                                 ordered_extent->start,
1546                                                 ordered_extent->disk_len,
1547                                                 ordered_extent->len,
1548                                                 ordered_extent->len,
1549                                                 compressed, 0, 0,
1550                                                 BTRFS_FILE_EXTENT_REG);
1551                 BUG_ON(ret);
1552         }
1553         unlock_extent(io_tree, ordered_extent->file_offset,
1554                     ordered_extent->file_offset + ordered_extent->len - 1,
1555                     GFP_NOFS);
1556 nocow:
1557         add_pending_csums(trans, inode, ordered_extent->file_offset,
1558                           &ordered_extent->list);
1559
1560         mutex_lock(&BTRFS_I(inode)->extent_mutex);
1561         btrfs_ordered_update_i_size(inode, ordered_extent);
1562         btrfs_update_inode(trans, root, inode);
1563         btrfs_remove_ordered_extent(inode, ordered_extent);
1564         mutex_unlock(&BTRFS_I(inode)->extent_mutex);
1565
1566         /* once for us */
1567         btrfs_put_ordered_extent(ordered_extent);
1568         /* once for the tree */
1569         btrfs_put_ordered_extent(ordered_extent);
1570
1571         btrfs_end_transaction(trans, root);
1572         return 0;
1573 }
1574
1575 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1576                                 struct extent_state *state, int uptodate)
1577 {
1578         return btrfs_finish_ordered_io(page->mapping->host, start, end);
1579 }
1580
1581 /*
1582  * When IO fails, either with EIO or csum verification fails, we
1583  * try other mirrors that might have a good copy of the data.  This
1584  * io_failure_record is used to record state as we go through all the
1585  * mirrors.  If another mirror has good data, the page is set up to date
1586  * and things continue.  If a good mirror can't be found, the original
1587  * bio end_io callback is called to indicate things have failed.
1588  */
1589 struct io_failure_record {
1590         struct page *page;
1591         u64 start;
1592         u64 len;
1593         u64 logical;
1594         unsigned long bio_flags;
1595         int last_mirror;
1596 };
1597
1598 static int btrfs_io_failed_hook(struct bio *failed_bio,
1599                          struct page *page, u64 start, u64 end,
1600                          struct extent_state *state)
1601 {
1602         struct io_failure_record *failrec = NULL;
1603         u64 private;
1604         struct extent_map *em;
1605         struct inode *inode = page->mapping->host;
1606         struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1607         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1608         struct bio *bio;
1609         int num_copies;
1610         int ret;
1611         int rw;
1612         u64 logical;
1613
1614         ret = get_state_private(failure_tree, start, &private);
1615         if (ret) {
1616                 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1617                 if (!failrec)
1618                         return -ENOMEM;
1619                 failrec->start = start;
1620                 failrec->len = end - start + 1;
1621                 failrec->last_mirror = 0;
1622                 failrec->bio_flags = 0;
1623
1624                 spin_lock(&em_tree->lock);
1625                 em = lookup_extent_mapping(em_tree, start, failrec->len);
1626                 if (em->start > start || em->start + em->len < start) {
1627                         free_extent_map(em);
1628                         em = NULL;
1629                 }
1630                 spin_unlock(&em_tree->lock);
1631
1632                 if (!em || IS_ERR(em)) {
1633                         kfree(failrec);
1634                         return -EIO;
1635                 }
1636                 logical = start - em->start;
1637                 logical = em->block_start + logical;
1638                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1639                         logical = em->block_start;
1640                         failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1641                 }
1642                 failrec->logical = logical;
1643                 free_extent_map(em);
1644                 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1645                                 EXTENT_DIRTY, GFP_NOFS);
1646                 set_state_private(failure_tree, start,
1647                                  (u64)(unsigned long)failrec);
1648         } else {
1649                 failrec = (struct io_failure_record *)(unsigned long)private;
1650         }
1651         num_copies = btrfs_num_copies(
1652                               &BTRFS_I(inode)->root->fs_info->mapping_tree,
1653                               failrec->logical, failrec->len);
1654         failrec->last_mirror++;
1655         if (!state) {
1656                 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1657                 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1658                                                     failrec->start,
1659                                                     EXTENT_LOCKED);
1660                 if (state && state->start != failrec->start)
1661                         state = NULL;
1662                 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1663         }
1664         if (!state || failrec->last_mirror > num_copies) {
1665                 set_state_private(failure_tree, failrec->start, 0);
1666                 clear_extent_bits(failure_tree, failrec->start,
1667                                   failrec->start + failrec->len - 1,
1668                                   EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1669                 kfree(failrec);
1670                 return -EIO;
1671         }
1672         bio = bio_alloc(GFP_NOFS, 1);
1673         bio->bi_private = state;
1674         bio->bi_end_io = failed_bio->bi_end_io;
1675         bio->bi_sector = failrec->logical >> 9;
1676         bio->bi_bdev = failed_bio->bi_bdev;
1677         bio->bi_size = 0;
1678
1679         bio_add_page(bio, page, failrec->len, start - page_offset(page));
1680         if (failed_bio->bi_rw & (1 << BIO_RW))
1681                 rw = WRITE;
1682         else
1683                 rw = READ;
1684
1685         BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1686                                                       failrec->last_mirror,
1687                                                       failrec->bio_flags);
1688         return 0;
1689 }
1690
1691 /*
1692  * each time an IO finishes, we do a fast check in the IO failure tree
1693  * to see if we need to process or clean up an io_failure_record
1694  */
1695 static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1696 {
1697         u64 private;
1698         u64 private_failure;
1699         struct io_failure_record *failure;
1700         int ret;
1701
1702         private = 0;
1703         if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1704                              (u64)-1, 1, EXTENT_DIRTY)) {
1705                 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1706                                         start, &private_failure);
1707                 if (ret == 0) {
1708                         failure = (struct io_failure_record *)(unsigned long)
1709                                    private_failure;
1710                         set_state_private(&BTRFS_I(inode)->io_failure_tree,
1711                                           failure->start, 0);
1712                         clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1713                                           failure->start,
1714                                           failure->start + failure->len - 1,
1715                                           EXTENT_DIRTY | EXTENT_LOCKED,
1716                                           GFP_NOFS);
1717                         kfree(failure);
1718                 }
1719         }
1720         return 0;
1721 }
1722
1723 /*
1724  * when reads are done, we need to check csums to verify the data is correct
1725  * if there's a match, we allow the bio to finish.  If not, we go through
1726  * the io_failure_record routines to find good copies
1727  */
1728 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1729                                struct extent_state *state)
1730 {
1731         size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
1732         struct inode *inode = page->mapping->host;
1733         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1734         char *kaddr;
1735         u64 private = ~(u32)0;
1736         int ret;
1737         struct btrfs_root *root = BTRFS_I(inode)->root;
1738         u32 csum = ~(u32)0;
1739
1740         if (PageChecked(page)) {
1741                 ClearPageChecked(page);
1742                 goto good;
1743         }
1744         if (btrfs_test_flag(inode, NODATASUM))
1745                 return 0;
1746
1747         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
1748             test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
1749                 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
1750                                   GFP_NOFS);
1751                 return 0;
1752         }
1753
1754         if (state && state->start == start) {
1755                 private = state->private;
1756                 ret = 0;
1757         } else {
1758                 ret = get_state_private(io_tree, start, &private);
1759         }
1760         kaddr = kmap_atomic(page, KM_USER0);
1761         if (ret)
1762                 goto zeroit;
1763
1764         csum = btrfs_csum_data(root, kaddr + offset, csum,  end - start + 1);
1765         btrfs_csum_final(csum, (char *)&csum);
1766         if (csum != private)
1767                 goto zeroit;
1768
1769         kunmap_atomic(kaddr, KM_USER0);
1770 good:
1771         /* if the io failure tree for this inode is non-empty,
1772          * check to see if we've recovered from a failed IO
1773          */
1774         btrfs_clean_io_failures(inode, start);
1775         return 0;
1776
1777 zeroit:
1778         printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
1779                "private %llu\n", page->mapping->host->i_ino,
1780                (unsigned long long)start, csum,
1781                (unsigned long long)private);
1782         memset(kaddr + offset, 1, end - start + 1);
1783         flush_dcache_page(page);
1784         kunmap_atomic(kaddr, KM_USER0);
1785         if (private == 0)
1786                 return 0;
1787         return -EIO;
1788 }
1789
1790 /*
1791  * This creates an orphan entry for the given inode in case something goes
1792  * wrong in the middle of an unlink/truncate.
1793  */
1794 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
1795 {
1796         struct btrfs_root *root = BTRFS_I(inode)->root;
1797         int ret = 0;
1798
1799         spin_lock(&root->list_lock);
1800
1801         /* already on the orphan list, we're good */
1802         if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
1803                 spin_unlock(&root->list_lock);
1804                 return 0;
1805         }
1806
1807         list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1808
1809         spin_unlock(&root->list_lock);
1810
1811         /*
1812          * insert an orphan item to track this unlinked/truncated file
1813          */
1814         ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
1815
1816         return ret;
1817 }
1818
1819 /*
1820  * We have done the truncate/delete so we can go ahead and remove the orphan
1821  * item for this particular inode.
1822  */
1823 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
1824 {
1825         struct btrfs_root *root = BTRFS_I(inode)->root;
1826         int ret = 0;
1827
1828         spin_lock(&root->list_lock);
1829
1830         if (list_empty(&BTRFS_I(inode)->i_orphan)) {
1831                 spin_unlock(&root->list_lock);
1832                 return 0;
1833         }
1834
1835         list_del_init(&BTRFS_I(inode)->i_orphan);
1836         if (!trans) {
1837                 spin_unlock(&root->list_lock);
1838                 return 0;
1839         }
1840
1841         spin_unlock(&root->list_lock);
1842
1843         ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
1844
1845         return ret;
1846 }
1847
1848 /*
1849  * this cleans up any orphans that may be left on the list from the last use
1850  * of this root.
1851  */
1852 void btrfs_orphan_cleanup(struct btrfs_root *root)
1853 {
1854         struct btrfs_path *path;
1855         struct extent_buffer *leaf;
1856         struct btrfs_item *item;
1857         struct btrfs_key key, found_key;
1858         struct btrfs_trans_handle *trans;
1859         struct inode *inode;
1860         int ret = 0, nr_unlink = 0, nr_truncate = 0;
1861
1862         path = btrfs_alloc_path();
1863         if (!path)
1864                 return;
1865         path->reada = -1;
1866
1867         key.objectid = BTRFS_ORPHAN_OBJECTID;
1868         btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1869         key.offset = (u64)-1;
1870
1871
1872         while (1) {
1873                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1874                 if (ret < 0) {
1875                         printk(KERN_ERR "Error searching slot for orphan: %d"
1876                                "\n", ret);
1877                         break;
1878                 }
1879
1880                 /*
1881                  * if ret == 0 means we found what we were searching for, which
1882                  * is weird, but possible, so only screw with path if we didnt
1883                  * find the key and see if we have stuff that matches
1884                  */
1885                 if (ret > 0) {
1886                         if (path->slots[0] == 0)
1887                                 break;
1888                         path->slots[0]--;
1889                 }
1890
1891                 /* pull out the item */
1892                 leaf = path->nodes[0];
1893                 item = btrfs_item_nr(leaf, path->slots[0]);
1894                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1895
1896                 /* make sure the item matches what we want */
1897                 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
1898                         break;
1899                 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
1900                         break;
1901
1902                 /* release the path since we're done with it */
1903                 btrfs_release_path(root, path);
1904
1905                 /*
1906                  * this is where we are basically btrfs_lookup, without the
1907                  * crossing root thing.  we store the inode number in the
1908                  * offset of the orphan item.
1909                  */
1910                 inode = btrfs_iget_locked(root->fs_info->sb,
1911                                           found_key.offset, root);
1912                 if (!inode)
1913                         break;
1914
1915                 if (inode->i_state & I_NEW) {
1916                         BTRFS_I(inode)->root = root;
1917
1918                         /* have to set the location manually */
1919                         BTRFS_I(inode)->location.objectid = inode->i_ino;
1920                         BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
1921                         BTRFS_I(inode)->location.offset = 0;
1922
1923                         btrfs_read_locked_inode(inode);
1924                         unlock_new_inode(inode);
1925                 }
1926
1927                 /*
1928                  * add this inode to the orphan list so btrfs_orphan_del does
1929                  * the proper thing when we hit it
1930                  */
1931                 spin_lock(&root->list_lock);
1932                 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1933                 spin_unlock(&root->list_lock);
1934
1935                 /*
1936                  * if this is a bad inode, means we actually succeeded in
1937                  * removing the inode, but not the orphan record, which means
1938                  * we need to manually delete the orphan since iput will just
1939                  * do a destroy_inode
1940                  */
1941                 if (is_bad_inode(inode)) {
1942                         trans = btrfs_start_transaction(root, 1);
1943                         btrfs_orphan_del(trans, inode);
1944                         btrfs_end_transaction(trans, root);
1945                         iput(inode);
1946                         continue;
1947                 }
1948
1949                 /* if we have links, this was a truncate, lets do that */
1950                 if (inode->i_nlink) {
1951                         nr_truncate++;
1952                         btrfs_truncate(inode);
1953                 } else {
1954                         nr_unlink++;
1955                 }
1956
1957                 /* this will do delete_inode and everything for us */
1958                 iput(inode);
1959         }
1960
1961         if (nr_unlink)
1962                 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
1963         if (nr_truncate)
1964                 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
1965
1966         btrfs_free_path(path);
1967 }
1968
1969 /*
1970  * read an inode from the btree into the in-memory inode
1971  */
1972 void btrfs_read_locked_inode(struct inode *inode)
1973 {
1974         struct btrfs_path *path;
1975         struct extent_buffer *leaf;
1976         struct btrfs_inode_item *inode_item;
1977         struct btrfs_timespec *tspec;
1978         struct btrfs_root *root = BTRFS_I(inode)->root;
1979         struct btrfs_key location;
1980         u64 alloc_group_block;
1981         u32 rdev;
1982         int ret;
1983
1984         path = btrfs_alloc_path();
1985         BUG_ON(!path);
1986         memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
1987
1988         ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
1989         if (ret)
1990                 goto make_bad;
1991
1992         leaf = path->nodes[0];
1993         inode_item = btrfs_item_ptr(leaf, path->slots[0],
1994                                     struct btrfs_inode_item);
1995
1996         inode->i_mode = btrfs_inode_mode(leaf, inode_item);
1997         inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
1998         inode->i_uid = btrfs_inode_uid(leaf, inode_item);
1999         inode->i_gid = btrfs_inode_gid(leaf, inode_item);
2000         btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
2001
2002         tspec = btrfs_inode_atime(inode_item);
2003         inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2004         inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2005
2006         tspec = btrfs_inode_mtime(inode_item);
2007         inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2008         inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2009
2010         tspec = btrfs_inode_ctime(inode_item);
2011         inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2012         inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2013
2014         inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
2015         BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
2016         BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
2017         inode->i_generation = BTRFS_I(inode)->generation;
2018         inode->i_rdev = 0;
2019         rdev = btrfs_inode_rdev(leaf, inode_item);
2020
2021         BTRFS_I(inode)->index_cnt = (u64)-1;
2022         BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2023
2024         alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2025
2026         BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2027                                                 alloc_group_block, 0);
2028         btrfs_free_path(path);
2029         inode_item = NULL;
2030
2031         switch (inode->i_mode & S_IFMT) {
2032         case S_IFREG:
2033                 inode->i_mapping->a_ops = &btrfs_aops;
2034                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2035                 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
2036                 inode->i_fop = &btrfs_file_operations;
2037                 inode->i_op = &btrfs_file_inode_operations;
2038                 break;
2039         case S_IFDIR:
2040                 inode->i_fop = &btrfs_dir_file_operations;
2041                 if (root == root->fs_info->tree_root)
2042                         inode->i_op = &btrfs_dir_ro_inode_operations;
2043                 else
2044                         inode->i_op = &btrfs_dir_inode_operations;
2045                 break;
2046         case S_IFLNK:
2047                 inode->i_op = &btrfs_symlink_inode_operations;
2048                 inode->i_mapping->a_ops = &btrfs_symlink_aops;
2049                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2050                 break;
2051         default:
2052                 inode->i_op = &btrfs_special_inode_operations;
2053                 init_special_inode(inode, inode->i_mode, rdev);
2054                 break;
2055         }
2056         return;
2057
2058 make_bad:
2059         btrfs_free_path(path);
2060         make_bad_inode(inode);
2061 }
2062
2063 /*
2064  * given a leaf and an inode, copy the inode fields into the leaf
2065  */
2066 static void fill_inode_item(struct btrfs_trans_handle *trans,
2067                             struct extent_buffer *leaf,
2068                             struct btrfs_inode_item *item,
2069                             struct inode *inode)
2070 {
2071         btrfs_set_inode_uid(leaf, item, inode->i_uid);
2072         btrfs_set_inode_gid(leaf, item, inode->i_gid);
2073         btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
2074         btrfs_set_inode_mode(leaf, item, inode->i_mode);
2075         btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2076
2077         btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2078                                inode->i_atime.tv_sec);
2079         btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2080                                 inode->i_atime.tv_nsec);
2081
2082         btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2083                                inode->i_mtime.tv_sec);
2084         btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2085                                 inode->i_mtime.tv_nsec);
2086
2087         btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2088                                inode->i_ctime.tv_sec);
2089         btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2090                                 inode->i_ctime.tv_nsec);
2091
2092         btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2093         btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
2094         btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
2095         btrfs_set_inode_transid(leaf, item, trans->transid);
2096         btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2097         btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2098         btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
2099 }
2100
2101 /*
2102  * copy everything in the in-memory inode into the btree.
2103  */
2104 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2105                                 struct btrfs_root *root, struct inode *inode)
2106 {
2107         struct btrfs_inode_item *inode_item;
2108         struct btrfs_path *path;
2109         struct extent_buffer *leaf;
2110         int ret;
2111
2112         path = btrfs_alloc_path();
2113         BUG_ON(!path);
2114         ret = btrfs_lookup_inode(trans, root, path,
2115                                  &BTRFS_I(inode)->location, 1);
2116         if (ret) {
2117                 if (ret > 0)
2118                         ret = -ENOENT;
2119                 goto failed;
2120         }
2121
2122         btrfs_unlock_up_safe(path, 1);
2123         leaf = path->nodes[0];
2124         inode_item = btrfs_item_ptr(leaf, path->slots[0],
2125                                   struct btrfs_inode_item);
2126
2127         fill_inode_item(trans, leaf, inode_item, inode);
2128         btrfs_mark_buffer_dirty(leaf);
2129         btrfs_set_inode_last_trans(trans, inode);
2130         ret = 0;
2131 failed:
2132         btrfs_free_path(path);
2133         return ret;
2134 }
2135
2136
2137 /*
2138  * unlink helper that gets used here in inode.c and in the tree logging
2139  * recovery code.  It remove a link in a directory with a given name, and
2140  * also drops the back refs in the inode to the directory
2141  */
2142 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2143                        struct btrfs_root *root,
2144                        struct inode *dir, struct inode *inode,
2145                        const char *name, int name_len)
2146 {
2147         struct btrfs_path *path;
2148         int ret = 0;
2149         struct extent_buffer *leaf;
2150         struct btrfs_dir_item *di;
2151         struct btrfs_key key;
2152         u64 index;
2153
2154         path = btrfs_alloc_path();
2155         if (!path) {
2156                 ret = -ENOMEM;
2157                 goto err;
2158         }
2159
2160         di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2161                                     name, name_len, -1);
2162         if (IS_ERR(di)) {
2163                 ret = PTR_ERR(di);
2164                 goto err;
2165         }
2166         if (!di) {
2167                 ret = -ENOENT;
2168                 goto err;
2169         }
2170         leaf = path->nodes[0];
2171         btrfs_dir_item_key_to_cpu(leaf, di, &key);
2172         ret = btrfs_delete_one_dir_name(trans, root, path, di);
2173         if (ret)
2174                 goto err;
2175         btrfs_release_path(root, path);
2176
2177         ret = btrfs_del_inode_ref(trans, root, name, name_len,
2178                                   inode->i_ino,
2179                                   dir->i_ino, &index);
2180         if (ret) {
2181                 printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
2182                        "inode %lu parent %lu\n", name_len, name,
2183                        inode->i_ino, dir->i_ino);
2184                 goto err;
2185         }
2186
2187         di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
2188                                          index, name, name_len, -1);
2189         if (IS_ERR(di)) {
2190                 ret = PTR_ERR(di);
2191                 goto err;
2192         }
2193         if (!di) {
2194                 ret = -ENOENT;
2195                 goto err;
2196         }
2197         ret = btrfs_delete_one_dir_name(trans, root, path, di);
2198         btrfs_release_path(root, path);
2199
2200         ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2201                                          inode, dir->i_ino);
2202         BUG_ON(ret != 0 && ret != -ENOENT);
2203         if (ret != -ENOENT)
2204                 BTRFS_I(dir)->log_dirty_trans = trans->transid;
2205
2206         ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2207                                            dir, index);
2208         BUG_ON(ret);
2209 err:
2210         btrfs_free_path(path);
2211         if (ret)
2212                 goto out;
2213
2214         btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2215         inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2216         btrfs_update_inode(trans, root, dir);
2217         btrfs_drop_nlink(inode);
2218         ret = btrfs_update_inode(trans, root, inode);
2219         dir->i_sb->s_dirt = 1;
2220 out:
2221         return ret;
2222 }
2223
2224 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2225 {
2226         struct btrfs_root *root;
2227         struct btrfs_trans_handle *trans;
2228         struct inode *inode = dentry->d_inode;
2229         int ret;
2230         unsigned long nr = 0;
2231
2232         root = BTRFS_I(dir)->root;
2233
2234         ret = btrfs_check_free_space(root, 1, 1);
2235         if (ret)
2236                 goto fail;
2237
2238         trans = btrfs_start_transaction(root, 1);
2239
2240         btrfs_set_trans_block_group(trans, dir);
2241         ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2242                                  dentry->d_name.name, dentry->d_name.len);
2243
2244         if (inode->i_nlink == 0)
2245                 ret = btrfs_orphan_add(trans, inode);
2246
2247         nr = trans->blocks_used;
2248
2249         btrfs_end_transaction_throttle(trans, root);
2250 fail:
2251         btrfs_btree_balance_dirty(root, nr);
2252         return ret;
2253 }
2254
2255 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2256 {
2257         struct inode *inode = dentry->d_inode;
2258         int err = 0;
2259         int ret;
2260         struct btrfs_root *root = BTRFS_I(dir)->root;
2261         struct btrfs_trans_handle *trans;
2262         unsigned long nr = 0;
2263
2264         /*
2265          * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
2266          * the root of a subvolume or snapshot
2267          */
2268         if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
2269             inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
2270                 return -ENOTEMPTY;
2271         }
2272
2273         ret = btrfs_check_free_space(root, 1, 1);
2274         if (ret)
2275                 goto fail;
2276
2277         trans = btrfs_start_transaction(root, 1);
2278         btrfs_set_trans_block_group(trans, dir);
2279
2280         err = btrfs_orphan_add(trans, inode);
2281         if (err)
2282                 goto fail_trans;
2283
2284         /* now the directory is empty */
2285         err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2286                                  dentry->d_name.name, dentry->d_name.len);
2287         if (!err)
2288                 btrfs_i_size_write(inode, 0);
2289
2290 fail_trans:
2291         nr = trans->blocks_used;
2292         ret = btrfs_end_transaction_throttle(trans, root);
2293 fail:
2294         btrfs_btree_balance_dirty(root, nr);
2295
2296         if (ret && !err)
2297                 err = ret;
2298         return err;
2299 }
2300
2301 #if 0
2302 /*
2303  * when truncating bytes in a file, it is possible to avoid reading
2304  * the leaves that contain only checksum items.  This can be the
2305  * majority of the IO required to delete a large file, but it must
2306  * be done carefully.
2307  *
2308  * The keys in the level just above the leaves are checked to make sure
2309  * the lowest key in a given leaf is a csum key, and starts at an offset
2310  * after the new  size.
2311  *
2312  * Then the key for the next leaf is checked to make sure it also has
2313  * a checksum item for the same file.  If it does, we know our target leaf
2314  * contains only checksum items, and it can be safely freed without reading
2315  * it.
2316  *
2317  * This is just an optimization targeted at large files.  It may do
2318  * nothing.  It will return 0 unless things went badly.
      *
      * The only non-zero return value is a negative result from
      * btrfs_search_slot().  On every exit the path is released and
      * path->lowest_level / path->keep_locks are reset to 0.
      *
      * The whole function is currently compiled out by the surrounding
      * #if 0 / #endif.
2319  */
2320 static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
2321                                      struct btrfs_root *root,
2322                                      struct btrfs_path *path,
2323                                      struct inode *inode, u64 new_size)
2324 {
2325         struct btrfs_key key;
2326         int ret;
2327         int nritems;
2328         struct btrfs_key found_key;
2329         struct btrfs_key other_key;
2330         struct btrfs_leaf_ref *ref;
2331         u64 leaf_gen;
2332         u64 leaf_start;
2333
             /*
              * everything below works on path->nodes[1], the level just above
              * the leaves.  NOTE(review): lowest_level = 1 presumably makes the
              * search stop at that level -- confirm against btrfs_search_slot
              */
2334         path->lowest_level = 1;
2335         key.objectid = inode->i_ino;
2336         key.type = BTRFS_CSUM_ITEM_KEY;
2337         key.offset = new_size;
2338 again:
2339         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2340         if (ret < 0)
2341                 goto out;
2342
2343         if (path->nodes[1] == NULL) {
2344                 ret = 0;
2345                 goto out;
2346         }
2347         ret = 0;
2348         btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
2349         nritems = btrfs_header_nritems(path->nodes[1]);
2350
2351         if (!nritems)
2352                 goto out;
2353
2354         if (path->slots[1] >= nritems)
2355                 goto next_node;
2356
2357         /* did we find a key greater than anything we want to delete? */
2358         if (found_key.objectid > inode->i_ino ||
2359            (found_key.objectid == inode->i_ino && found_key.type > key.type))
2360                 goto out;
2361
2362         /* we check the next key in the node to make sure the leaf contains
2363          * only checksum items.  This comparison doesn't work if our
2364          * leaf is the last one in the node
2365          */
2366         if (path->slots[1] + 1 >= nritems) {
2367 next_node:
2368                 /* search forward from the last key in the node, this
2369                  * will bring us into the next node in the tree
2370                  */
2371                 btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
2372
2373                 /* unlikely, but we inc below, so check to be safe */
2374                 if (found_key.offset == (u64)-1)
2375                         goto out;
2376
2377                 /* search_forward needs a path with locks held, do the
2378                  * search again for the original key.  It is possible
2379                  * this will race with a balance and return a path that
2380                  * we could modify, but this drop is just an optimization
2381                  * and is allowed to miss some leaves.
2382                  */
2383                 btrfs_release_path(root, path);
2384                 found_key.offset++;
2385
2386                 /* setup a max key for search_forward */
2387                 other_key.offset = (u64)-1;
2388                 other_key.type = key.type;
2389                 other_key.objectid = key.objectid;
2390
2391                 path->keep_locks = 1;
2392                 ret = btrfs_search_forward(root, &found_key, &other_key,
2393                                            path, 0, 0);
2394                 path->keep_locks = 0;
                     /* nothing further for this inode's csum items: done, not an error */
2395                 if (ret || found_key.objectid != key.objectid ||
2396                     found_key.type != key.type) {
2397                         ret = 0;
2398                         goto out;
2399                 }
2400
2401                 key.offset = found_key.offset;
2402                 btrfs_release_path(root, path);
2403                 cond_resched();
2404                 goto again;
2405         }
2406
2407         /* we know there's one more slot after us in the tree,
2408          * read that key so we can verify it is also a checksum item
2409          */
2410         btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
2411
             /* this leaf starts before our inode's csum range, skip over it */
2412         if (found_key.objectid < inode->i_ino)
2413                 goto next_key;
2414
2415         if (found_key.type != key.type || found_key.offset < new_size)
2416                 goto next_key;
2417
2418         /*
2419          * if the key for the next leaf isn't a csum key from this objectid,
2420          * we can't be sure there aren't good items inside this leaf.
2421          * Bail out
2422          */
2423         if (other_key.objectid != inode->i_ino || other_key.type != key.type)
2424                 goto out;
2425
2426         leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
2427         leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
2428         /*
2429          * it is safe to delete this leaf, it contains only
2430          * csum items from this inode at an offset >= new_size
2431          */
2432         ret = btrfs_del_leaf(trans, root, path, leaf_start);
2433         BUG_ON(ret);
2434
             /*
              * for ref_cows roots, a leaf older than this transaction gets an
              * empty (nritems == 0) leaf ref recorded; failures here only WARN
              */
2435         if (root->ref_cows && leaf_gen < trans->transid) {
2436                 ref = btrfs_alloc_leaf_ref(root, 0);
2437                 if (ref) {
2438                         ref->root_gen = root->root_key.offset;
2439                         ref->bytenr = leaf_start;
2440                         ref->owner = 0;
2441                         ref->generation = leaf_gen;
2442                         ref->nritems = 0;
2443
2444                         btrfs_sort_leaf_ref(ref);
2445
2446                         ret = btrfs_add_leaf_ref(root, ref, 0);
2447                         WARN_ON(ret);
2448                         btrfs_free_leaf_ref(root, ref);
2449                 } else {
2450                         WARN_ON(1);
2451                 }
2452         }
2453 next_key:
2454         btrfs_release_path(root, path);
2455
             /* continue from the next leaf's key only if it moves us forward */
2456         if (other_key.objectid == inode->i_ino &&
2457             other_key.type == key.type && other_key.offset > key.offset) {
2458                 key.offset = other_key.offset;
2459                 cond_resched();
2460                 goto again;
2461         }
2462         ret = 0;
2463 out:
2464         /* fixup any changes we've made to the path */
2465         path->lowest_level = 0;
2466         path->keep_locks = 0;
2467         btrfs_release_path(root, path);
2468         return ret;
2469 }
2470
2471 #endif
2472
2473 /*
2474  * this can truncate away extent items, csum items and directory items.
2475  * It starts at a high offset and removes keys until it can't find
2476  * any higher than new_size
2477  *
2478  * csum items that cross the new i_size are truncated to the new size
2479  * as well.
2480  *
2481  * min_type is the minimum key type to truncate down to.  If set to 0, this
2482  * will kill all the items on this inode, including the INODE_ITEM_KEY.
2483  */
2484 noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2485                                         struct btrfs_root *root,
2486                                         struct inode *inode,
2487                                         u64 new_size, u32 min_type)
2488 {
2489         int ret;
2490         struct btrfs_path *path;
2491         struct btrfs_key key;
2492         struct btrfs_key found_key;
2493         u32 found_type;
2494         struct extent_buffer *leaf;
2495         struct btrfs_file_extent_item *fi;
2496         u64 extent_start = 0;
2497         u64 extent_num_bytes = 0;
2498         u64 item_end = 0;
2499         u64 root_gen = 0;
2500         u64 root_owner = 0;
2501         int found_extent;
2502         int del_item;
2503         int pending_del_nr = 0;
2504         int pending_del_slot = 0;
2505         int extent_type = -1;
2506         int encoding;
2507         u64 mask = root->sectorsize - 1;
2508
2509         if (root->ref_cows)
2510                 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
2511         path = btrfs_alloc_path();
2512         path->reada = -1;
2513         BUG_ON(!path);
2514
2515         /* FIXME, add redo link to tree so we don't leak on crash */
2516         key.objectid = inode->i_ino;
2517         key.offset = (u64)-1;
2518         key.type = (u8)-1;
2519
2520         btrfs_init_path(path);
2521
2522 search_again:
2523         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2524         if (ret < 0)
2525                 goto error;
2526
2527         if (ret > 0) {
2528                 /* there are no items in the tree for us to truncate, we're
2529                  * done
2530                  */
2531                 if (path->slots[0] == 0) {
2532                         ret = 0;
2533                         goto error;
2534                 }
2535                 path->slots[0]--;
2536         }
2537
2538         while (1) {
2539                 fi = NULL;
2540                 leaf = path->nodes[0];
2541                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2542                 found_type = btrfs_key_type(&found_key);
2543                 encoding = 0;
2544
2545                 if (found_key.objectid != inode->i_ino)
2546                         break;
2547
2548                 if (found_type < min_type)
2549                         break;
2550
2551                 item_end = found_key.offset;
2552                 if (found_type == BTRFS_EXTENT_DATA_KEY) {
2553                         fi = btrfs_item_ptr(leaf, path->slots[0],
2554                                             struct btrfs_file_extent_item);
2555                         extent_type = btrfs_file_extent_type(leaf, fi);
2556                         encoding = btrfs_file_extent_compression(leaf, fi);
2557                         encoding |= btrfs_file_extent_encryption(leaf, fi);
2558                         encoding |= btrfs_file_extent_other_encoding(leaf, fi);
2559
2560                         if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2561                                 item_end +=
2562                                     btrfs_file_extent_num_bytes(leaf, fi);
2563                         } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2564                                 item_end += btrfs_file_extent_inline_len(leaf,
2565                                                                          fi);
2566                         }
2567                         item_end--;
2568                 }
2569                 if (item_end < new_size) {
2570                         if (found_type == BTRFS_DIR_ITEM_KEY)
2571                                 found_type = BTRFS_INODE_ITEM_KEY;
2572                         else if (found_type == BTRFS_EXTENT_ITEM_KEY)
2573                                 found_type = BTRFS_EXTENT_DATA_KEY;
2574                         else if (found_type == BTRFS_EXTENT_DATA_KEY)
2575                                 found_type = BTRFS_XATTR_ITEM_KEY;
2576                         else if (found_type == BTRFS_XATTR_ITEM_KEY)
2577                                 found_type = BTRFS_INODE_REF_KEY;
2578                         else if (found_type)
2579                                 found_type--;
2580                         else
2581                                 break;
2582                         btrfs_set_key_type(&key, found_type);
2583                         goto next;
2584                 }
2585                 if (found_key.offset >= new_size)
2586                         del_item = 1;
2587                 else
2588                         del_item = 0;
2589                 found_extent = 0;
2590
2591                 /* FIXME, shrink the extent if the ref count is only 1 */
2592                 if (found_type != BTRFS_EXTENT_DATA_KEY)
2593                         goto delete;
2594
2595                 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2596                         u64 num_dec;
2597                         extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
2598                         if (!del_item && !encoding) {
2599                                 u64 orig_num_bytes =
2600                                         btrfs_file_extent_num_bytes(leaf, fi);
2601                                 extent_num_bytes = new_size -
2602                                         found_key.offset + root->sectorsize - 1;
2603                                 extent_num_bytes = extent_num_bytes &
2604                                         ~((u64)root->sectorsize - 1);
2605                                 btrfs_set_file_extent_num_bytes(leaf, fi,
2606                                                          extent_num_bytes);
2607                                 num_dec = (orig_num_bytes -
2608                                            extent_num_bytes);
2609                                 if (root->ref_cows && extent_start != 0)
2610                                         inode_sub_bytes(inode, num_dec);
2611                                 btrfs_mark_buffer_dirty(leaf);
2612                         } else {
2613                                 extent_num_bytes =
2614                                         btrfs_file_extent_disk_num_bytes(leaf,
2615                                                                          fi);
2616                                 /* FIXME blocksize != 4096 */
2617                                 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
2618                                 if (extent_start != 0) {
2619                                         found_extent = 1;
2620                                         if (root->ref_cows)
2621                                                 inode_sub_bytes(inode, num_dec);
2622                                 }
2623                                 root_gen = btrfs_header_generation(leaf);
2624                                 root_owner = btrfs_header_owner(leaf);
2625                         }
2626                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2627                         /*
2628                          * we can't truncate inline items that have had
2629                          * special encodings
2630                          */
2631                         if (!del_item &&
2632                             btrfs_file_extent_compression(leaf, fi) == 0 &&
2633                             btrfs_file_extent_encryption(leaf, fi) == 0 &&
2634                             btrfs_file_extent_other_encoding(leaf, fi) == 0) {
2635                                 u32 size = new_size - found_key.offset;
2636
2637                                 if (root->ref_cows) {
2638                                         inode_sub_bytes(inode, item_end + 1 -
2639                                                         new_size);
2640                                 }
2641                                 size =
2642                                     btrfs_file_extent_calc_inline_size(size);
2643                                 ret = btrfs_truncate_item(trans, root, path,
2644                                                           size, 1);
2645                                 BUG_ON(ret);
2646                         } else if (root->ref_cows) {
2647                                 inode_sub_bytes(inode, item_end + 1 -
2648                                                 found_key.offset);
2649                         }
2650                 }
2651 delete:
2652                 if (del_item) {
2653                         if (!pending_del_nr) {
2654                                 /* no pending yet, add ourselves */
2655                                 pending_del_slot = path->slots[0];
2656                                 pending_del_nr = 1;
2657                         } else if (pending_del_nr &&
2658                                    path->slots[0] + 1 == pending_del_slot) {
2659                                 /* hop on the pending chunk */
2660                                 pending_del_nr++;
2661                                 pending_del_slot = path->slots[0];
2662                         } else {
2663                                 BUG();
2664                         }
2665                 } else {
2666                         break;
2667                 }
2668                 if (found_extent) {
2669                         ret = btrfs_free_extent(trans, root, extent_start,
2670                                                 extent_num_bytes,
2671                                                 leaf->start, root_owner,
2672                                                 root_gen, inode->i_ino, 0);
2673                         BUG_ON(ret);
2674                 }
2675 next:
2676                 if (path->slots[0] == 0) {
2677                         if (pending_del_nr)
2678                                 goto del_pending;
2679                         btrfs_release_path(root, path);
2680                         goto search_again;
2681                 }
2682
2683                 path->slots[0]--;
2684                 if (pending_del_nr &&
2685                     path->slots[0] + 1 != pending_del_slot) {
2686                         struct btrfs_key debug;
2687 del_pending:
2688                         btrfs_item_key_to_cpu(path->nodes[0], &debug,
2689                                               pending_del_slot);
2690                         ret = btrfs_del_items(trans, root, path,
2691                                               pending_del_slot,
2692                                               pending_del_nr);
2693                         BUG_ON(ret);
2694                         pending_del_nr = 0;
2695                         btrfs_release_path(root, path);
2696                         goto search_again;
2697                 }
2698         }
2699         ret = 0;
2700 error:
2701         if (pending_del_nr) {
2702                 ret = btrfs_del_items(trans, root, path, pending_del_slot,
2703                                       pending_del_nr);
2704         }
2705         btrfs_free_path(path);
2706         inode->i_sb->s_dirt = 1;
2707         return ret;
2708 }
2709
2710 /*
2711  * taken from block_truncate_page, but does cow as it zeros out
2712  * any bytes left in the last page in the file.
2713  */
2714 static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
2715 {
2716         struct inode *inode = mapping->host;
2717         struct btrfs_root *root = BTRFS_I(inode)->root;
2718         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2719         struct btrfs_ordered_extent *ordered;
2720         char *kaddr;
2721         u32 blocksize = root->sectorsize;
2722         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2723         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2724         struct page *page;
2725         int ret = 0;
2726         u64 page_start;
2727         u64 page_end;
2728
2729         if ((offset & (blocksize - 1)) == 0)
2730                 goto out;
2731
2732         ret = -ENOMEM;
2733 again:
2734         page = grab_cache_page(mapping, index);
2735         if (!page)
2736                 goto out;
2737
2738         page_start = page_offset(page);
2739         page_end = page_start + PAGE_CACHE_SIZE - 1;
2740
2741         if (!PageUptodate(page)) {
2742                 ret = btrfs_readpage(NULL, page);
2743                 lock_page(page);
2744                 if (page->mapping != mapping) {
2745                         unlock_page(page);
2746                         page_cache_release(page);
2747                         goto again;
2748                 }
2749                 if (!PageUptodate(page)) {
2750                         ret = -EIO;
2751                         goto out_unlock;
2752                 }
2753         }
2754         wait_on_page_writeback(page);
2755
2756         lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2757         set_page_extent_mapped(page);
2758
2759         ordered = btrfs_lookup_ordered_extent(inode, page_start);
2760         if (ordered) {
2761                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2762                 unlock_page(page);
2763                 page_cache_release(page);
2764                 btrfs_start_ordered_extent(inode, ordered, 1);
2765                 btrfs_put_ordered_extent(ordered);
2766                 goto again;
2767         }
2768
2769         btrfs_set_extent_delalloc(inode, page_start, page_end);
2770         ret = 0;
2771         if (offset != PAGE_CACHE_SIZE) {
2772                 kaddr = kmap(page);
2773                 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2774                 flush_dcache_page(page);
2775                 kunmap(page);
2776         }
2777         ClearPageChecked(page);
2778         set_page_dirty(page);
2779         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2780
2781 out_unlock:
2782         unlock_page(page);
2783         page_cache_release(page);
2784 out:
2785         return ret;
2786 }
2787
2788 int btrfs_cont_expand(struct inode *inode, loff_t size)
2789 {
2790         struct btrfs_trans_handle *trans;
2791         struct btrfs_root *root = BTRFS_I(inode)->root;
2792         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2793         struct extent_map *em;
2794         u64 mask = root->sectorsize - 1;
2795         u64 hole_start = (inode->i_size + mask) & ~mask;
2796         u64 block_end = (size + mask) & ~mask;
2797         u64 last_byte;
2798         u64 cur_offset;
2799         u64 hole_size;
2800         int err;
2801
2802         if (size <= hole_start)
2803                 return 0;
2804
2805         err = btrfs_check_free_space(root, 1, 0);
2806         if (err)
2807                 return err;
2808
2809         btrfs_truncate_page(inode->i_mapping, inode->i_size);
2810
2811         while (1) {
2812                 struct btrfs_ordered_extent *ordered;
2813                 btrfs_wait_ordered_range(inode, hole_start,
2814                                          block_end - hole_start);
2815                 lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2816                 ordered = btrfs_lookup_ordered_extent(inode, hole_start);
2817                 if (!ordered)
2818                         break;
2819                 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2820                 btrfs_put_ordered_extent(ordered);
2821         }
2822
2823         trans = btrfs_start_transaction(root, 1);
2824         btrfs_set_trans_block_group(trans, inode);
2825
2826         cur_offset = hole_start;
2827         while (1) {
2828                 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2829                                 block_end - cur_offset, 0);
2830                 BUG_ON(IS_ERR(em) || !em);
2831                 last_byte = min(extent_map_end(em), block_end);
2832                 last_byte = (last_byte + mask) & ~mask;
2833                 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
2834                         u64 hint_byte = 0;
2835                         hole_size = last_byte - cur_offset;
2836                         err = btrfs_drop_extents(trans, root, inode,
2837                                                  cur_offset,
2838                                                  cur_offset + hole_size,
2839                                                  cur_offset, &hint_byte);
2840                         if (err)
2841                                 break;
2842                         err = btrfs_insert_file_extent(trans, root,
2843                                         inode->i_ino, cur_offset, 0,
2844                                         0, hole_size, 0, hole_size,
2845                                         0, 0, 0);
2846                         btrfs_drop_extent_cache(inode, hole_start,
2847                                         last_byte - 1, 0);
2848                 }
2849                 free_extent_map(em);
2850                 cur_offset = last_byte;
2851                 if (err || cur_offset >= block_end)
2852                         break;
2853         }
2854
2855         btrfs_end_transaction(trans, root);
2856         unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2857         return err;
2858 }
2859
2860 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
2861 {
2862         struct inode *inode = dentry->d_inode;
2863         int err;
2864
2865         err = inode_change_ok(inode, attr);
2866         if (err)
2867                 return err;
2868
2869         if (S_ISREG(inode->i_mode) &&
2870             attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
2871                 err = btrfs_cont_expand(inode, attr->ia_size);
2872                 if (err)
2873                         return err;
2874         }
2875
2876         err = inode_setattr(inode, attr);
2877
2878         if (!err && ((attr->ia_valid & ATTR_MODE)))
2879                 err = btrfs_acl_chmod(inode);
2880         return err;
2881 }
2882
2883 void btrfs_delete_inode(struct inode *inode)
2884 {
2885         struct btrfs_trans_handle *trans;
2886         struct btrfs_root *root = BTRFS_I(inode)->root;
2887         unsigned long nr;
2888         int ret;
2889
2890         truncate_inode_pages(&inode->i_data, 0);
2891         if (is_bad_inode(inode)) {
2892                 btrfs_orphan_del(NULL, inode);
2893                 goto no_delete;
2894         }
2895         btrfs_wait_ordered_range(inode, 0, (u64)-1);
2896
2897         btrfs_i_size_write(inode, 0);
2898         trans = btrfs_join_transaction(root, 1);
2899
2900         btrfs_set_trans_block_group(trans, inode);
2901         ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
2902         if (ret) {
2903                 btrfs_orphan_del(NULL, inode);
2904                 goto no_delete_lock;
2905         }
2906
2907         btrfs_orphan_del(trans, inode);
2908
2909         nr = trans->blocks_used;
2910         clear_inode(inode);
2911
2912         btrfs_end_transaction(trans, root);
2913         btrfs_btree_balance_dirty(root, nr);
2914         return;
2915
2916 no_delete_lock:
2917         nr = trans->blocks_used;
2918         btrfs_end_transaction(trans, root);
2919         btrfs_btree_balance_dirty(root, nr);
2920 no_delete:
2921         clear_inode(inode);
2922 }
2923
2924 /*
2925  * this returns the key found in the dir entry in the location pointer.
2926  * If no dir entries were found, location->objectid is 0.
2927  */
2928 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
2929                                struct btrfs_key *location)
2930 {
2931         const char *name = dentry->d_name.name;
2932         int namelen = dentry->d_name.len;
2933         struct btrfs_dir_item *di;
2934         struct btrfs_path *path;
2935         struct btrfs_root *root = BTRFS_I(dir)->root;
2936         int ret = 0;
2937
2938         path = btrfs_alloc_path();
2939         BUG_ON(!path);
2940
2941         di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
2942                                     namelen, 0);
2943         if (IS_ERR(di))
2944                 ret = PTR_ERR(di);
2945
2946         if (!di || IS_ERR(di))
2947                 goto out_err;
2948
2949         btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
2950 out:
2951         btrfs_free_path(path);
2952         return ret;
2953 out_err:
2954         location->objectid = 0;
2955         goto out;
2956 }
2957
2958 /*
2959  * when we hit a tree root in a directory, the btrfs part of the inode
2960  * needs to be changed to reflect the root directory of the tree root.  This
2961  * is kind of like crossing a mount point.
2962  */
2963 static int fixup_tree_root_location(struct btrfs_root *root,
2964                              struct btrfs_key *location,
2965                              struct btrfs_root **sub_root,
2966                              struct dentry *dentry)
2967 {
2968         struct btrfs_root_item *ri;
2969
2970         if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
2971                 return 0;
2972         if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
2973                 return 0;
2974
2975         *sub_root = btrfs_read_fs_root(root->fs_info, location,
2976                                         dentry->d_name.name,
2977                                         dentry->d_name.len);
2978         if (IS_ERR(*sub_root))
2979                 return PTR_ERR(*sub_root);
2980
2981         ri = &(*sub_root)->root_item;
2982         location->objectid = btrfs_root_dirid(ri);
2983         btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
2984         location->offset = 0;
2985
2986         return 0;
2987 }
2988
2989 static noinline void init_btrfs_i(struct inode *inode)
2990 {
2991         struct btrfs_inode *bi = BTRFS_I(inode);
2992
2993         bi->i_acl = NULL;
2994         bi->i_default_acl = NULL;
2995
2996         bi->generation = 0;
2997         bi->sequence = 0;
2998         bi->last_trans = 0;
2999         bi->logged_trans = 0;
3000         bi->delalloc_bytes = 0;
3001         bi->disk_i_size = 0;
3002         bi->flags = 0;
3003         bi->index_cnt = (u64)-1;
3004         bi->log_dirty_trans = 0;
3005         extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3006         extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3007                              inode->i_mapping, GFP_NOFS);
3008         extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3009                              inode->i_mapping, GFP_NOFS);
3010         INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3011         btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3012         mutex_init(&BTRFS_I(inode)->extent_mutex);
3013         mutex_init(&BTRFS_I(inode)->log_mutex);
3014 }
3015
3016 static int btrfs_init_locked_inode(struct inode *inode, void *p)
3017 {
3018         struct btrfs_iget_args *args = p;
3019         inode->i_ino = args->ino;
3020         init_btrfs_i(inode);
3021         BTRFS_I(inode)->root = args->root;
3022         return 0;
3023 }
3024
3025 static int btrfs_find_actor(struct inode *inode, void *opaque)
3026 {
3027         struct btrfs_iget_args *args = opaque;
3028         return args->ino == inode->i_ino &&
3029                 args->root == BTRFS_I(inode)->root;
3030 }
3031
3032 struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
3033                             struct btrfs_root *root, int wait)
3034 {
3035         struct inode *inode;
3036         struct btrfs_iget_args args;
3037         args.ino = objectid;
3038         args.root = root;
3039
3040         if (wait) {
3041                 inode = ilookup5(s, objectid, btrfs_find_actor,
3042                                  (void *)&args);
3043         } else {
3044                 inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
3045                                         (void *)&args);
3046         }
3047         return inode;
3048 }
3049
3050 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
3051                                 struct btrfs_root *root)
3052 {
3053         struct inode *inode;
3054         struct btrfs_iget_args args;
3055         args.ino = objectid;
3056         args.root = root;
3057
3058         inode = iget5_locked(s, objectid, btrfs_find_actor,
3059                              btrfs_init_locked_inode,
3060                              (void *)&args);
3061         return inode;
3062 }
3063
3064 /* Get an inode object given its location and corresponding root.
3065  * Returns in *is_new if the inode was read from disk
3066  */
3067 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3068                          struct btrfs_root *root, int *is_new)
3069 {
3070         struct inode *inode;
3071
3072         inode = btrfs_iget_locked(s, location->objectid, root);
3073         if (!inode)
3074                 return ERR_PTR(-EACCES);
3075
3076         if (inode->i_state & I_NEW) {
3077                 BTRFS_I(inode)->root = root;
3078                 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3079                 btrfs_read_locked_inode(inode);
3080                 unlock_new_inode(inode);
3081                 if (is_new)
3082                         *is_new = 1;
3083         } else {
3084                 if (is_new)
3085                         *is_new = 0;
3086         }
3087
3088         return inode;
3089 }
3090
3091 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3092 {
3093         struct inode *inode;
3094         struct btrfs_inode *bi = BTRFS_I(dir);
3095         struct btrfs_root *root = bi->root;
3096         struct btrfs_root *sub_root = root;
3097         struct btrfs_key location;
3098         int ret, new;
3099
3100         if (dentry->d_name.len > BTRFS_NAME_LEN)
3101                 return ERR_PTR(-ENAMETOOLONG);
3102
3103         ret = btrfs_inode_by_name(dir, dentry, &location);
3104
3105         if (ret < 0)
3106                 return ERR_PTR(ret);
3107
3108         inode = NULL;
3109         if (location.objectid) {
3110                 ret = fixup_tree_root_location(root, &location, &sub_root,
3111                                                 dentry);
3112                 if (ret < 0)
3113                         return ERR_PTR(ret);
3114                 if (ret > 0)
3115                         return ERR_PTR(-ENOENT);
3116                 inode = btrfs_iget(dir->i_sb, &location, sub_root, &new);
3117                 if (IS_ERR(inode))
3118                         return ERR_CAST(inode);
3119         }
3120         return inode;
3121 }
3122
3123 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
3124                                    struct nameidata *nd)
3125 {
3126         struct inode *inode;
3127
3128         if (dentry->d_name.len > BTRFS_NAME_LEN)
3129                 return ERR_PTR(-ENAMETOOLONG);
3130
3131         inode = btrfs_lookup_dentry(dir, dentry);
3132         if (IS_ERR(inode))
3133                 return ERR_CAST(inode);
3134
3135         return d_splice_alias(inode, dentry);
3136 }
3137
3138 static unsigned char btrfs_filetype_table[] = {
3139         DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
3140 };
3141
3142 static int btrfs_real_readdir(struct file *filp, void *dirent,
3143                               filldir_t filldir)
3144 {
3145         struct inode *inode = filp->f_dentry->d_inode;
3146         struct btrfs_root *root = BTRFS_I(inode)->root;
3147         struct btrfs_item *item;
3148         struct btrfs_dir_item *di;
3149         struct btrfs_key key;
3150         struct btrfs_key found_key;
3151         struct btrfs_path *path;
3152         int ret;
3153         u32 nritems;
3154         struct extent_buffer *leaf;
3155         int slot;
3156         int advance;
3157         unsigned char d_type;
3158         int over = 0;
3159         u32 di_cur;
3160         u32 di_total;
3161         u32 di_len;
3162         int key_type = BTRFS_DIR_INDEX_KEY;
3163         char tmp_name[32];
3164         char *name_ptr;
3165         int name_len;
3166
3167         /* FIXME, use a real flag for deciding about the key type */
3168         if (root->fs_info->tree_root == root)
3169                 key_type = BTRFS_DIR_ITEM_KEY;
3170
3171         /* special case for "." */
3172         if (filp->f_pos == 0) {
3173                 over = filldir(dirent, ".", 1,
3174                                1, inode->i_ino,
3175                                DT_DIR);
3176                 if (over)
3177                         return 0;
3178                 filp->f_pos = 1;
3179         }
3180         /* special case for .., just use the back ref */
3181         if (filp->f_pos == 1) {
3182                 u64 pino = parent_ino(filp->f_path.dentry);
3183                 over = filldir(dirent, "..", 2,
3184                                2, pino, DT_DIR);
3185                 if (over)
3186                         return 0;
3187                 filp->f_pos = 2;
3188         }
3189         path = btrfs_alloc_path();
3190         path->reada = 2;
3191
3192         btrfs_set_key_type(&key, key_type);
3193         key.offset = filp->f_pos;
3194         key.objectid = inode->i_ino;
3195
3196         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3197         if (ret < 0)
3198                 goto err;
3199         advance = 0;
3200
3201         while (1) {
3202                 leaf = path->nodes[0];
3203                 nritems = btrfs_header_nritems(leaf);
3204                 slot = path->slots[0];
3205                 if (advance || slot >= nritems) {
3206                         if (slot >= nritems - 1) {
3207                                 ret = btrfs_next_leaf(root, path);
3208                                 if (ret)
3209                                         break;
3210                                 leaf = path->nodes[0];
3211                                 nritems = btrfs_header_nritems(leaf);
3212                                 slot = path->slots[0];
3213                         } else {
3214                                 slot++;
3215                                 path->slots[0]++;
3216                         }
3217                 }
3218
3219                 advance = 1;
3220                 item = btrfs_item_nr(leaf, slot);
3221                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3222
3223                 if (found_key.objectid != key.objectid)
3224                         break;
3225                 if (btrfs_key_type(&found_key) != key_type)
3226                         break;
3227                 if (found_key.offset < filp->f_pos)
3228                         continue;
3229
3230                 filp->f_pos = found_key.offset;
3231
3232                 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
3233                 di_cur = 0;
3234                 di_total = btrfs_item_size(leaf, item);
3235
3236                 while (di_cur < di_total) {
3237                         struct btrfs_key location;
3238
3239                         name_len = btrfs_dir_name_len(leaf, di);
3240                         if (name_len <= sizeof(tmp_name)) {
3241                                 name_ptr = tmp_name;
3242                         } else {
3243                                 name_ptr = kmalloc(name_len, GFP_NOFS);
3244                                 if (!name_ptr) {
3245                                         ret = -ENOMEM;
3246                                         goto err;
3247                                 }
3248                         }
3249                         read_extent_buffer(leaf, name_ptr,
3250                                            (unsigned long)(di + 1), name_len);
3251
3252                         d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
3253                         btrfs_dir_item_key_to_cpu(leaf, di, &location);
3254
3255                         /* is this a reference to our own snapshot? If so
3256                          * skip it
3257                          */
3258                         if (location.type == BTRFS_ROOT_ITEM_KEY &&
3259                             location.objectid == root->root_key.objectid) {
3260                                 over = 0;
3261                                 goto skip;
3262                         }
3263                         over = filldir(dirent, name_ptr, name_len,
3264                                        found_key.offset, location.objectid,
3265                                        d_type);
3266
3267 skip:
3268                         if (name_ptr != tmp_name)
3269                                 kfree(name_ptr);
3270
3271                         if (over)
3272                                 goto nopos;
3273                         di_len = btrfs_dir_name_len(leaf, di) +
3274                                  btrfs_dir_data_len(leaf, di) + sizeof(*di);
3275                         di_cur += di_len;
3276                         di = (struct btrfs_dir_item *)((char *)di + di_len);
3277                 }
3278         }
3279
3280         /* Reached end of directory/root. Bump pos past the last item. */
3281         if (key_type == BTRFS_DIR_INDEX_KEY)
3282                 filp->f_pos = INT_LIMIT(off_t);
3283         else
3284                 filp->f_pos++;
3285 nopos:
3286         ret = 0;
3287 err:
3288         btrfs_free_path(path);
3289         return ret;
3290 }
3291
3292 int btrfs_write_inode(struct inode *inode, int wait)
3293 {
3294         struct btrfs_root *root = BTRFS_I(inode)->root;
3295         struct btrfs_trans_handle *trans;
3296         int ret = 0;
3297
3298         if (root->fs_info->btree_inode == inode)
3299                 return 0;
3300
3301         if (wait) {
3302                 trans = btrfs_join_transaction(root, 1);
3303                 btrfs_set_trans_block_group(trans, inode);
3304                 ret = btrfs_commit_transaction(trans, root);
3305         }
3306         return ret;
3307 }
3308
3309 /*
3310  * This is somewhat expensive, updating the tree every time the
3311  * inode changes.  But, it is most likely to find the inode in cache.
3312  * FIXME, needs more benchmarking...there are no reasons other than performance
3313  * to keep or drop this code.
3314  */
3315 void btrfs_dirty_inode(struct inode *inode)
3316 {
3317         struct btrfs_root *root = BTRFS_I(inode)->root;
3318         struct btrfs_trans_handle *trans;
3319
3320         trans = btrfs_join_transaction(root, 1);
3321         btrfs_set_trans_block_group(trans, inode);
3322         btrfs_update_inode(trans, root, inode);
3323         btrfs_end_transaction(trans, root);
3324 }
3325
3326 /*
3327  * find the highest existing sequence number in a directory
3328  * and then set the in-memory index_cnt variable to reflect
3329  * free sequence numbers
3330  */
3331 static int btrfs_set_inode_index_count(struct inode *inode)
3332 {
3333         struct btrfs_root *root = BTRFS_I(inode)->root;
3334         struct btrfs_key key, found_key;
3335         struct btrfs_path *path;
3336         struct extent_buffer *leaf;
3337         int ret;
3338
3339         key.objectid = inode->i_ino;
3340         btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
3341         key.offset = (u64)-1;
3342
3343         path = btrfs_alloc_path();
3344         if (!path)
3345                 return -ENOMEM;
3346
3347         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3348         if (ret < 0)
3349                 goto out;
3350         /* FIXME: we should be able to handle this */
3351         if (ret == 0)
3352                 goto out;
3353         ret = 0;
3354
3355         /*
3356          * MAGIC NUMBER EXPLANATION:
3357          * since we search a directory based on f_pos we have to start at 2
3358          * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
3359          * else has to start at 2
3360          */
3361         if (path->slots[0] == 0) {
3362                 BTRFS_I(inode)->index_cnt = 2;
3363                 goto out;
3364         }
3365
3366         path->slots[0]--;
3367
3368         leaf = path->nodes[0];
3369         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3370
3371         if (found_key.objectid != inode->i_ino ||
3372             btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
3373                 BTRFS_I(inode)->index_cnt = 2;
3374                 goto out;
3375         }
3376
3377         BTRFS_I(inode)->index_cnt = found_key.offset + 1;
3378 out:
3379         btrfs_free_path(path);
3380         return ret;
3381 }
3382
3383 /*
3384  * helper to find a free sequence number in a given directory.  This current
3385  * code is very simple, later versions will do smarter things in the btree
3386  */
3387 int btrfs_set_inode_index(struct inode *dir, u64 *index)
3388 {
3389         int ret = 0;
3390
3391         if (BTRFS_I(dir)->index_cnt == (u64)-1) {
3392                 ret = btrfs_set_inode_index_count(dir);
3393                 if (ret)
3394                         return ret;
3395         }
3396
3397         *index = BTRFS_I(dir)->index_cnt;
3398         BTRFS_I(dir)->index_cnt++;
3399
3400         return ret;
3401 }
3402
3403 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3404                                      struct btrfs_root *root,
3405                                      struct inode *dir,
3406                                      const char *name, int name_len,
3407                                      u64 ref_objectid, u64 objectid,
3408                                      u64 alloc_hint, int mode, u64 *index)
3409 {
3410         struct inode *inode;
3411         struct btrfs_inode_item *inode_item;
3412         struct btrfs_key *location;
3413         struct btrfs_path *path;
3414         struct btrfs_inode_ref *ref;
3415         struct btrfs_key key[2];
3416         u32 sizes[2];
3417         unsigned long ptr;
3418         int ret;
3419         int owner;
3420
3421         path = btrfs_alloc_path();
3422         BUG_ON(!path);
3423
3424         inode = new_inode(root->fs_info->sb);
3425         if (!inode)
3426                 return ERR_PTR(-ENOMEM);
3427
3428         if (dir) {
3429                 ret = btrfs_set_inode_index(dir, index);
3430                 if (ret)
3431                         return ERR_PTR(ret);
3432         }
3433         /*
3434          * index_cnt is ignored for everything but a dir,
3435          * btrfs_get_inode_index_count has an explanation for the magic
3436          * number
3437          */
3438         init_btrfs_i(inode);
3439         BTRFS_I(inode)->index_cnt = 2;
3440         BTRFS_I(inode)->root = root;
3441         BTRFS_I(inode)->generation = trans->transid;
3442
3443         if (mode & S_IFDIR)
3444                 owner = 0;
3445         else
3446                 owner = 1;
3447         BTRFS_I(inode)->block_group =
3448                         btrfs_find_block_group(root, 0, alloc_hint, owner);
3449         if ((mode & S_IFREG)) {
3450                 if (btrfs_test_opt(root, NODATASUM))
3451                         btrfs_set_flag(inode, NODATASUM);
3452                 if (btrfs_test_opt(root, NODATACOW))
3453                         btrfs_set_flag(inode, NODATACOW);
3454         }
3455
3456         key[0].objectid = objectid;
3457         btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
3458         key[0].offset = 0;
3459
3460         key[1].objectid = objectid;
3461         btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
3462         key[1].offset = ref_objectid;
3463
3464         sizes[0] = sizeof(struct btrfs_inode_item);
3465         sizes[1] = name_len + sizeof(*ref);
3466
3467         ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
3468         if (ret != 0)
3469                 goto fail;
3470
3471         if (objectid > root->highest_inode)
3472                 root->highest_inode = objectid;
3473
3474         inode->i_uid = current_fsuid();
3475
3476         if (dir->i_mode & S_ISGID) {
3477                 inode->i_gid = dir->i_gid;
3478                 if (S_ISDIR(mode))
3479                         mode |= S_ISGID;
3480         } else
3481                 inode->i_gid = current_fsgid();
3482
3483         inode->i_mode = mode;
3484         inode->i_ino = objectid;
3485         inode_set_bytes(inode, 0);
3486         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
3487         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3488                                   struct btrfs_inode_item);
3489         fill_inode_item(trans, path->nodes[0], inode_item, inode);
3490
3491         ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3492                              struct btrfs_inode_ref);
3493         btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
3494         btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
3495         ptr = (unsigned long)(ref + 1);
3496         write_extent_buffer(path->nodes[0], name, ptr, name_len);
3497
3498         btrfs_mark_buffer_dirty(path->nodes[0]);
3499         btrfs_free_path(path);
3500
3501         location = &BTRFS_I(inode)->location;
3502         location->objectid = objectid;
3503         location->offset = 0;
3504         btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
3505
3506         insert_inode_hash(inode);
3507         return inode;
3508 fail:
3509         if (dir)
3510                 BTRFS_I(dir)->index_cnt--;
3511         btrfs_free_path(path);
3512         return ERR_PTR(ret);
3513 }
3514
3515 static inline u8 btrfs_inode_type(struct inode *inode)
3516 {
3517         return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
3518 }
3519
3520 /*
3521  * utility function to add 'inode' into 'parent_inode' with
3522  * a give name and a given sequence number.
3523  * if 'add_backref' is true, also insert a backref from the
3524  * inode to the parent directory.
3525  */
3526 int btrfs_add_link(struct btrfs_trans_handle *trans,
3527                    struct inode *parent_inode, struct inode *inode,
3528                    const char *name, int name_len, int add_backref, u64 index)
3529 {
3530         int ret;
3531         struct btrfs_key key;
3532         struct btrfs_root *root = BTRFS_I(parent_inode)->root;
3533
3534         key.objectid = inode->i_ino;
3535         btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
3536         key.offset = 0;
3537
3538         ret = btrfs_insert_dir_item(trans, root, name, name_len,
3539                                     parent_inode->i_ino,
3540                                     &key, btrfs_inode_type(inode),
3541                                     index);
3542         if (ret == 0) {
3543                 if (add_backref) {
3544                         ret = btrfs_insert_inode_ref(trans, root,
3545                                                      name, name_len,
3546                                                      inode->i_ino,
3547                                                      parent_inode->i_ino,
3548                                                      index);
3549                 }
3550                 btrfs_i_size_write(parent_inode, parent_inode->i_size +
3551                                    name_len * 2);
3552                 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
3553                 ret = btrfs_update_inode(trans, root, parent_inode);
3554         }
3555         return ret;
3556 }
3557
3558 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
3559                             struct dentry *dentry, struct inode *inode,
3560                             int backref, u64 index)
3561 {
3562         int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3563                                  inode, dentry->d_name.name,
3564                                  dentry->d_name.len, backref, index);
3565         if (!err) {
3566                 d_instantiate(dentry, inode);
3567                 return 0;
3568         }
3569         if (err > 0)
3570                 err = -EEXIST;
3571         return err;
3572 }
3573
3574 static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3575                         int mode, dev_t rdev)
3576 {
3577         struct btrfs_trans_handle *trans;
3578         struct btrfs_root *root = BTRFS_I(dir)->root;
3579         struct inode *inode = NULL;
3580         int err;
3581         int drop_inode = 0;
3582         u64 objectid;
3583         unsigned long nr = 0;
3584         u64 index = 0;
3585
3586         if (!new_valid_dev(rdev))
3587                 return -EINVAL;
3588
3589         err = btrfs_check_free_space(root, 1, 0);
3590         if (err)
3591                 goto fail;
3592
3593         trans = btrfs_start_transaction(root, 1);
3594         btrfs_set_trans_block_group(trans, dir);
3595
3596         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3597         if (err) {
3598                 err = -ENOSPC;
3599                 goto out_unlock;
3600         }
3601
3602         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3603                                 dentry->d_name.len,
3604                                 dentry->d_parent->d_inode->i_ino, objectid,
3605                                 BTRFS_I(dir)->block_group, mode, &index);
3606         err = PTR_ERR(inode);
3607         if (IS_ERR(inode))
3608                 goto out_unlock;
3609
3610         err = btrfs_init_inode_security(inode, dir);
3611         if (err) {
3612                 drop_inode = 1;
3613                 goto out_unlock;
3614         }
3615
3616         btrfs_set_trans_block_group(trans, inode);
3617         err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3618         if (err)
3619                 drop_inode = 1;
3620         else {
3621                 inode->i_op = &btrfs_special_inode_operations;
3622                 init_special_inode(inode, inode->i_mode, rdev);
3623                 btrfs_update_inode(trans, root, inode);
3624         }
3625         dir->i_sb->s_dirt = 1;
3626         btrfs_update_inode_block_group(trans, inode);
3627         btrfs_update_inode_block_group(trans, dir);
3628 out_unlock:
3629         nr = trans->blocks_used;
3630         btrfs_end_transaction_throttle(trans, root);
3631 fail:
3632         if (drop_inode) {
3633                 inode_dec_link_count(inode);
3634                 iput(inode);
3635         }
3636         btrfs_btree_balance_dirty(root, nr);
3637         return err;
3638 }
3639
3640 static int btrfs_create(struct inode *dir, struct dentry *dentry,
3641                         int mode, struct nameidata *nd)
3642 {
3643         struct btrfs_trans_handle *trans;
3644         struct btrfs_root *root = BTRFS_I(dir)->root;
3645         struct inode *inode = NULL;
3646         int err;
3647         int drop_inode = 0;
3648         unsigned long nr = 0;
3649         u64 objectid;
3650         u64 index = 0;
3651
3652         err = btrfs_check_free_space(root, 1, 0);
3653         if (err)
3654                 goto fail;
3655         trans = btrfs_start_transaction(root, 1);
3656         btrfs_set_trans_block_group(trans, dir);
3657
3658         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3659         if (err) {
3660                 err = -ENOSPC;
3661                 goto out_unlock;
3662         }
3663
3664         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3665                                 dentry->d_name.len,
3666                                 dentry->d_parent->d_inode->i_ino,
3667                                 objectid, BTRFS_I(dir)->block_group, mode,
3668                                 &index);
3669         err = PTR_ERR(inode);
3670         if (IS_ERR(inode))
3671                 goto out_unlock;
3672
3673         err = btrfs_init_inode_security(inode, dir);
3674         if (err) {
3675                 drop_inode = 1;
3676                 goto out_unlock;
3677         }
3678
3679         btrfs_set_trans_block_group(trans, inode);
3680         err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3681         if (err)
3682                 drop_inode = 1;
3683         else {
3684                 inode->i_mapping->a_ops = &btrfs_aops;
3685                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3686                 inode->i_fop = &btrfs_file_operations;
3687                 inode->i_op = &btrfs_file_inode_operations;
3688                 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3689         }
3690         dir->i_sb->s_dirt = 1;
3691         btrfs_update_inode_block_group(trans, inode);
3692         btrfs_update_inode_block_group(trans, dir);
3693 out_unlock:
3694         nr = trans->blocks_used;
3695         btrfs_end_transaction_throttle(trans, root);
3696 fail:
3697         if (drop_inode) {
3698                 inode_dec_link_count(inode);
3699                 iput(inode);
3700         }
3701         btrfs_btree_balance_dirty(root, nr);
3702         return err;
3703 }
3704
3705 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3706                       struct dentry *dentry)
3707 {
3708         struct btrfs_trans_handle *trans;
3709         struct btrfs_root *root = BTRFS_I(dir)->root;
3710         struct inode *inode = old_dentry->d_inode;
3711         u64 index;
3712         unsigned long nr = 0;
3713         int err;
3714         int drop_inode = 0;
3715
3716         if (inode->i_nlink == 0)
3717                 return -ENOENT;
3718
3719         btrfs_inc_nlink(inode);
3720         err = btrfs_check_free_space(root, 1, 0);
3721         if (err)
3722                 goto fail;
3723         err = btrfs_set_inode_index(dir, &index);
3724         if (err)
3725                 goto fail;
3726
3727         trans = btrfs_start_transaction(root, 1);
3728
3729         btrfs_set_trans_block_group(trans, dir);
3730         atomic_inc(&inode->i_count);
3731
3732         err = btrfs_add_nondir(trans, dentry, inode, 1, index);
3733
3734         if (err)
3735                 drop_inode = 1;
3736
3737         dir->i_sb->s_dirt = 1;
3738         btrfs_update_inode_block_group(trans, dir);
3739         err = btrfs_update_inode(trans, root, inode);
3740
3741         if (err)
3742                 drop_inode = 1;
3743
3744         nr = trans->blocks_used;
3745         btrfs_end_transaction_throttle(trans, root);
3746 fail:
3747         if (drop_inode) {
3748                 inode_dec_link_count(inode);
3749                 iput(inode);
3750         }
3751         btrfs_btree_balance_dirty(root, nr);
3752         return err;
3753 }
3754
3755 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3756 {
3757         struct inode *inode = NULL;
3758         struct btrfs_trans_handle *trans;
3759         struct btrfs_root *root = BTRFS_I(dir)->root;
3760         int err = 0;
3761         int drop_on_err = 0;
3762         u64 objectid = 0;
3763         u64 index = 0;
3764         unsigned long nr = 1;
3765
3766         err = btrfs_check_free_space(root, 1, 0);
3767         if (err)
3768                 goto out_unlock;
3769
3770         trans = btrfs_start_transaction(root, 1);
3771         btrfs_set_trans_block_group(trans, dir);
3772
3773         if (IS_ERR(trans)) {
3774                 err = PTR_ERR(trans);
3775                 goto out_unlock;
3776         }
3777
3778         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3779         if (err) {
3780                 err = -ENOSPC;
3781                 goto out_unlock;
3782         }
3783
3784         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3785                                 dentry->d_name.len,
3786                                 dentry->d_parent->d_inode->i_ino, objectid,
3787                                 BTRFS_I(dir)->block_group, S_IFDIR | mode,
3788                                 &index);
3789         if (IS_ERR(inode)) {
3790                 err = PTR_ERR(inode);
3791                 goto out_fail;
3792         }
3793
3794         drop_on_err = 1;
3795
3796         err = btrfs_init_inode_security(inode, dir);
3797         if (err)
3798                 goto out_fail;
3799
3800         inode->i_op = &btrfs_dir_inode_operations;
3801         inode->i_fop = &btrfs_dir_file_operations;
3802         btrfs_set_trans_block_group(trans, inode);
3803
3804         btrfs_i_size_write(inode, 0);
3805         err = btrfs_update_inode(trans, root, inode);
3806         if (err)
3807                 goto out_fail;
3808
3809         err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3810                                  inode, dentry->d_name.name,
3811                                  dentry->d_name.len, 0, index);
3812         if (err)
3813                 goto out_fail;
3814
3815         d_instantiate(dentry, inode);
3816         drop_on_err = 0;
3817         dir->i_sb->s_dirt = 1;
3818         btrfs_update_inode_block_group(trans, inode);
3819         btrfs_update_inode_block_group(trans, dir);
3820
3821 out_fail:
3822         nr = trans->blocks_used;
3823         btrfs_end_transaction_throttle(trans, root);
3824
3825 out_unlock:
3826         if (drop_on_err)
3827                 iput(inode);
3828         btrfs_btree_balance_dirty(root, nr);
3829         return err;
3830 }
3831
3832 /* helper for btfs_get_extent.  Given an existing extent in the tree,
3833  * and an extent that you want to insert, deal with overlap and insert
3834  * the new extent into the tree.
3835  */
3836 static int merge_extent_mapping(struct extent_map_tree *em_tree,
3837                                 struct extent_map *existing,
3838                                 struct extent_map *em,
3839                                 u64 map_start, u64 map_len)
3840 {
3841         u64 start_diff;
3842
3843         BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
3844         start_diff = map_start - em->start;
3845         em->start = map_start;
3846         em->len = map_len;
3847         if (em->block_start < EXTENT_MAP_LAST_BYTE &&
3848             !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
3849                 em->block_start += start_diff;
3850                 em->block_len -= start_diff;
3851         }
3852         return add_extent_mapping(em_tree, em);
3853 }
3854
3855 static noinline int uncompress_inline(struct btrfs_path *path,
3856                                       struct inode *inode, struct page *page,
3857                                       size_t pg_offset, u64 extent_offset,
3858                                       struct btrfs_file_extent_item *item)
3859 {
3860         int ret;
3861         struct extent_buffer *leaf = path->nodes[0];
3862         char *tmp;
3863         size_t max_size;
3864         unsigned long inline_size;
3865         unsigned long ptr;
3866
3867         WARN_ON(pg_offset != 0);
3868         max_size = btrfs_file_extent_ram_bytes(leaf, item);
3869         inline_size = btrfs_file_extent_inline_item_len(leaf,
3870                                         btrfs_item_nr(leaf, path->slots[0]));
3871         tmp = kmalloc(inline_size, GFP_NOFS);
3872         ptr = btrfs_file_extent_inline_start(item);
3873
3874         read_extent_buffer(leaf, tmp, ptr, inline_size);
3875
3876         max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
3877         ret = btrfs_zlib_decompress(tmp, page, extent_offset,
3878                                     inline_size, max_size);
3879         if (ret) {
3880                 char *kaddr = kmap_atomic(page, KM_USER0);
3881                 unsigned long copy_size = min_t(u64,
3882                                   PAGE_CACHE_SIZE - pg_offset,
3883                                   max_size - extent_offset);
3884                 memset(kaddr + pg_offset, 0, copy_size);
3885                 kunmap_atomic(kaddr, KM_USER0);
3886         }
3887         kfree(tmp);
3888         return 0;
3889 }
3890
3891 /*
3892  * a bit scary, this does extent mapping from logical file offset to the disk.
3893  * the ugly parts come from merging extents from the disk with the in-ram
3894  * representation.  This gets more complex because of the data=ordered code,
3895  * where the in-ram extents might be locked pending data=ordered completion.
3896  *
3897  * This also copies inline extents directly into the page.
3898  */
3899
3900 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
3901                                     size_t pg_offset, u64 start, u64 len,
3902                                     int create)
3903 {
3904         int ret;
3905         int err = 0;
3906         u64 bytenr;
3907         u64 extent_start = 0;
3908         u64 extent_end = 0;
3909         u64 objectid = inode->i_ino;
3910         u32 found_type;
3911         struct btrfs_path *path = NULL;
3912         struct btrfs_root *root = BTRFS_I(inode)->root;
3913         struct btrfs_file_extent_item *item;
3914         struct extent_buffer *leaf;
3915         struct btrfs_key found_key;
3916         struct extent_map *em = NULL;
3917         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3918         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3919         struct btrfs_trans_handle *trans = NULL;
3920         int compressed;
3921
3922 again:
3923         spin_lock(&em_tree->lock);
3924         em = lookup_extent_mapping(em_tree, start, len);
3925         if (em)
3926                 em->bdev = root->fs_info->fs_devices->latest_bdev;
3927         spin_unlock(&em_tree->lock);
3928
3929         if (em) {
3930                 if (em->start > start || em->start + em->len <= start)
3931                         free_extent_map(em);
3932                 else if (em->block_start == EXTENT_MAP_INLINE && page)
3933                         free_extent_map(em);
3934                 else
3935                         goto out;
3936         }
3937         em = alloc_extent_map(GFP_NOFS);
3938         if (!em) {
3939                 err = -ENOMEM;
3940                 goto out;
3941         }
3942         em->bdev = root->fs_info->fs_devices->latest_bdev;
3943         em->start = EXTENT_MAP_HOLE;
3944         em->orig_start = EXTENT_MAP_HOLE;
3945         em->len = (u64)-1;
3946         em->block_len = (u64)-1;
3947
3948         if (!path) {
3949                 path = btrfs_alloc_path();
3950                 BUG_ON(!path);
3951         }
3952
3953         ret = btrfs_lookup_file_extent(trans, root, path,
3954                                        objectid, start, trans != NULL);
3955         if (ret < 0) {
3956                 err = ret;
3957                 goto out;
3958         }
3959
3960         if (ret != 0) {
3961                 if (path->slots[0] == 0)
3962                         goto not_found;
3963                 path->slots[0]--;
3964         }
3965
3966         leaf = path->nodes[0];
3967         item = btrfs_item_ptr(leaf, path->slots[0],
3968                               struct btrfs_file_extent_item);
3969         /* are we inside the extent that was found? */
3970         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3971         found_type = btrfs_key_type(&found_key);
3972         if (found_key.objectid != objectid ||
3973             found_type != BTRFS_EXTENT_DATA_KEY) {
3974                 goto not_found;
3975         }
3976
3977         found_type = btrfs_file_extent_type(leaf, item);
3978         extent_start = found_key.offset;
3979         compressed = btrfs_file_extent_compression(leaf, item);
3980         if (found_type == BTRFS_FILE_EXTENT_REG ||
3981             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
3982                 extent_end = extent_start +
3983                        btrfs_file_extent_num_bytes(leaf, item);
3984         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
3985                 size_t size;
3986                 size = btrfs_file_extent_inline_len(leaf, item);
3987                 extent_end = (extent_start + size + root->sectorsize - 1) &
3988                         ~((u64)root->sectorsize - 1);
3989         }
3990
3991         if (start >= extent_end) {
3992                 path->slots[0]++;
3993                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3994                         ret = btrfs_next_leaf(root, path);
3995                         if (ret < 0) {
3996                                 err = ret;
3997                                 goto out;
3998                         }
3999                         if (ret > 0)
4000                                 goto not_found;
4001                         leaf = path->nodes[0];
4002                 }
4003                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4004                 if (found_key.objectid != objectid ||
4005                     found_key.type != BTRFS_EXTENT_DATA_KEY)
4006                         goto not_found;
4007                 if (start + len <= found_key.offset)
4008                         goto not_found;
4009                 em->start = start;
4010                 em->len = found_key.offset - start;
4011                 goto not_found_em;
4012         }
4013
4014         if (found_type == BTRFS_FILE_EXTENT_REG ||
4015             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
4016                 em->start = extent_start;
4017                 em->len = extent_end - extent_start;
4018                 em->orig_start = extent_start -
4019                                  btrfs_file_extent_offset(leaf, item);
4020                 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
4021                 if (bytenr == 0) {
4022                         em->block_start = EXTENT_MAP_HOLE;
4023                         goto insert;
4024                 }
4025                 if (compressed) {
4026                         set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4027                         em->block_start = bytenr;
4028                         em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
4029                                                                          item);
4030                 } else {
4031                         bytenr += btrfs_file_extent_offset(leaf, item);
4032                         em->block_start = bytenr;
4033                         em->block_len = em->len;
4034                         if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
4035                                 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
4036                 }
4037                 goto insert;
4038         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
4039                 unsigned long ptr;
4040                 char *map;
4041                 size_t size;
4042                 size_t extent_offset;
4043                 size_t copy_size;
4044
4045                 em->block_start = EXTENT_MAP_INLINE;
4046                 if (!page || create) {
4047                         em->start = extent_start;
4048                         em->len = extent_end - extent_start;
4049                         goto out;
4050                 }
4051
4052                 size = btrfs_file_extent_inline_len(leaf, item);
4053                 extent_offset = page_offset(page) + pg_offset - extent_start;
4054                 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
4055                                 size - extent_offset);
4056                 em->start = extent_start + extent_offset;
4057                 em->len = (copy_size + root->sectorsize - 1) &
4058                         ~((u64)root->sectorsize - 1);
4059                 em->orig_start = EXTENT_MAP_INLINE;
4060                 if (compressed)
4061                         set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4062                 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
4063                 if (create == 0 && !PageUptodate(page)) {
4064                         if (btrfs_file_extent_compression(leaf, item) ==
4065                             BTRFS_COMPRESS_ZLIB) {
4066                                 ret = uncompress_inline(path, inode, page,
4067                                                         pg_offset,
4068                                                         extent_offset, item);
4069                                 BUG_ON(ret);
4070                         } else {
4071                                 map = kmap(page);
4072                                 read_extent_buffer(leaf, map + pg_offset, ptr,
4073                                                    copy_size);
4074                                 kunmap(page);
4075                         }
4076                         flush_dcache_page(page);
4077                 } else if (create && PageUptodate(page)) {
4078                         if (!trans) {
4079                                 kunmap(page);
4080                                 free_extent_map(em);
4081                                 em = NULL;
4082                                 btrfs_release_path(root, path);
4083                                 trans = btrfs_join_transaction(root, 1);
4084                                 goto again;
4085                         }
4086                         map = kmap(page);
4087                         write_extent_buffer(leaf, map + pg_offset, ptr,
4088                                             copy_size);
4089                         kunmap(page);
4090                         btrfs_mark_buffer_dirty(leaf);
4091                 }
4092                 set_extent_uptodate(io_tree, em->start,
4093                                     extent_map_end(em) - 1, GFP_NOFS);
4094                 goto insert;
4095         } else {
4096                 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
4097                 WARN_ON(1);
4098         }
4099 not_found:
4100         em->start = start;
4101         em->len = len;
4102 not_found_em:
4103         em->block_start = EXTENT_MAP_HOLE;
4104         set_bit(EXTENT_FLAG_VACANCY, &em->flags);
4105 insert:
4106         btrfs_release_path(root, path);
4107         if (em->start > start || extent_map_end(em) <= start) {
4108                 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
4109                        "[%llu %llu]\n", (unsigned long long)em->start,
4110                        (unsigned long long)em->len,
4111                        (unsigned long long)start,
4112                        (unsigned long long)len);
4113                 err = -EIO;
4114                 goto out;
4115         }
4116
4117         err = 0;
4118         spin_lock(&em_tree->lock);
4119         ret = add_extent_mapping(em_tree, em);
4120         /* it is possible that someone inserted the extent into the tree
4121          * while we had the lock dropped.  It is also possible that
4122          * an overlapping map exists in the tree
4123          */
4124         if (ret == -EEXIST) {
4125                 struct extent_map *existing;
4126
4127                 ret = 0;
4128
4129                 existing = lookup_extent_mapping(em_tree, start, len);
4130                 if (existing && (existing->start > start ||
4131                     existing->start + existing->len <= start)) {
4132                         free_extent_map(existing);
4133                         existing = NULL;
4134                 }
4135                 if (!existing) {
4136                         existing = lookup_extent_mapping(em_tree, em->start,
4137                                                          em->len);
4138                         if (existing) {
4139                                 err = merge_extent_mapping(em_tree, existing,
4140                                                            em, start,
4141                                                            root->sectorsize);
4142                                 free_extent_map(existing);
4143                                 if (err) {
4144                                         free_extent_map(em);
4145                                         em = NULL;
4146                                 }
4147                         } else {
4148                                 err = -EIO;
4149                                 free_extent_map(em);
4150                                 em = NULL;
4151                         }
4152                 } else {
4153                         free_extent_map(em);
4154                         em = existing;
4155                         err = 0;
4156                 }
4157         }
4158         spin_unlock(&em_tree->lock);
4159 out:
4160         if (path)
4161                 btrfs_free_path(path);
4162         if (trans) {
4163                 ret = btrfs_end_transaction(trans, root);
4164                 if (!err)
4165                         err = ret;
4166         }
4167         if (err) {
4168                 free_extent_map(em);
4169                 WARN_ON(1);
4170                 return ERR_PTR(err);
4171         }
4172         return em;
4173 }
4174
4175 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4176                         const struct iovec *iov, loff_t offset,
4177                         unsigned long nr_segs)
4178 {
4179         return -EINVAL;
4180 }
4181
4182 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4183                 __u64 start, __u64 len)
4184 {
4185         return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
4186 }
4187
4188 int btrfs_readpage(struct file *file, struct page *page)
4189 {
4190         struct extent_io_tree *tree;
4191         tree = &BTRFS_I(page->mapping->host)->io_tree;
4192         return extent_read_full_page(tree, page, btrfs_get_extent);
4193 }
4194
4195 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
4196 {
4197         struct extent_io_tree *tree;
4198
4199
4200         if (current->flags & PF_MEMALLOC) {
4201                 redirty_page_for_writepage(wbc, page);
4202                 unlock_page(page);
4203                 return 0;
4204         }
4205         tree = &BTRFS_I(page->mapping->host)->io_tree;
4206         return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
4207 }
4208
4209 int btrfs_writepages(struct address_space *mapping,
4210                      struct writeback_control *wbc)
4211 {
4212         struct extent_io_tree *tree;
4213
4214         tree = &BTRFS_I(mapping->host)->io_tree;
4215         return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
4216 }
4217
4218 static int
4219 btrfs_readpages(struct file *file, struct address_space *mapping,
4220                 struct list_head *pages, unsigned nr_pages)
4221 {
4222         struct extent_io_tree *tree;
4223         tree = &BTRFS_I(mapping->host)->io_tree;
4224         return extent_readpages(tree, mapping, pages, nr_pages,
4225                                 btrfs_get_extent);
4226 }
4227 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4228 {
4229         struct extent_io_tree *tree;
4230         struct extent_map_tree *map;
4231         int ret;
4232
4233         tree = &BTRFS_I(page->mapping->host)->io_tree;
4234         map = &BTRFS_I(page->mapping->host)->extent_tree;
4235         ret = try_release_extent_mapping(map, tree, page, gfp_flags);
4236         if (ret == 1) {
4237                 ClearPagePrivate(page);
4238                 set_page_private(page, 0);
4239                 page_cache_release(page);
4240         }
4241         return ret;
4242 }
4243
4244 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4245 {
4246         if (PageWriteback(page) || PageDirty(page))
4247                 return 0;
4248         return __btrfs_releasepage(page, gfp_flags);
4249 }
4250
4251 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4252 {
4253         struct extent_io_tree *tree;
4254         struct btrfs_ordered_extent *ordered;
4255         u64 page_start = page_offset(page);
4256         u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
4257
4258         wait_on_page_writeback(page);
4259         tree = &BTRFS_I(page->mapping->host)->io_tree;
4260         if (offset) {
4261                 btrfs_releasepage(page, GFP_NOFS);
4262                 return;
4263         }
4264
4265         lock_extent(tree, page_start, page_end, GFP_NOFS);
4266         ordered = btrfs_lookup_ordered_extent(page->mapping->host,
4267                                            page_offset(page));
4268         if (ordered) {
4269                 /*
4270                  * IO on this page will never be started, so we need
4271                  * to account for any ordered extents now
4272                  */
4273                 clear_extent_bit(tree, page_start, page_end,
4274                                  EXTENT_DIRTY | EXTENT_DELALLOC |
4275                                  EXTENT_LOCKED, 1, 0, GFP_NOFS);
4276                 btrfs_finish_ordered_io(page->mapping->host,
4277                                         page_start, page_end);
4278                 btrfs_put_ordered_extent(ordered);
4279                 lock_extent(tree, page_start, page_end, GFP_NOFS);
4280         }
4281         clear_extent_bit(tree, page_start, page_end,
4282                  EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4283                  EXTENT_ORDERED,
4284                  1, 1, GFP_NOFS);
4285         __btrfs_releasepage(page, GFP_NOFS);
4286
4287         ClearPageChecked(page);
4288         if (PagePrivate(page)) {
4289                 ClearPagePrivate(page);
4290                 set_page_private(page, 0);
4291                 page_cache_release(page);
4292         }
4293 }
4294
4295 /*
4296  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
4297  * called from a page fault handler when a page is first dirtied. Hence we must
4298  * be careful to check for EOF conditions here. We set the page up correctly
4299  * for a written page which means we get ENOSPC checking when writing into
4300  * holes and correct delalloc and unwritten extent mapping on filesystems that
4301  * support these features.
4302  *
4303  * We are not allowed to take the i_mutex here so we have to play games to
4304  * protect against truncate races as the page could now be beyond EOF.  Because
4305  * vmtruncate() writes the inode size before removing pages, once we have the
4306  * page lock we can determine safely if the page is beyond EOF. If it is not
4307  * beyond EOF, then the page is guaranteed safe against truncation until we
4308  * unlock the page.
4309  */
4310 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4311 {
4312         struct inode *inode = fdentry(vma->vm_file)->d_inode;
4313         struct btrfs_root *root = BTRFS_I(inode)->root;
4314         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4315         struct btrfs_ordered_extent *ordered;
4316         char *kaddr;
4317         unsigned long zero_start;
4318         loff_t size;
4319         int ret;
4320         u64 page_start;
4321         u64 page_end;
4322
4323         ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
4324         if (ret)
4325                 goto out;
4326
4327         ret = -EINVAL;
4328 again:
4329         lock_page(page);
4330         size = i_size_read(inode);
4331         page_start = page_offset(page);
4332         page_end = page_start + PAGE_CACHE_SIZE - 1;
4333
4334         if ((page->mapping != inode->i_mapping) ||
4335             (page_start >= size)) {
4336                 /* page got truncated out from underneath us */
4337                 goto out_unlock;
4338         }
4339         wait_on_page_writeback(page);
4340
4341         lock_extent(io_tree, page_start, page_end, GFP_NOFS);
4342         set_page_extent_mapped(page);
4343
4344         /*
4345          * we can't set the delalloc bits if there are pending ordered
4346          * extents.  Drop our locks and wait for them to finish
4347          */
4348         ordered = btrfs_lookup_ordered_extent(inode, page_start);
4349         if (ordered) {
4350                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4351                 unlock_page(page);
4352                 btrfs_start_ordered_extent(inode, ordered, 1);
4353                 btrfs_put_ordered_extent(ordered);
4354                 goto again;
4355         }
4356
4357         btrfs_set_extent_delalloc(inode, page_start, page_end);
4358         ret = 0;
4359
4360         /* page is wholly or partially inside EOF */
4361         if (page_start + PAGE_CACHE_SIZE > size)
4362                 zero_start = size & ~PAGE_CACHE_MASK;
4363         else
4364                 zero_start = PAGE_CACHE_SIZE;
4365
4366         if (zero_start != PAGE_CACHE_SIZE) {
4367                 kaddr = kmap(page);
4368                 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
4369                 flush_dcache_page(page);
4370                 kunmap(page);
4371         }
4372         ClearPageChecked(page);
4373         set_page_dirty(page);
4374         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4375
4376 out_unlock:
4377         unlock_page(page);
4378 out:
4379         return ret;
4380 }
4381
4382 static void btrfs_truncate(struct inode *inode)
4383 {
4384         struct btrfs_root *root = BTRFS_I(inode)->root;
4385         int ret;
4386         struct btrfs_trans_handle *trans;
4387         unsigned long nr;
4388         u64 mask = root->sectorsize - 1;
4389
4390         if (!S_ISREG(inode->i_mode))
4391                 return;
4392         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4393                 return;
4394
4395         btrfs_truncate_page(inode->i_mapping, inode->i_size);
4396         btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4397
4398         trans = btrfs_start_transaction(root, 1);
4399         btrfs_set_trans_block_group(trans, inode);
4400         btrfs_i_size_write(inode, inode->i_size);
4401
4402         ret = btrfs_orphan_add(trans, inode);
4403         if (ret)
4404                 goto out;
4405         /* FIXME, add redo link to tree so we don't leak on crash */
4406         ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
4407                                       BTRFS_EXTENT_DATA_KEY);
4408         btrfs_update_inode(trans, root, inode);
4409
4410         ret = btrfs_orphan_del(trans, inode);
4411         BUG_ON(ret);
4412
4413 out:
4414         nr = trans->blocks_used;
4415         ret = btrfs_end_transaction_throttle(trans, root);
4416         BUG_ON(ret);
4417         btrfs_btree_balance_dirty(root, nr);
4418 }
4419
4420 /*
4421  * create a new subvolume directory/inode (helper for the ioctl).
4422  */
4423 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
4424                              struct btrfs_root *new_root, struct dentry *dentry,
4425                              u64 new_dirid, u64 alloc_hint)
4426 {
4427         struct inode *inode;
4428         int error;
4429         u64 index = 0;
4430
4431         inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
4432                                 new_dirid, alloc_hint, S_IFDIR | 0700, &index);
4433         if (IS_ERR(inode))
4434                 return PTR_ERR(inode);
4435         inode->i_op = &btrfs_dir_inode_operations;
4436         inode->i_fop = &btrfs_dir_file_operations;
4437
4438         inode->i_nlink = 1;
4439         btrfs_i_size_write(inode, 0);
4440
4441         error = btrfs_update_inode(trans, new_root, inode);
4442         if (error)
4443                 return error;
4444
4445         d_instantiate(dentry, inode);
4446         return 0;
4447 }
4448
4449 /* helper function for file defrag and space balancing.  This
4450  * forces readahead on a given range of bytes in an inode
4451  */
4452 unsigned long btrfs_force_ra(struct address_space *mapping,
4453                               struct file_ra_state *ra, struct file *file,
4454                               pgoff_t offset, pgoff_t last_index)
4455 {
4456         pgoff_t req_size = last_index - offset + 1;
4457
4458         page_cache_sync_readahead(mapping, ra, file, offset, req_size);
4459         return offset + req_size;
4460 }
4461
4462 struct inode *btrfs_alloc_inode(struct super_block *sb)
4463 {
4464         struct btrfs_inode *ei;
4465
4466         ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
4467         if (!ei)
4468                 return NULL;
4469         ei->last_trans = 0;
4470         ei->logged_trans = 0;
4471         btrfs_ordered_inode_tree_init(&ei->ordered_tree);
4472         ei->i_acl = BTRFS_ACL_NOT_CACHED;
4473         ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4474         INIT_LIST_HEAD(&ei->i_orphan);
4475         return &ei->vfs_inode;
4476 }
4477
4478 void btrfs_destroy_inode(struct inode *inode)
4479 {
4480         struct btrfs_ordered_extent *ordered;
4481         WARN_ON(!list_empty(&inode->i_dentry));
4482         WARN_ON(inode->i_data.nrpages);
4483
4484         if (BTRFS_I(inode)->i_acl &&
4485             BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED)
4486                 posix_acl_release(BTRFS_I(inode)->i_acl);
4487         if (BTRFS_I(inode)->i_default_acl &&
4488             BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4489                 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4490
4491         spin_lock(&BTRFS_I(inode)->root->list_lock);
4492         if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4493                 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
4494                        " list\n", inode->i_ino);
4495                 dump_stack();
4496         }
4497         spin_unlock(&BTRFS_I(inode)->root->list_lock);
4498
4499         while (1) {
4500                 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
4501                 if (!ordered)
4502                         break;
4503                 else {
4504                         printk(KERN_ERR "btrfs found ordered "
4505                                "extent %llu %llu on inode cleanup\n",
4506                                (unsigned long long)ordered->file_offset,
4507                                (unsigned long long)ordered->len);
4508                         btrfs_remove_ordered_extent(inode, ordered);
4509                         btrfs_put_ordered_extent(ordered);
4510                         btrfs_put_ordered_extent(ordered);
4511                 }
4512         }
4513         btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
4514         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
4515 }
4516
4517 static void init_once(void *foo)
4518 {
4519         struct btrfs_inode *ei = (struct btrfs_inode *) foo;
4520
4521         inode_init_once(&ei->vfs_inode);
4522 }
4523
4524 void btrfs_destroy_cachep(void)
4525 {
4526         if (btrfs_inode_cachep)
4527                 kmem_cache_destroy(btrfs_inode_cachep);
4528         if (btrfs_trans_handle_cachep)
4529                 kmem_cache_destroy(btrfs_trans_handle_cachep);
4530         if (btrfs_transaction_cachep)
4531                 kmem_cache_destroy(btrfs_transaction_cachep);
4532         if (btrfs_bit_radix_cachep)
4533                 kmem_cache_destroy(btrfs_bit_radix_cachep);
4534         if (btrfs_path_cachep)
4535                 kmem_cache_destroy(btrfs_path_cachep);
4536 }
4537
4538 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
4539                                        unsigned long extra_flags,
4540                                        void (*ctor)(void *))
4541 {
4542         return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
4543                                  SLAB_MEM_SPREAD | extra_flags), ctor);
4544 }
4545
4546 int btrfs_init_cachep(void)
4547 {
4548         btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache",
4549                                           sizeof(struct btrfs_inode),
4550                                           0, init_once);
4551         if (!btrfs_inode_cachep)
4552                 goto fail;
4553         btrfs_trans_handle_cachep =
4554                         btrfs_cache_create("btrfs_trans_handle_cache",
4555                                            sizeof(struct btrfs_trans_handle),
4556                                            0, NULL);
4557         if (!btrfs_trans_handle_cachep)
4558                 goto fail;
4559         btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache",
4560                                              sizeof(struct btrfs_transaction),
4561                                              0, NULL);
4562         if (!btrfs_transaction_cachep)
4563                 goto fail;
4564         btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache",
4565                                          sizeof(struct btrfs_path),
4566                                          0, NULL);
4567         if (!btrfs_path_cachep)
4568                 goto fail;
4569         btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256,
4570                                               SLAB_DESTROY_BY_RCU, NULL);
4571         if (!btrfs_bit_radix_cachep)
4572                 goto fail;
4573         return 0;
4574 fail:
4575         btrfs_destroy_cachep();
4576         return -ENOMEM;
4577 }
4578
4579 static int btrfs_getattr(struct vfsmount *mnt,
4580                          struct dentry *dentry, struct kstat *stat)
4581 {
4582         struct inode *inode = dentry->d_inode;
4583         generic_fillattr(inode, stat);
4584         stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
4585         stat->blksize = PAGE_CACHE_SIZE;
4586         stat->blocks = (inode_get_bytes(inode) +
4587                         BTRFS_I(inode)->delalloc_bytes) >> 9;
4588         return 0;
4589 }
4590
4591 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4592                            struct inode *new_dir, struct dentry *new_dentry)
4593 {
4594         struct btrfs_trans_handle *trans;
4595         struct btrfs_root *root = BTRFS_I(old_dir)->root;
4596         struct inode *new_inode = new_dentry->d_inode;
4597         struct inode *old_inode = old_dentry->d_inode;
4598         struct timespec ctime = CURRENT_TIME;
4599         u64 index = 0;
4600         int ret;
4601
4602         /* we're not allowed to rename between subvolumes */
4603         if (BTRFS_I(old_inode)->root->root_key.objectid !=
4604             BTRFS_I(new_dir)->root->root_key.objectid)
4605                 return -EXDEV;
4606
4607         if (S_ISDIR(old_inode->i_mode) && new_inode &&
4608             new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
4609                 return -ENOTEMPTY;
4610         }
4611
4612         /* to rename a snapshot or subvolume, we need to juggle the
4613          * backrefs.  This isn't coded yet
4614          */
4615         if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
4616                 return -EXDEV;
4617
4618         ret = btrfs_check_free_space(root, 1, 0);
4619         if (ret)
4620                 goto out_unlock;
4621
4622         trans = btrfs_start_transaction(root, 1);
4623
4624         btrfs_set_trans_block_group(trans, new_dir);
4625
4626         btrfs_inc_nlink(old_dentry->d_inode);
4627         old_dir->i_ctime = old_dir->i_mtime = ctime;
4628         new_dir->i_ctime = new_dir->i_mtime = ctime;
4629         old_inode->i_ctime = ctime;
4630
4631         ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
4632                                  old_dentry->d_name.name,
4633                                  old_dentry->d_name.len);
4634         if (ret)
4635                 goto out_fail;
4636
4637         if (new_inode) {
4638                 new_inode->i_ctime = CURRENT_TIME;
4639                 ret = btrfs_unlink_inode(trans, root, new_dir,
4640                                          new_dentry->d_inode,
4641                                          new_dentry->d_name.name,
4642                                          new_dentry->d_name.len);
4643                 if (ret)
4644                         goto out_fail;
4645                 if (new_inode->i_nlink == 0) {
4646                         ret = btrfs_orphan_add(trans, new_dentry->d_inode);
4647                         if (ret)
4648                                 goto out_fail;
4649                 }
4650
4651         }
4652         ret = btrfs_set_inode_index(new_dir, &index);
4653         if (ret)
4654                 goto out_fail;
4655
4656         ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
4657                              old_inode, new_dentry->d_name.name,
4658                              new_dentry->d_name.len, 1, index);
4659         if (ret)
4660                 goto out_fail;
4661
4662 out_fail:
4663         btrfs_end_transaction_throttle(trans, root);
4664 out_unlock:
4665         return ret;
4666 }
4667
4668 /*
4669  * some fairly slow code that needs optimization. This walks the list
4670  * of all the inodes with pending delalloc and forces them to disk.
4671  */
4672 int btrfs_start_delalloc_inodes(struct btrfs_root *root)
4673 {
4674         struct list_head *head = &root->fs_info->delalloc_inodes;
4675         struct btrfs_inode *binode;
4676         struct inode *inode;
4677
4678         if (root->fs_info->sb->s_flags & MS_RDONLY)
4679                 return -EROFS;
4680
4681         spin_lock(&root->fs_info->delalloc_lock);
4682         while (!list_empty(head)) {
4683                 binode = list_entry(head->next, struct btrfs_inode,
4684                                     delalloc_inodes);
4685                 inode = igrab(&binode->vfs_inode);
4686                 if (!inode)
4687                         list_del_init(&binode->delalloc_inodes);
4688                 spin_unlock(&root->fs_info->delalloc_lock);
4689                 if (inode) {
4690                         filemap_flush(inode->i_mapping);
4691                         iput(inode);
4692                 }
4693                 cond_resched();
4694                 spin_lock(&root->fs_info->delalloc_lock);
4695         }
4696         spin_unlock(&root->fs_info->delalloc_lock);
4697
4698         /* the filemap_flush will queue IO into the worker threads, but
4699          * we have to make sure the IO is actually started and that
4700          * ordered extents get created before we return
4701          */
4702         atomic_inc(&root->fs_info->async_submit_draining);
4703         while (atomic_read(&root->fs_info->nr_async_submits) ||
4704               atomic_read(&root->fs_info->async_delalloc_pages)) {
4705                 wait_event(root->fs_info->async_submit_wait,
4706                    (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
4707                     atomic_read(&root->fs_info->async_delalloc_pages) == 0));
4708         }
4709         atomic_dec(&root->fs_info->async_submit_draining);
4710         return 0;
4711 }
4712
4713 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4714                          const char *symname)
4715 {
4716         struct btrfs_trans_handle *trans;
4717         struct btrfs_root *root = BTRFS_I(dir)->root;
4718         struct btrfs_path *path;
4719         struct btrfs_key key;
4720         struct inode *inode = NULL;
4721         int err;
4722         int drop_inode = 0;
4723         u64 objectid;
4724         u64 index = 0 ;
4725         int name_len;
4726         int datasize;
4727         unsigned long ptr;
4728         struct btrfs_file_extent_item *ei;
4729         struct extent_buffer *leaf;
4730         unsigned long nr = 0;
4731
4732         name_len = strlen(symname) + 1;
4733         if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
4734                 return -ENAMETOOLONG;
4735
4736         err = btrfs_check_free_space(root, 1, 0);
4737         if (err)
4738                 goto out_fail;
4739
4740         trans = btrfs_start_transaction(root, 1);
4741         btrfs_set_trans_block_group(trans, dir);
4742
4743         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4744         if (err) {
4745                 err = -ENOSPC;
4746                 goto out_unlock;
4747         }
4748
4749         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4750                                 dentry->d_name.len,
4751                                 dentry->d_parent->d_inode->i_ino, objectid,
4752                                 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
4753                                 &index);
4754         err = PTR_ERR(inode);
4755         if (IS_ERR(inode))
4756                 goto out_unlock;
4757
4758         err = btrfs_init_inode_security(inode, dir);
4759         if (err) {
4760                 drop_inode = 1;
4761                 goto out_unlock;
4762         }
4763
4764         btrfs_set_trans_block_group(trans, inode);
4765         err = btrfs_add_nondir(trans, dentry, inode, 0, index);
4766         if (err)
4767                 drop_inode = 1;
4768         else {
4769                 inode->i_mapping->a_ops = &btrfs_aops;
4770                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4771                 inode->i_fop = &btrfs_file_operations;
4772                 inode->i_op = &btrfs_file_inode_operations;
4773                 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4774         }
4775         dir->i_sb->s_dirt = 1;
4776         btrfs_update_inode_block_group(trans, inode);
4777         btrfs_update_inode_block_group(trans, dir);
4778         if (drop_inode)
4779                 goto out_unlock;
4780
4781         path = btrfs_alloc_path();
4782         BUG_ON(!path);
4783         key.objectid = inode->i_ino;
4784         key.offset = 0;
4785         btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
4786         datasize = btrfs_file_extent_calc_inline_size(name_len);
4787         err = btrfs_insert_empty_item(trans, root, path, &key,
4788                                       datasize);
4789         if (err) {
4790                 drop_inode = 1;
4791                 goto out_unlock;
4792         }
4793         leaf = path->nodes[0];
4794         ei = btrfs_item_ptr(leaf, path->slots[0],
4795                             struct btrfs_file_extent_item);
4796         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
4797         btrfs_set_file_extent_type(leaf, ei,
4798                                    BTRFS_FILE_EXTENT_INLINE);
4799         btrfs_set_file_extent_encryption(leaf, ei, 0);
4800         btrfs_set_file_extent_compression(leaf, ei, 0);
4801         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
4802         btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
4803
4804         ptr = btrfs_file_extent_inline_start(ei);
4805         write_extent_buffer(leaf, symname, ptr, name_len);
4806         btrfs_mark_buffer_dirty(leaf);
4807         btrfs_free_path(path);
4808
4809         inode->i_op = &btrfs_symlink_inode_operations;
4810         inode->i_mapping->a_ops = &btrfs_symlink_aops;
4811         inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4812         inode_set_bytes(inode, name_len);
4813         btrfs_i_size_write(inode, name_len - 1);
4814         err = btrfs_update_inode(trans, root, inode);
4815         if (err)
4816                 drop_inode = 1;
4817
4818 out_unlock:
4819         nr = trans->blocks_used;
4820         btrfs_end_transaction_throttle(trans, root);
4821 out_fail:
4822         if (drop_inode) {
4823                 inode_dec_link_count(inode);
4824                 iput(inode);
4825         }
4826         btrfs_btree_balance_dirty(root, nr);
4827         return err;
4828 }
4829
4830 static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
4831                                u64 alloc_hint, int mode)
4832 {
4833         struct btrfs_trans_handle *trans;
4834         struct btrfs_root *root = BTRFS_I(inode)->root;
4835         struct btrfs_key ins;
4836         u64 alloc_size;
4837         u64 cur_offset = start;
4838         u64 num_bytes = end - start;
4839         int ret = 0;
4840
4841         trans = btrfs_join_transaction(root, 1);
4842         BUG_ON(!trans);
4843         btrfs_set_trans_block_group(trans, inode);
4844
4845         while (num_bytes > 0) {
4846                 alloc_size = min(num_bytes, root->fs_info->max_extent);
4847                 ret = btrfs_reserve_extent(trans, root, alloc_size,
4848                                            root->sectorsize, 0, alloc_hint,
4849                                            (u64)-1, &ins, 1);
4850                 if (ret) {
4851                         WARN_ON(1);
4852                         goto out;
4853                 }
4854                 ret = insert_reserved_file_extent(trans, inode,
4855                                                   cur_offset, ins.objectid,
4856                                                   ins.offset, ins.offset,
4857                                                   ins.offset, 0, 0, 0,
4858                                                   BTRFS_FILE_EXTENT_PREALLOC);
4859                 BUG_ON(ret);
4860                 num_bytes -= ins.offset;
4861                 cur_offset += ins.offset;
4862                 alloc_hint = ins.objectid + ins.offset;
4863         }
4864 out:
4865         if (cur_offset > start) {
4866                 inode->i_ctime = CURRENT_TIME;
4867                 btrfs_set_flag(inode, PREALLOC);
4868                 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4869                     cur_offset > i_size_read(inode))
4870                         btrfs_i_size_write(inode, cur_offset);
4871                 ret = btrfs_update_inode(trans, root, inode);
4872                 BUG_ON(ret);
4873         }
4874
4875         btrfs_end_transaction(trans, root);
4876         return ret;
4877 }
4878
4879 static long btrfs_fallocate(struct inode *inode, int mode,
4880                             loff_t offset, loff_t len)
4881 {
4882         u64 cur_offset;
4883         u64 last_byte;
4884         u64 alloc_start;
4885         u64 alloc_end;
4886         u64 alloc_hint = 0;
4887         u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
4888         struct extent_map *em;
4889         int ret;
4890
4891         alloc_start = offset & ~mask;
4892         alloc_end =  (offset + len + mask) & ~mask;
4893
4894         mutex_lock(&inode->i_mutex);
4895         if (alloc_start > inode->i_size) {
4896                 ret = btrfs_cont_expand(inode, alloc_start);
4897                 if (ret)
4898                         goto out;
4899         }
4900
4901         while (1) {
4902                 struct btrfs_ordered_extent *ordered;
4903                 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
4904                             alloc_end - 1, GFP_NOFS);
4905                 ordered = btrfs_lookup_first_ordered_extent(inode,
4906                                                             alloc_end - 1);
4907                 if (ordered &&
4908                     ordered->file_offset + ordered->len > alloc_start &&
4909                     ordered->file_offset < alloc_end) {
4910                         btrfs_put_ordered_extent(ordered);
4911                         unlock_extent(&BTRFS_I(inode)->io_tree,
4912                                       alloc_start, alloc_end - 1, GFP_NOFS);
4913                         btrfs_wait_ordered_range(inode, alloc_start,
4914                                                  alloc_end - alloc_start);
4915                 } else {
4916                         if (ordered)
4917                                 btrfs_put_ordered_extent(ordered);
4918                         break;
4919                 }
4920         }
4921
4922         cur_offset = alloc_start;
4923         while (1) {
4924                 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4925                                       alloc_end - cur_offset, 0);
4926                 BUG_ON(IS_ERR(em) || !em);
4927                 last_byte = min(extent_map_end(em), alloc_end);
4928                 last_byte = (last_byte + mask) & ~mask;
4929                 if (em->block_start == EXTENT_MAP_HOLE) {
4930                         ret = prealloc_file_range(inode, cur_offset,
4931                                         last_byte, alloc_hint, mode);
4932                         if (ret < 0) {
4933                                 free_extent_map(em);
4934                                 break;
4935                         }
4936                 }
4937                 if (em->block_start <= EXTENT_MAP_LAST_BYTE)
4938                         alloc_hint = em->block_start;
4939                 free_extent_map(em);
4940
4941                 cur_offset = last_byte;
4942                 if (cur_offset >= alloc_end) {
4943                         ret = 0;
4944                         break;
4945                 }
4946         }
4947         unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
4948                       GFP_NOFS);
4949 out:
4950         mutex_unlock(&inode->i_mutex);
4951         return ret;
4952 }
4953
/*
 * set_page_dirty hook: hands @page straight to
 * __set_page_dirty_nobuffers() and returns its result.
 */
static int btrfs_set_page_dirty(struct page *page)
{
        int ret;

        ret = __set_page_dirty_nobuffers(page);
        return ret;
}
4958
4959 static int btrfs_permission(struct inode *inode, int mask)
4960 {
4961         if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
4962                 return -EACCES;
4963         return generic_permission(inode, mask, btrfs_check_acl);
4964 }
4965
4966 static struct inode_operations btrfs_dir_inode_operations = {
4967         .getattr        = btrfs_getattr,
4968         .lookup         = btrfs_lookup,
4969         .create         = btrfs_create,
4970         .unlink         = btrfs_unlink,
4971         .link           = btrfs_link,
4972         .mkdir          = btrfs_mkdir,
4973         .rmdir          = btrfs_rmdir,
4974         .rename         = btrfs_rename,
4975         .symlink        = btrfs_symlink,
4976         .setattr        = btrfs_setattr,
4977         .mknod          = btrfs_mknod,
4978         .setxattr       = btrfs_setxattr,
4979         .getxattr       = btrfs_getxattr,
4980         .listxattr      = btrfs_listxattr,
4981         .removexattr    = btrfs_removexattr,
4982         .permission     = btrfs_permission,
4983 };
4984 static struct inode_operations btrfs_dir_ro_inode_operations = {
4985         .lookup         = btrfs_lookup,
4986         .permission     = btrfs_permission,
4987 };
4988 static struct file_operations btrfs_dir_file_operations = {
4989         .llseek         = generic_file_llseek,
4990         .read           = generic_read_dir,
4991         .readdir        = btrfs_real_readdir,
4992         .unlocked_ioctl = btrfs_ioctl,
4993 #ifdef CONFIG_COMPAT
4994         .compat_ioctl   = btrfs_ioctl,
4995 #endif
4996         .release        = btrfs_release_file,
4997         .fsync          = btrfs_sync_file,
4998 };
4999
5000 static struct extent_io_ops btrfs_extent_io_ops = {
5001         .fill_delalloc = run_delalloc_range,
5002         .submit_bio_hook = btrfs_submit_bio_hook,
5003         .merge_bio_hook = btrfs_merge_bio_hook,
5004         .readpage_end_io_hook = btrfs_readpage_end_io_hook,
5005         .writepage_end_io_hook = btrfs_writepage_end_io_hook,
5006         .writepage_start_hook = btrfs_writepage_start_hook,
5007         .readpage_io_failed_hook = btrfs_io_failed_hook,
5008         .set_bit_hook = btrfs_set_bit_hook,
5009         .clear_bit_hook = btrfs_clear_bit_hook,
5010 };
5011
5012 /*
5013  * btrfs doesn't support the bmap operation because swapfiles
5014  * use bmap to make a mapping of extents in the file.  They assume
5015  * these extents won't change over the life of the file and they
5016  * use the bmap result to do IO directly to the drive.
5017  *
5018  * the btrfs bmap call would return logical addresses that aren't
5019  * suitable for IO and they also will change frequently as COW
5020  * operations happen.  So, swapfile + btrfs == corruption.
5021  *
5022  * For now we're avoiding this by dropping bmap.
5023  */
5024 static struct address_space_operations btrfs_aops = {
5025         .readpage       = btrfs_readpage,
5026         .writepage      = btrfs_writepage,
5027         .writepages     = btrfs_writepages,
5028         .readpages      = btrfs_readpages,
5029         .sync_page      = block_sync_page,
5030         .direct_IO      = btrfs_direct_IO,
5031         .invalidatepage = btrfs_invalidatepage,
5032         .releasepage    = btrfs_releasepage,
5033         .set_page_dirty = btrfs_set_page_dirty,
5034 };
5035
5036 static struct address_space_operations btrfs_symlink_aops = {
5037         .readpage       = btrfs_readpage,
5038         .writepage      = btrfs_writepage,
5039         .invalidatepage = btrfs_invalidatepage,
5040         .releasepage    = btrfs_releasepage,
5041 };
5042
5043 static struct inode_operations btrfs_file_inode_operations = {
5044         .truncate       = btrfs_truncate,
5045         .getattr        = btrfs_getattr,
5046         .setattr        = btrfs_setattr,
5047         .setxattr       = btrfs_setxattr,
5048         .getxattr       = btrfs_getxattr,
5049         .listxattr      = btrfs_listxattr,
5050         .removexattr    = btrfs_removexattr,
5051         .permission     = btrfs_permission,
5052         .fallocate      = btrfs_fallocate,
5053         .fiemap         = btrfs_fiemap,
5054 };
5055 static struct inode_operations btrfs_special_inode_operations = {
5056         .getattr        = btrfs_getattr,
5057         .setattr        = btrfs_setattr,
5058         .permission     = btrfs_permission,
5059         .setxattr       = btrfs_setxattr,
5060         .getxattr       = btrfs_getxattr,
5061         .listxattr      = btrfs_listxattr,
5062         .removexattr    = btrfs_removexattr,
5063 };
5064 static struct inode_operations btrfs_symlink_inode_operations = {
5065         .readlink       = generic_readlink,
5066         .follow_link    = page_follow_link_light,
5067         .put_link       = page_put_link,
5068         .permission     = btrfs_permission,
5069         .setxattr       = btrfs_setxattr,
5070         .getxattr       = btrfs_getxattr,
5071         .listxattr      = btrfs_listxattr,
5072         .removexattr    = btrfs_removexattr,
5073 };