1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/gfp.h>
6 #include <linux/pagemap.h>
7 #include <linux/page-flags.h>
8 #include <linux/module.h>
9 #include <linux/spinlock.h>
10 #include <linux/blkdev.h>
11 #include <linux/swap.h>
12 #include <linux/version.h>
13 #include <linux/writeback.h>
14 #include <linux/pagevec.h>
15 #include "extent_io.h"
16 #include "extent_map.h"
17 #include "compat.h"
18 #include "ctree.h"
19 #include "btrfs_inode.h"
20
21 /* temporary define until extent_map moves out of btrfs */
22 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
23                                        unsigned long extra_flags,
24                                        void (*ctor)(void *, struct kmem_cache *,
25                                                     unsigned long));
26
27 static struct kmem_cache *extent_state_cache;
28 static struct kmem_cache *extent_buffer_cache;
29
30 static LIST_HEAD(buffers);
31 static LIST_HEAD(states);
32
33 #define LEAK_DEBUG 1
34 #ifdef LEAK_DEBUG
35 static DEFINE_SPINLOCK(leak_lock);
36 #endif
37
38 #define BUFFER_LRU_MAX 64
39
40 struct tree_entry {
41         u64 start;
42         u64 end;
43         struct rb_node rb_node;
44 };
45
46 struct extent_page_data {
47         struct bio *bio;
48         struct extent_io_tree *tree;
49         get_extent_t *get_extent;
50 };
51
52 int __init extent_io_init(void)
53 {
54         extent_state_cache = btrfs_cache_create("extent_state",
55                                             sizeof(struct extent_state), 0,
56                                             NULL);
57         if (!extent_state_cache)
58                 return -ENOMEM;
59
60         extent_buffer_cache = btrfs_cache_create("extent_buffers",
61                                             sizeof(struct extent_buffer), 0,
62                                             NULL);
63         if (!extent_buffer_cache)
64                 goto free_state_cache;
65         return 0;
66
67 free_state_cache:
68         kmem_cache_destroy(extent_state_cache);
69         return -ENOMEM;
70 }
71
72 void extent_io_exit(void)
73 {
74         struct extent_state *state;
75         struct extent_buffer *eb;
76
77         while (!list_empty(&states)) {
78                 state = list_entry(states.next, struct extent_state, leak_list);
79                 printk(KERN_ERR "state leak: start %Lu end %Lu state %lu in tree %p refs %d\n", state->start, state->end, state->state, state->tree, atomic_read(&state->refs));
80                 list_del(&state->leak_list);
81                 kmem_cache_free(extent_state_cache, state);
82
83         }
84
85         while (!list_empty(&buffers)) {
86                 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
87                 printk(KERN_ERR "buffer leak start %Lu len %lu refs %d\n", eb->start, eb->len, atomic_read(&eb->refs));
88                 list_del(&eb->leak_list);
89                 kmem_cache_free(extent_buffer_cache, eb);
90         }
91         if (extent_state_cache)
92                 kmem_cache_destroy(extent_state_cache);
93         if (extent_buffer_cache)
94                 kmem_cache_destroy(extent_buffer_cache);
95 }
96
97 void extent_io_tree_init(struct extent_io_tree *tree,
98                           struct address_space *mapping, gfp_t mask)
99 {
100         tree->state.rb_node = NULL;
101         tree->buffer.rb_node = NULL;
102         tree->ops = NULL;
103         tree->dirty_bytes = 0;
104         spin_lock_init(&tree->lock);
105         spin_lock_init(&tree->buffer_lock);
106         tree->mapping = mapping;
107 }
108 EXPORT_SYMBOL(extent_io_tree_init);
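/*
 * Usage sketch (illustrative; 'inode' is assumed to come from the caller):
 * the owner of an inode typically ties one of these trees to the inode's
 * page cache mapping once, at inode setup time:
 *
 *	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 *			    inode->i_mapping, GFP_NOFS);
 *
 * Nothing is allocated at init time; the gfp mask argument is not used by
 * the init itself.
 */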
109
110 struct extent_state *alloc_extent_state(gfp_t mask)
111 {
112         struct extent_state *state;
113 #ifdef LEAK_DEBUG
114         unsigned long flags;
115 #endif
116
117         state = kmem_cache_alloc(extent_state_cache, mask);
118         if (!state)
119                 return state;
120         state->state = 0;
121         state->private = 0;
122         state->tree = NULL;
123 #ifdef LEAK_DEBUG
124         spin_lock_irqsave(&leak_lock, flags);
125         list_add(&state->leak_list, &states);
126         spin_unlock_irqrestore(&leak_lock, flags);
127 #endif
128         atomic_set(&state->refs, 1);
129         init_waitqueue_head(&state->wq);
130         return state;
131 }
132 EXPORT_SYMBOL(alloc_extent_state);
133
134 void free_extent_state(struct extent_state *state)
135 {
136         if (!state)
137                 return;
138         if (atomic_dec_and_test(&state->refs)) {
139 #ifdef LEAK_DEBUG
140                 unsigned long flags;
141 #endif
142                 WARN_ON(state->tree);
143 #ifdef LEAK_DEBUG
144                 spin_lock_irqsave(&leak_lock, flags);
145                 list_del(&state->leak_list);
146                 spin_unlock_irqrestore(&leak_lock, flags);
147 #endif
148                 kmem_cache_free(extent_state_cache, state);
149         }
150 }
151 EXPORT_SYMBOL(free_extent_state);
152
153 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
154                                    struct rb_node *node)
155 {
156         struct rb_node ** p = &root->rb_node;
157         struct rb_node * parent = NULL;
158         struct tree_entry *entry;
159
160         while(*p) {
161                 parent = *p;
162                 entry = rb_entry(parent, struct tree_entry, rb_node);
163
164                 if (offset < entry->start)
165                         p = &(*p)->rb_left;
166                 else if (offset > entry->end)
167                         p = &(*p)->rb_right;
168                 else
169                         return parent;
170         }
171
172         entry = rb_entry(node, struct tree_entry, rb_node);
173         rb_link_node(node, parent, p);
174         rb_insert_color(node, root);
175         return NULL;
176 }
177
178 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
179                                      struct rb_node **prev_ret,
180                                      struct rb_node **next_ret)
181 {
182         struct rb_root *root = &tree->state;
183         struct rb_node * n = root->rb_node;
184         struct rb_node *prev = NULL;
185         struct rb_node *orig_prev = NULL;
186         struct tree_entry *entry;
187         struct tree_entry *prev_entry = NULL;
188
189         while(n) {
190                 entry = rb_entry(n, struct tree_entry, rb_node);
191                 prev = n;
192                 prev_entry = entry;
193
194                 if (offset < entry->start)
195                         n = n->rb_left;
196                 else if (offset > entry->end)
197                         n = n->rb_right;
198                 else {
199                         return n;
200                 }
201         }
202
203         if (prev_ret) {
204                 orig_prev = prev;
205                 while(prev && offset > prev_entry->end) {
206                         prev = rb_next(prev);
207                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
208                 }
209                 *prev_ret = prev;
210                 prev = orig_prev;
211         }
212
213         if (next_ret) {
214                 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
215                 while(prev && offset < prev_entry->start) {
216                         prev = rb_prev(prev);
217                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
218                 }
219                 *next_ret = prev;
220         }
221         return NULL;
222 }
223
224 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
225                                           u64 offset)
226 {
227         struct rb_node *prev = NULL;
228         struct rb_node *ret;
229
230         ret = __etree_search(tree, offset, &prev, NULL);
231         if (!ret) {
232                 return prev;
233         }
234         return ret;
235 }
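/*
 * Example (illustrative): with extent states [0, 4095] and [8192, 12287] in
 * the tree, tree_search(tree, 5000) hits the gap between them and returns
 * the [8192, 12287] node, i.e. the first state that ends after the search
 * offset.  tree_search(tree, 20000) returns NULL because no state ends at
 * or after that offset.
 */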
236
237 static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
238                                           u64 offset, struct rb_node *node)
239 {
240         struct rb_root *root = &tree->buffer;
241         struct rb_node ** p = &root->rb_node;
242         struct rb_node * parent = NULL;
243         struct extent_buffer *eb;
244
245         while(*p) {
246                 parent = *p;
247                 eb = rb_entry(parent, struct extent_buffer, rb_node);
248
249                 if (offset < eb->start)
250                         p = &(*p)->rb_left;
251                 else if (offset > eb->start)
252                         p = &(*p)->rb_right;
253                 else
254                         return eb;
255         }
256
257         rb_link_node(node, parent, p);
258         rb_insert_color(node, root);
259         return NULL;
260 }
261
262 static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
263                                            u64 offset)
264 {
265         struct rb_root *root = &tree->buffer;
266         struct rb_node * n = root->rb_node;
267         struct extent_buffer *eb;
268
269         while(n) {
270                 eb = rb_entry(n, struct extent_buffer, rb_node);
271                 if (offset < eb->start)
272                         n = n->rb_left;
273                 else if (offset > eb->start)
274                         n = n->rb_right;
275                 else
276                         return eb;
277         }
278         return NULL;
279 }
280
281 /*
282  * utility function to look for merge candidates inside a given range.
283  * Any extents with matching state are merged together into a single
284  * extent in the tree.  Extents with EXTENT_IOBITS set in their state field
285  * are not merged because the end_io handlers need to be able to do
286  * operations on them without sleeping (or doing allocations/splits).
287  *
288  * This should be called with the tree lock held.
289  */
290 static int merge_state(struct extent_io_tree *tree,
291                        struct extent_state *state)
292 {
293         struct extent_state *other;
294         struct rb_node *other_node;
295
296         if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
297                 return 0;
298
299         other_node = rb_prev(&state->rb_node);
300         if (other_node) {
301                 other = rb_entry(other_node, struct extent_state, rb_node);
302                 if (other->end == state->start - 1 &&
303                     other->state == state->state) {
304                         state->start = other->start;
305                         other->tree = NULL;
306                         rb_erase(&other->rb_node, &tree->state);
307                         free_extent_state(other);
308                 }
309         }
310         other_node = rb_next(&state->rb_node);
311         if (other_node) {
312                 other = rb_entry(other_node, struct extent_state, rb_node);
313                 if (other->start == state->end + 1 &&
314                     other->state == state->state) {
315                         other->start = state->start;
316                         state->tree = NULL;
317                         rb_erase(&state->rb_node, &tree->state);
318                         free_extent_state(state);
319                 }
320         }
321         return 0;
322 }
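/*
 * Example (illustrative): if the tree holds [0, 4095] and [4096, 8191] and
 * both records carry exactly the same bits (say only EXTENT_DIRTY), calling
 * merge_state() on either of them collapses the pair into a single
 * [0, 8191] record and frees the extent_state that is no longer needed.
 */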
323
324 static void set_state_cb(struct extent_io_tree *tree,
325                          struct extent_state *state,
326                          unsigned long bits)
327 {
328         if (tree->ops && tree->ops->set_bit_hook) {
329                 tree->ops->set_bit_hook(tree->mapping->host, state->start,
330                                         state->end, state->state, bits);
331         }
332 }
333
334 static void clear_state_cb(struct extent_io_tree *tree,
335                            struct extent_state *state,
336                            unsigned long bits)
337 {
338         if (tree->ops && tree->ops->clear_bit_hook) {
339                 tree->ops->clear_bit_hook(tree->mapping->host, state->start,
340                                           state->end, state->state, bits);
341         }
342 }
343
344 /*
345  * insert an extent_state struct into the tree.  'bits' are set on the
346  * struct before it is inserted.
347  *
348  * This may return -EEXIST if the extent is already there, in which case the
349  * state struct is freed.
350  *
351  * The tree lock is not taken internally.  This is a utility function and
352  * probably isn't what you want to call (see set/clear_extent_bit).
353  */
354 static int insert_state(struct extent_io_tree *tree,
355                         struct extent_state *state, u64 start, u64 end,
356                         int bits)
357 {
358         struct rb_node *node;
359
360         if (end < start) {
361                 printk("end < start %Lu %Lu\n", end, start);
362                 WARN_ON(1);
363         }
364         if (bits & EXTENT_DIRTY)
365                 tree->dirty_bytes += end - start + 1;
366         set_state_cb(tree, state, bits);
367         state->state |= bits;
368         state->start = start;
369         state->end = end;
370         node = tree_insert(&tree->state, end, &state->rb_node);
371         if (node) {
372                 struct extent_state *found;
373                 found = rb_entry(node, struct extent_state, rb_node);
374                 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
375                 free_extent_state(state);
376                 return -EEXIST;
377         }
378         state->tree = tree;
379         merge_state(tree, state);
380         return 0;
381 }
382
383 /*
384  * split a given extent state struct in two, inserting the preallocated
385  * struct 'prealloc' as the newly created second half.  'split' indicates an
386  * offset inside 'orig' where it should be split.
387  *
388  * Before calling, the tree has 'orig' at [orig->start, orig->end].
389  * After calling, there are two extent state structs in the tree:
390  *
391  * prealloc: [orig->start, split - 1]
392  * orig: [ split, orig->end ]
393  *
394  * The tree locks are not taken by this function. They need to be held
395  * by the caller.
396  */
397 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
398                        struct extent_state *prealloc, u64 split)
399 {
400         struct rb_node *node;
401         prealloc->start = orig->start;
402         prealloc->end = split - 1;
403         prealloc->state = orig->state;
404         orig->start = split;
405
406         node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
407         if (node) {
408                 struct extent_state *found;
409                 found = rb_entry(node, struct extent_state, rb_node);
410                 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
411                 free_extent_state(prealloc);
412                 return -EEXIST;
413         }
414         prealloc->tree = tree;
415         return 0;
416 }
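/*
 * Worked example (illustrative): with orig covering [0, 8191],
 * split_state(tree, orig, prealloc, 4096) leaves two records in the tree,
 * prealloc as [0, 4095] and orig as [4096, 8191], both carrying whatever
 * bits orig had before the split.
 */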
417
418 /*
419  * utility function to clear some bits in an extent state struct.
420  * it will optionally wake up anyone waiting on this state (wake == 1), or
421  * forcibly remove the state from the tree (delete == 1).
422  *
423  * If no bits are set on the state struct after clearing things, the
424  * struct is freed and removed from the tree
425  */
426 static int clear_state_bit(struct extent_io_tree *tree,
427                             struct extent_state *state, int bits, int wake,
428                             int delete)
429 {
430         int ret = state->state & bits;
431
432         if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
433                 u64 range = state->end - state->start + 1;
434                 WARN_ON(range > tree->dirty_bytes);
435                 tree->dirty_bytes -= range;
436         }
437         clear_state_cb(tree, state, bits);
438         state->state &= ~bits;
439         if (wake)
440                 wake_up(&state->wq);
441         if (delete || state->state == 0) {
442                 if (state->tree) {
443                         clear_state_cb(tree, state, state->state);
444                         rb_erase(&state->rb_node, &tree->state);
445                         state->tree = NULL;
446                         free_extent_state(state);
447                 } else {
448                         WARN_ON(1);
449                 }
450         } else {
451                 merge_state(tree, state);
452         }
453         return ret;
454 }
455
456 /*
457  * clear some bits on a range in the tree.  This may require splitting
458  * or inserting elements in the tree, so the gfp mask is used to
459  * indicate which allocations or sleeping are allowed.
460  *
461  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
462  * the given range from the tree regardless of state (ie for truncate).
463  *
464  * the range [start, end] is inclusive.
465  *
466  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
467  * bits were already set, or zero if none of the bits were already set.
468  */
469 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
470                      int bits, int wake, int delete, gfp_t mask)
471 {
472         struct extent_state *state;
473         struct extent_state *prealloc = NULL;
474         struct rb_node *node;
475         unsigned long flags;
476         int err;
477         int set = 0;
478
479 again:
480         if (!prealloc && (mask & __GFP_WAIT)) {
481                 prealloc = alloc_extent_state(mask);
482                 if (!prealloc)
483                         return -ENOMEM;
484         }
485
486         spin_lock_irqsave(&tree->lock, flags);
487         /*
488          * this search will find the extents that end after
489          * our range starts
490          */
491         node = tree_search(tree, start);
492         if (!node)
493                 goto out;
494         state = rb_entry(node, struct extent_state, rb_node);
495         if (state->start > end)
496                 goto out;
497         WARN_ON(state->end < start);
498
499         /*
500          *     | ---- desired range ---- |
501          *  | state | or
502          *  | ------------- state -------------- |
503          *
504          * We need to split the extent we found, and may flip
505          * bits on second half.
506          *
507          * If the extent we found extends past our range, we
508          * just split and search again.  It'll get split again
509          * the next time though.
510          *
511          * If the extent we found is inside our range, we clear
512          * the desired bit on it.
513          */
514
515         if (state->start < start) {
516                 if (!prealloc)
517                         prealloc = alloc_extent_state(GFP_ATOMIC);
518                 err = split_state(tree, state, prealloc, start);
519                 BUG_ON(err == -EEXIST);
520                 prealloc = NULL;
521                 if (err)
522                         goto out;
523                 if (state->end <= end) {
524                         start = state->end + 1;
525                         set |= clear_state_bit(tree, state, bits,
526                                         wake, delete);
527                 } else {
528                         start = state->start;
529                 }
530                 goto search_again;
531         }
532         /*
533          * | ---- desired range ---- |
534          *                        | state |
535          * We need to split the extent, and clear the bit
536          * on the first half
537          */
538         if (state->start <= end && state->end > end) {
539                 if (!prealloc)
540                         prealloc = alloc_extent_state(GFP_ATOMIC);
541                 err = split_state(tree, state, prealloc, end + 1);
542                 BUG_ON(err == -EEXIST);
543
544                 if (wake)
545                         wake_up(&state->wq);
546                 set |= clear_state_bit(tree, prealloc, bits,
547                                        wake, delete);
548                 prealloc = NULL;
549                 goto out;
550         }
551
552         start = state->end + 1;
553         set |= clear_state_bit(tree, state, bits, wake, delete);
554         goto search_again;
555
556 out:
557         spin_unlock_irqrestore(&tree->lock, flags);
558         if (prealloc)
559                 free_extent_state(prealloc);
560
561         return set;
562
563 search_again:
564         if (start > end)
565                 goto out;
566         spin_unlock_irqrestore(&tree->lock, flags);
567         if (mask & __GFP_WAIT)
568                 cond_resched();
569         goto again;
570 }
571 EXPORT_SYMBOL(clear_extent_bit);
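/*
 * Usage sketch (illustrative): the wrappers further down are thin shells
 * around this routine.  unlock_extent(), for instance, is just
 *
 *	clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
 *
 * which clears the lock bit, wakes any sleepers, and leaves the state
 * record in the tree unless no bits remain set on it.
 */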
572
573 static int wait_on_state(struct extent_io_tree *tree,
574                          struct extent_state *state)
575 {
576         DEFINE_WAIT(wait);
577         prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
578         spin_unlock_irq(&tree->lock);
579         schedule();
580         spin_lock_irq(&tree->lock);
581         finish_wait(&state->wq, &wait);
582         return 0;
583 }
584
585 /*
586  * waits for one or more bits to clear on a range in the state tree.
587  * The range [start, end] is inclusive.
588  * The tree lock is taken by this function
589  */
590 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
591 {
592         struct extent_state *state;
593         struct rb_node *node;
594
595         spin_lock_irq(&tree->lock);
596 again:
597         while (1) {
598                 /*
599                  * this search will find all the extents that end after
600                  * our range starts
601                  */
602                 node = tree_search(tree, start);
603                 if (!node)
604                         break;
605
606                 state = rb_entry(node, struct extent_state, rb_node);
607
608                 if (state->start > end)
609                         goto out;
610
611                 if (state->state & bits) {
612                         start = state->start;
613                         atomic_inc(&state->refs);
614                         wait_on_state(tree, state);
615                         free_extent_state(state);
616                         goto again;
617                 }
618                 start = state->end + 1;
619
620                 if (start > end)
621                         break;
622
623                 if (need_resched()) {
624                         spin_unlock_irq(&tree->lock);
625                         cond_resched();
626                         spin_lock_irq(&tree->lock);
627                 }
628         }
629 out:
630         spin_unlock_irq(&tree->lock);
631         return 0;
632 }
633 EXPORT_SYMBOL(wait_extent_bit);
634
635 static void set_state_bits(struct extent_io_tree *tree,
636                            struct extent_state *state,
637                            int bits)
638 {
639         if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
640                 u64 range = state->end - state->start + 1;
641                 tree->dirty_bytes += range;
642         }
643         set_state_cb(tree, state, bits);
644         state->state |= bits;
645 }
646
647 /*
648  * set some bits on a range in the tree.  This may require allocations
649  * or sleeping, so the gfp mask is used to indicate what is allowed.
650  *
651  * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
652  * range already has the desired bits set.  The start of the existing
653  * range is returned in failed_start in this case.
654  *
655  * [start, end] is inclusive
656  * This takes the tree lock.
657  */
658 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
659                    int exclusive, u64 *failed_start, gfp_t mask)
660 {
661         struct extent_state *state;
662         struct extent_state *prealloc = NULL;
663         struct rb_node *node;
664         unsigned long flags;
665         int err = 0;
666         int set;
667         u64 last_start;
668         u64 last_end;
669 again:
670         if (!prealloc && (mask & __GFP_WAIT)) {
671                 prealloc = alloc_extent_state(mask);
672                 if (!prealloc)
673                         return -ENOMEM;
674         }
675
676         spin_lock_irqsave(&tree->lock, flags);
677         /*
678          * this search will find all the extents that end after
679          * our range starts.
680          */
681         node = tree_search(tree, start);
682         if (!node) {
683                 err = insert_state(tree, prealloc, start, end, bits);
684                 prealloc = NULL;
685                 BUG_ON(err == -EEXIST);
686                 goto out;
687         }
688
689         state = rb_entry(node, struct extent_state, rb_node);
690         last_start = state->start;
691         last_end = state->end;
692
693         /*
694          * | ---- desired range ---- |
695          * | state |
696          *
697          * Just lock what we found and keep going
698          */
699         if (state->start == start && state->end <= end) {
700                 set = state->state & bits;
701                 if (set && exclusive) {
702                         *failed_start = state->start;
703                         err = -EEXIST;
704                         goto out;
705                 }
706                 set_state_bits(tree, state, bits);
707                 start = state->end + 1;
708                 merge_state(tree, state);
709                 goto search_again;
710         }
711
712         /*
713          *     | ---- desired range ---- |
714          * | state |
715          *   or
716          * | ------------- state -------------- |
717          *
718          * We need to split the extent we found, and may flip bits on
719          * second half.
720          *
721          * If the extent we found extends past our
722          * range, we just split and search again.  It'll get split
723          * again the next time though.
724          *
725          * If the extent we found is inside our range, we set the
726          * desired bit on it.
727          */
728         if (state->start < start) {
729                 set = state->state & bits;
730                 if (exclusive && set) {
731                         *failed_start = start;
732                         err = -EEXIST;
733                         goto out;
734                 }
735                 err = split_state(tree, state, prealloc, start);
736                 BUG_ON(err == -EEXIST);
737                 prealloc = NULL;
738                 if (err)
739                         goto out;
740                 if (state->end <= end) {
741                         set_state_bits(tree, state, bits);
742                         start = state->end + 1;
743                         merge_state(tree, state);
744                 } else {
745                         start = state->start;
746                 }
747                 goto search_again;
748         }
749         /*
750          * | ---- desired range ---- |
751          *     | state | or               | state |
752          *
753          * There's a hole, we need to insert something in it and
754          * ignore the extent we found.
755          */
756         if (state->start > start) {
757                 u64 this_end;
758                 if (end < last_start)
759                         this_end = end;
760                 else
761                         this_end = last_start - 1;
762                 err = insert_state(tree, prealloc, start, this_end,
763                                    bits);
764                 prealloc = NULL;
765                 BUG_ON(err == -EEXIST);
766                 if (err)
767                         goto out;
768                 start = this_end + 1;
769                 goto search_again;
770         }
771         /*
772          * | ---- desired range ---- |
773          *                        | state |
774          * We need to split the extent, and set the bit
775          * on the first half
776          */
777         if (state->start <= end && state->end > end) {
778                 set = state->state & bits;
779                 if (exclusive && set) {
780                         *failed_start = start;
781                         err = -EEXIST;
782                         goto out;
783                 }
784                 err = split_state(tree, state, prealloc, end + 1);
785                 BUG_ON(err == -EEXIST);
786
787                 set_state_bits(tree, prealloc, bits);
788                 merge_state(tree, prealloc);
789                 prealloc = NULL;
790                 goto out;
791         }
792
793         goto search_again;
794
795 out:
796         spin_unlock_irqrestore(&tree->lock, flags);
797         if (prealloc)
798                 free_extent_state(prealloc);
799
800         return err;
801
802 search_again:
803         if (start > end)
804                 goto out;
805         spin_unlock_irqrestore(&tree->lock, flags);
806         if (mask & __GFP_WAIT)
807                 cond_resched();
808         goto again;
809 }
810 EXPORT_SYMBOL(set_extent_bit);
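/*
 * Usage sketch (illustrative): non-exclusive callers pass exclusive == 0 and
 * a NULL failed_start, e.g. marking a range as delalloc the way
 * set_extent_delalloc() below does:
 *
 *	set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_DIRTY,
 *		       0, NULL, GFP_NOFS);
 *
 * Exclusive callers such as lock_extent() pass exclusive == 1 and retry from
 * *failed_start when -EEXIST reports that part of the range was already set.
 */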
811
812 /* wrappers around set/clear extent bit */
813 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
814                      gfp_t mask)
815 {
816         return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
817                               mask);
818 }
819 EXPORT_SYMBOL(set_extent_dirty);
820
821 int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
822                        gfp_t mask)
823 {
824         return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
825 }
826 EXPORT_SYMBOL(set_extent_ordered);
827
828 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
829                     int bits, gfp_t mask)
830 {
831         return set_extent_bit(tree, start, end, bits, 0, NULL,
832                               mask);
833 }
834 EXPORT_SYMBOL(set_extent_bits);
835
836 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
837                       int bits, gfp_t mask)
838 {
839         return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
840 }
841 EXPORT_SYMBOL(clear_extent_bits);
842
843 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
844                      gfp_t mask)
845 {
846         return set_extent_bit(tree, start, end,
847                               EXTENT_DELALLOC | EXTENT_DIRTY,
848                               0, NULL, mask);
849 }
850 EXPORT_SYMBOL(set_extent_delalloc);
851
852 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
853                        gfp_t mask)
854 {
855         return clear_extent_bit(tree, start, end,
856                                 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
857 }
858 EXPORT_SYMBOL(clear_extent_dirty);
859
860 int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
861                          gfp_t mask)
862 {
863         return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
864 }
865 EXPORT_SYMBOL(clear_extent_ordered);
866
867 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
868                      gfp_t mask)
869 {
870         return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
871                               mask);
872 }
873 EXPORT_SYMBOL(set_extent_new);
874
875 int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
876                        gfp_t mask)
877 {
878         return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
879 }
880 EXPORT_SYMBOL(clear_extent_new);
881
882 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
883                         gfp_t mask)
884 {
885         return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
886                               mask);
887 }
888 EXPORT_SYMBOL(set_extent_uptodate);
889
890 int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
891                           gfp_t mask)
892 {
893         return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
894 }
895 EXPORT_SYMBOL(clear_extent_uptodate);
896
897 int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
898                          gfp_t mask)
899 {
900         return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
901                               0, NULL, mask);
902 }
903 EXPORT_SYMBOL(set_extent_writeback);
904
905 int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
906                            gfp_t mask)
907 {
908         return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
909 }
910 EXPORT_SYMBOL(clear_extent_writeback);
911
912 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
913 {
914         return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
915 }
916 EXPORT_SYMBOL(wait_on_extent_writeback);
917
918 /*
919  * either insert or lock the state struct between start and end.  Use mask
920  * to tell us whether waiting is desired.
921  */
922 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
923 {
924         int err;
925         u64 failed_start;
926         while (1) {
927                 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
928                                      &failed_start, mask);
929                 if (err == -EEXIST && (mask & __GFP_WAIT)) {
930                         wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
931                         start = failed_start;
932                 } else {
933                         break;
934                 }
935                 WARN_ON(start > end);
936         }
937         return err;
938 }
939 EXPORT_SYMBOL(lock_extent);
940
941 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
942                     gfp_t mask)
943 {
944         int err;
945         u64 failed_start;
946
947         err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
948                              &failed_start, mask);
949         if (err == -EEXIST) {
950                 if (failed_start > start)
951                         clear_extent_bit(tree, start, failed_start - 1,
952                                          EXTENT_LOCKED, 1, 0, mask);
953                 return 0;
954         }
955         return 1;
956 }
957 EXPORT_SYMBOL(try_lock_extent);
958
959 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
960                   gfp_t mask)
961 {
962         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
963 }
964 EXPORT_SYMBOL(unlock_extent);
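/*
 * Typical pairing (illustrative sketch; 'page' and 'tree' are assumed to be
 * supplied by the caller): lock the byte range backing one page cache page,
 * do the work, then unlock it:
 *
 *	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 *	u64 end = start + PAGE_CACHE_SIZE - 1;
 *
 *	lock_extent(tree, start, end, GFP_NOFS);
 *	... read or write the range ...
 *	unlock_extent(tree, start, end, GFP_NOFS);
 */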
965
966 /*
967  * helper function to set pages and extents in the tree dirty
968  */
969 int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
970 {
971         unsigned long index = start >> PAGE_CACHE_SHIFT;
972         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
973         struct page *page;
974
975         while (index <= end_index) {
976                 page = find_get_page(tree->mapping, index);
977                 BUG_ON(!page);
978                 __set_page_dirty_nobuffers(page);
979                 page_cache_release(page);
980                 index++;
981         }
982         set_extent_dirty(tree, start, end, GFP_NOFS);
983         return 0;
984 }
985 EXPORT_SYMBOL(set_range_dirty);
986
987 /*
988  * helper function to set both pages and extents in the tree writeback
989  */
990 int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
991 {
992         unsigned long index = start >> PAGE_CACHE_SHIFT;
993         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
994         struct page *page;
995
996         while (index <= end_index) {
997                 page = find_get_page(tree->mapping, index);
998                 BUG_ON(!page);
999                 set_page_writeback(page);
1000                 page_cache_release(page);
1001                 index++;
1002         }
1003         set_extent_writeback(tree, start, end, GFP_NOFS);
1004         return 0;
1005 }
1006 EXPORT_SYMBOL(set_range_writeback);
1007
1008 /*
1009  * find the first offset in the io tree with 'bits' set. zero is
1010  * returned if we find something, and *start_ret and *end_ret are
1011  * set to reflect the state struct that was found.
1012  *
1013  * If nothing was found, 1 is returned, < 0 on error
1014  */
1015 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1016                           u64 *start_ret, u64 *end_ret, int bits)
1017 {
1018         struct rb_node *node;
1019         struct extent_state *state;
1020         int ret = 1;
1021
1022         spin_lock_irq(&tree->lock);
1023         /*
1024          * this search will find all the extents that end after
1025          * our range starts.
1026          */
1027         node = tree_search(tree, start);
1028         if (!node) {
1029                 goto out;
1030         }
1031
1032         while(1) {
1033                 state = rb_entry(node, struct extent_state, rb_node);
1034                 if (state->end >= start && (state->state & bits)) {
1035                         *start_ret = state->start;
1036                         *end_ret = state->end;
1037                         ret = 0;
1038                         break;
1039                 }
1040                 node = rb_next(node);
1041                 if (!node)
1042                         break;
1043         }
1044 out:
1045         spin_unlock_irq(&tree->lock);
1046         return ret;
1047 }
1048 EXPORT_SYMBOL(find_first_extent_bit);
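/*
 * Example (illustrative): scanning for the next dirty range starting at
 * byte 'cur' could look like
 *
 *	u64 found_start, found_end;
 *
 *	if (find_first_extent_bit(tree, cur, &found_start, &found_end,
 *				  EXTENT_DIRTY) == 0)
 *		handle the range [found_start, found_end];
 *
 * where a return of 1 means nothing at or after 'cur' has the bit set.
 */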
1049
1050 /* find the first state struct with 'bits' set after 'start', and
1051  * return it.  tree->lock must be held.  NULL will be returned if
1052  * nothing was found after 'start'
1053  */
1054 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1055                                                  u64 start, int bits)
1056 {
1057         struct rb_node *node;
1058         struct extent_state *state;
1059
1060         /*
1061          * this search will find all the extents that end after
1062          * our range starts.
1063          */
1064         node = tree_search(tree, start);
1065         if (!node) {
1066                 goto out;
1067         }
1068
1069         while(1) {
1070                 state = rb_entry(node, struct extent_state, rb_node);
1071                 if (state->end >= start && (state->state & bits)) {
1072                         return state;
1073                 }
1074                 node = rb_next(node);
1075                 if (!node)
1076                         break;
1077         }
1078 out:
1079         return NULL;
1080 }
1081 EXPORT_SYMBOL(find_first_extent_bit_state);
1082
1083 /*
1084  * find a contiguous range of bytes in the file marked as delalloc, not
1085  * more than 'max_bytes'.  start and end are used to return the range.
1086  *
1087  * 1 is returned if we find something, 0 if nothing was in the tree
1088  */
1089 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1090                                         u64 *start, u64 *end, u64 max_bytes)
1091 {
1092         struct rb_node *node;
1093         struct extent_state *state;
1094         u64 cur_start = *start;
1095         u64 found = 0;
1096         u64 total_bytes = 0;
1097
1098         spin_lock_irq(&tree->lock);
1099
1100         /*
1101          * this search will find all the extents that end after
1102          * our range starts.
1103          */
1104         node = tree_search(tree, cur_start);
1105         if (!node) {
1106                 if (!found)
1107                         *end = (u64)-1;
1108                 goto out;
1109         }
1110
1111         while(1) {
1112                 state = rb_entry(node, struct extent_state, rb_node);
1113                 if (found && (state->start != cur_start ||
1114                               (state->state & EXTENT_BOUNDARY))) {
1115                         goto out;
1116                 }
1117                 if (!(state->state & EXTENT_DELALLOC)) {
1118                         if (!found)
1119                                 *end = state->end;
1120                         goto out;
1121                 }
1122                 if (!found)
1123                         *start = state->start;
1124                 found++;
1125                 *end = state->end;
1126                 cur_start = state->end + 1;
1127                 node = rb_next(node);
1128                 if (!node)
1129                         break;
1130                 total_bytes += state->end - state->start + 1;
1131                 if (total_bytes >= max_bytes)
1132                         break;
1133         }
1134 out:
1135         spin_unlock_irq(&tree->lock);
1136         return found;
1137 }
1138
1139 static noinline int __unlock_for_delalloc(struct inode *inode,
1140                                           struct page *locked_page,
1141                                           u64 start, u64 end)
1142 {
1143         int ret;
1144         struct page *pages[16];
1145         unsigned long index = start >> PAGE_CACHE_SHIFT;
1146         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1147         unsigned long nr_pages = end_index - index + 1;
1148         int i;
1149
1150         if (index == locked_page->index && end_index == index)
1151                 return 0;
1152
1153         while(nr_pages > 0) {
1154                 ret = find_get_pages_contig(inode->i_mapping, index,
1155                                      min(nr_pages, ARRAY_SIZE(pages)), pages);
1156                 for (i = 0; i < ret; i++) {
1157                         if (pages[i] != locked_page)
1158                                 unlock_page(pages[i]);
1159                         page_cache_release(pages[i]);
1160                 }
1161                 nr_pages -= ret;
1162                 index += ret;
1163                 cond_resched();
1164         }
1165         return 0;
1166 }
1167
1168 static noinline int lock_delalloc_pages(struct inode *inode,
1169                                         struct page *locked_page,
1170                                         u64 delalloc_start,
1171                                         u64 delalloc_end)
1172 {
1173         unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1174         unsigned long start_index = index;
1175         unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1176         unsigned long pages_locked = 0;
1177         struct page *pages[16];
1178         unsigned long nrpages;
1179         int ret;
1180         int i;
1181
1182         /* the caller is responsible for locking the start index */
1183         if (index == locked_page->index && index == end_index)
1184                 return 0;
1185
1186         /* skip the page at the start index */
1187         nrpages = end_index - index + 1;
1188         while(nrpages > 0) {
1189                 ret = find_get_pages_contig(inode->i_mapping, index,
1190                                      min(nrpages, ARRAY_SIZE(pages)), pages);
1191                 if (ret == 0) {
1192                         ret = -EAGAIN;
1193                         goto done;
1194                 }
1195                 /* now we have an array of pages, lock them all */
1196                 for (i = 0; i < ret; i++) {
1197                         /*
1198                          * the caller is taking responsibility for
1199                          * locked_page
1200                          */
1201                         if (pages[i] != locked_page)
1202                                 lock_page(pages[i]);
1203                         page_cache_release(pages[i]);
1204                 }
1205                 pages_locked += ret;
1206                 nrpages -= ret;
1207                 index += ret;
1208                 cond_resched();
1209         }
1210         ret = 0;
1211 done:
1212         if (ret && pages_locked) {
1213                 __unlock_for_delalloc(inode, locked_page,
1214                               delalloc_start,
1215                               ((u64)(start_index + pages_locked - 1)) <<
1216                               PAGE_CACHE_SHIFT);
1217         }
1218         return ret;
1219 }
1220
1221 /*
1222  * find a contiguous range of bytes in the file marked as delalloc, not
1223  * more than 'max_bytes'.  start and end are used to return the range.
1224  *
1225  * 1 is returned if we find something, 0 if nothing was in the tree
1226  */
1227 static noinline u64 find_lock_delalloc_range(struct inode *inode,
1228                                              struct extent_io_tree *tree,
1229                                              struct page *locked_page,
1230                                              u64 *start, u64 *end,
1231                                              u64 max_bytes)
1232 {
1233         u64 delalloc_start;
1234         u64 delalloc_end;
1235         u64 found;
1236         int ret;
1237         int loops = 0;
1238
1239 again:
1240         /* step one, find a bunch of delalloc bytes starting at start */
1241         delalloc_start = *start;
1242         delalloc_end = 0;
1243         found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1244                                     max_bytes);
1245         if (!found || delalloc_end <= *start) {
1246                 *start = delalloc_start;
1247                 *end = delalloc_end;
1248                 return found;
1249         }
1250
1251         /*
1252          * start comes from the offset of locked_page.  We have to lock
1253          * pages in order, so we can't process delalloc bytes before
1254          * locked_page
1255          */
1256         if (delalloc_start < *start) {
1257                 delalloc_start = *start;
1258         }
1259
1260         /*
1261          * make sure to limit the number of pages we try to lock down
1262          * if we're looping.
1263          */
1264         if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
1265                 delalloc_end = (delalloc_start + PAGE_CACHE_SIZE - 1) &
1266                         ~((u64)PAGE_CACHE_SIZE - 1);
1267         }
1268         /* step two, lock all the pages after the page that has start */
1269         ret = lock_delalloc_pages(inode, locked_page,
1270                                   delalloc_start, delalloc_end);
1271         if (ret == -EAGAIN) {
1272                 /* some of the pages are gone, lets avoid looping by
1273                  * shortening the size of the delalloc range we're searching
1274                  */
1275                 if (!loops) {
1276                         unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1277                         max_bytes = PAGE_CACHE_SIZE - offset;
1278                         loops = 1;
1279                         goto again;
1280                 } else {
1281                         found = 0;
1282                         goto out_failed;
1283                 }
1284         }
1285         BUG_ON(ret);
1286
1287         /* step three, lock the state bits for the whole range */
1288         lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1289
1290         /* then test to make sure it is all still delalloc */
1291         ret = test_range_bit(tree, delalloc_start, delalloc_end,
1292                              EXTENT_DELALLOC, 1);
1293         if (!ret) {
1294                 unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1295                 __unlock_for_delalloc(inode, locked_page,
1296                               delalloc_start, delalloc_end);
1297                 cond_resched();
1298                 goto again;
1299         }
1300         *start = delalloc_start;
1301         *end = delalloc_end;
1302 out_failed:
1303         return found;
1304 }
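/*
 * Illustrative flow (sketch; 'page', 'tree', 'inode', 'page_start' and
 * 'max_bytes' are assumed to come from a writepage-style caller that
 * already holds 'page' locked):
 *
 *	u64 dstart = page_start;
 *	u64 dend = 0;
 *
 *	if (find_lock_delalloc_range(inode, tree, page, &dstart, &dend,
 *				     max_bytes))
 *		... [dstart, dend] is now known delalloc, with every page in
 *		    the range locked and the extent state range locked too ...
 */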
1305
1306 int extent_clear_unlock_delalloc(struct inode *inode,
1307                                 struct extent_io_tree *tree,
1308                                 u64 start, u64 end, struct page *locked_page,
1309                                 int clear_dirty, int set_writeback,
1310                                 int end_writeback)
1311 {
1312         int ret;
1313         struct page *pages[16];
1314         unsigned long index = start >> PAGE_CACHE_SHIFT;
1315         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1316         unsigned long nr_pages = end_index - index + 1;
1317         int i;
1318         int clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
1319
1320         if (clear_dirty)
1321                 clear_bits |= EXTENT_DIRTY;
1322
1323         clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
1324
1325         while(nr_pages > 0) {
1326                 ret = find_get_pages_contig(inode->i_mapping, index,
1327                                      min(nr_pages, ARRAY_SIZE(pages)), pages);
1328                 for (i = 0; i < ret; i++) {
1329                         if (pages[i] == locked_page) {
1330                                 page_cache_release(pages[i]);
1331                                 continue;
1332                         }
1333                         if (clear_dirty)
1334                                 clear_page_dirty_for_io(pages[i]);
1335                         if (set_writeback)
1336                                 set_page_writeback(pages[i]);
1337                         if (end_writeback)
1338                                 end_page_writeback(pages[i]);
1339                         unlock_page(pages[i]);
1340                         page_cache_release(pages[i]);
1341                 }
1342                 nr_pages -= ret;
1343                 index += ret;
1344                 cond_resched();
1345         }
1346         return 0;
1347 }
1348 EXPORT_SYMBOL(extent_clear_unlock_delalloc);
1349
1350 /*
1351  * count the number of bytes in the tree that have a given bit(s)
1352  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1353  * cached.  The total number found is returned.
1354  */
1355 u64 count_range_bits(struct extent_io_tree *tree,
1356                      u64 *start, u64 search_end, u64 max_bytes,
1357                      unsigned long bits)
1358 {
1359         struct rb_node *node;
1360         struct extent_state *state;
1361         u64 cur_start = *start;
1362         u64 total_bytes = 0;
1363         int found = 0;
1364
1365         if (search_end <= cur_start) {
1366                 printk("search_end %Lu start %Lu\n", search_end, cur_start);
1367                 WARN_ON(1);
1368                 return 0;
1369         }
1370
1371         spin_lock_irq(&tree->lock);
1372         if (cur_start == 0 && bits == EXTENT_DIRTY) {
1373                 total_bytes = tree->dirty_bytes;
1374                 goto out;
1375         }
1376         /*
1377          * this search will find all the extents that end after
1378          * our range starts.
1379          */
1380         node = tree_search(tree, cur_start);
1381         if (!node) {
1382                 goto out;
1383         }
1384
1385         while(1) {
1386                 state = rb_entry(node, struct extent_state, rb_node);
1387                 if (state->start > search_end)
1388                         break;
1389                 if (state->end >= cur_start && (state->state & bits)) {
1390                         total_bytes += min(search_end, state->end) + 1 -
1391                                        max(cur_start, state->start);
1392                         if (total_bytes >= max_bytes)
1393                                 break;
1394                         if (!found) {
1395                                 *start = state->start;
1396                                 found = 1;
1397                         }
1398                 }
1399                 node = rb_next(node);
1400                 if (!node)
1401                         break;
1402         }
1403 out:
1404         spin_unlock_irq(&tree->lock);
1405         return total_bytes;
1406 }
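/*
 * Note (added commentary): counting dirty bytes from offset zero, e.g.
 *
 *	start = 0;
 *	dirty = count_range_bits(tree, &start, (u64)-1, (u64)-1, EXTENT_DIRTY);
 *
 * takes the fast path above and simply returns the cached tree->dirty_bytes
 * total instead of walking the tree.
 */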
1407 /*
1408  * helper function to lock both pages and extents in the tree.
1409  * pages must be locked first.
1410  */
1411 int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1412 {
1413         unsigned long index = start >> PAGE_CACHE_SHIFT;
1414         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1415         struct page *page;
1416         int err;
1417
1418         while (index <= end_index) {
1419                 page = grab_cache_page(tree->mapping, index);
1420                 if (!page) {
1421                         err = -ENOMEM;
1422                         goto failed;
1423                 }
1424                 if (IS_ERR(page)) {
1425                         err = PTR_ERR(page);
1426                         goto failed;
1427                 }
1428                 index++;
1429         }
1430         lock_extent(tree, start, end, GFP_NOFS);
1431         return 0;
1432
1433 failed:
1434         /*
1435          * we failed above in getting the page at 'index', so we undo here
1436          * up to but not including the page at 'index'
1437          */
1438         end_index = index;
1439         index = start >> PAGE_CACHE_SHIFT;
1440         while (index < end_index) {
1441                 page = find_get_page(tree->mapping, index);
1442                 unlock_page(page);
1443                 page_cache_release(page);
1444                 index++;
1445         }
1446         return err;
1447 }
1448 EXPORT_SYMBOL(lock_range);
1449
1450 /*
1451  * helper function to unlock both pages and extents in the tree.
1452  */
1453 int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1454 {
1455         unsigned long index = start >> PAGE_CACHE_SHIFT;
1456         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1457         struct page *page;
1458
1459         while (index <= end_index) {
1460                 page = find_get_page(tree->mapping, index);
1461                 unlock_page(page);
1462                 page_cache_release(page);
1463                 index++;
1464         }
1465         unlock_extent(tree, start, end, GFP_NOFS);
1466         return 0;
1467 }
1468 EXPORT_SYMBOL(unlock_range);
1469
1470 /*
1471  * set the private field for a given byte offset in the tree.  If there isn't
1472  * an extent_state starting at that exact offset, -ENOENT is returned.
1473  */
1474 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1475 {
1476         struct rb_node *node;
1477         struct extent_state *state;
1478         int ret = 0;
1479
1480         spin_lock_irq(&tree->lock);
1481         /*
1482          * this search will find all the extents that end after
1483          * our range starts.
1484          */
1485         node = tree_search(tree, start);
1486         if (!node) {
1487                 ret = -ENOENT;
1488                 goto out;
1489         }
1490         state = rb_entry(node, struct extent_state, rb_node);
1491         if (state->start != start) {
1492                 ret = -ENOENT;
1493                 goto out;
1494         }
1495         state->private = private;
1496 out:
1497         spin_unlock_irq(&tree->lock);
1498         return ret;
1499 }
1500
1501 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1502 {
1503         struct rb_node *node;
1504         struct extent_state *state;
1505         int ret = 0;
1506
1507         spin_lock_irq(&tree->lock);
1508         /*
1509          * this search will find all the extents that end after
1510          * our range starts.
1511          */
1512         node = tree_search(tree, start);
1513         if (!node) {
1514                 ret = -ENOENT;
1515                 goto out;
1516         }
1517         state = rb_entry(node, struct extent_state, rb_node);
1518         if (state->start != start) {
1519                 ret = -ENOENT;
1520                 goto out;
1521         }
1522         *private = state->private;
1523 out:
1524         spin_unlock_irq(&tree->lock);
1525         return ret;
1526 }
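/*
 * Usage sketch (illustrative; 'block_start' and 'csum' are hypothetical
 * caller-side names): a caller can stash a per-extent cookie, for instance
 * a checksum for an I/O error handler, keyed by the start offset of an
 * extent_state that is already in the tree:
 *
 *	set_state_private(tree, block_start, csum);
 *	...
 *	err = get_state_private(tree, block_start, &private);
 *
 * Both return -ENOENT if no state starts at exactly that offset.
 */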
1527
1528 /*
1529  * searches a range in the state tree for a given mask.
1530  * If 'filled' == 1, this returns 1 only if every extent in the tree
1531  * has the bits set.  Otherwise, 1 is returned if any bit in the
1532  * range is found set.
1533  */
1534 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1535                    int bits, int filled)
1536 {
1537         struct extent_state *state = NULL;
1538         struct rb_node *node;
1539         int bitset = 0;
1540         unsigned long flags;
1541
1542         spin_lock_irqsave(&tree->lock, flags);
1543         node = tree_search(tree, start);
1544         while (node && start <= end) {
1545                 state = rb_entry(node, struct extent_state, rb_node);
1546
1547                 if (filled && state->start > start) {
1548                         bitset = 0;
1549                         break;
1550                 }
1551
1552                 if (state->start > end)
1553                         break;
1554
1555                 if (state->state & bits) {
1556                         bitset = 1;
1557                         if (!filled)
1558                                 break;
1559                 } else if (filled) {
1560                         bitset = 0;
1561                         break;
1562                 }
1563                 start = state->end + 1;
1564                 if (start > end)
1565                         break;
1566                 node = rb_next(node);
1567                 if (!node) {
1568                         if (filled)
1569                                 bitset = 0;
1570                         break;
1571                 }
1572         }
1573         spin_unlock_irqrestore(&tree->lock, flags);
1574         return bitset;
1575 }
1576 EXPORT_SYMBOL(test_range_bit);
1577
1578 /*
1579  * helper function to set a given page up to date if all the
1580  * extents in the tree for that page are up to date
1581  */
1582 static int check_page_uptodate(struct extent_io_tree *tree,
1583                                struct page *page)
1584 {
1585         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1586         u64 end = start + PAGE_CACHE_SIZE - 1;
1587         if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1588                 SetPageUptodate(page);
1589         return 0;
1590 }
1591
1592 /*
1593  * helper function to unlock a page if all the extents in the tree
1594  * for that page are unlocked
1595  */
1596 static int check_page_locked(struct extent_io_tree *tree,
1597                              struct page *page)
1598 {
1599         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1600         u64 end = start + PAGE_CACHE_SIZE - 1;
1601         if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1602                 unlock_page(page);
1603         return 0;
1604 }
1605
1606 /*
1607  * helper function to end page writeback if all the extents
1608  * in the tree for that page are done with writeback
1609  */
1610 static int check_page_writeback(struct extent_io_tree *tree,
1611                              struct page *page)
1612 {
1613         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1614         u64 end = start + PAGE_CACHE_SIZE - 1;
1615         if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1616                 end_page_writeback(page);
1617         return 0;
1618 }
1619
1620 /* lots and lots of room for performance fixes in the end_bio funcs */
1621
1622 /*
1623  * after a writepage IO is done, we need to:
1624  * clear the uptodate bits on error
1625  * clear the writeback bits in the extent tree for this IO
1626  * end_page_writeback if the page has no more pending IO
1627  *
1628  * Scheduling is not allowed, so the extent state tree is expected
1629  * to have one and only one object corresponding to this IO.
1630  */
1631 static void end_bio_extent_writepage(struct bio *bio, int err)
1632 {
1633         int uptodate = err == 0;
1634         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1635         struct extent_io_tree *tree;
1636         u64 start;
1637         u64 end;
1638         int whole_page;
1639         int ret;
1640
1641         do {
1642                 struct page *page = bvec->bv_page;
1643                 tree = &BTRFS_I(page->mapping->host)->io_tree;
1644
1645                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1646                          bvec->bv_offset;
1647                 end = start + bvec->bv_len - 1;
1648
1649                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1650                         whole_page = 1;
1651                 else
1652                         whole_page = 0;
1653
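                     /*
                      * the bio_vec array is walked backwards from the last
                      * entry; prefetch the next page's flags while this one
                      * is being finished off
                      */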
1654                 if (--bvec >= bio->bi_io_vec)
1655                         prefetchw(&bvec->bv_page->flags);
1656                 if (tree->ops && tree->ops->writepage_end_io_hook) {
1657                         ret = tree->ops->writepage_end_io_hook(page, start,
1658                                                        end, NULL, uptodate);
1659                         if (ret)
1660                                 uptodate = 0;
1661                 }
1662
1663                 if (!uptodate && tree->ops &&
1664                     tree->ops->writepage_io_failed_hook) {
1665                         ret = tree->ops->writepage_io_failed_hook(bio, page,
1666                                                          start, end, NULL);
1667                         if (ret == 0) {
1668                                 uptodate = (err == 0);
1669                                 continue;
1670                         }
1671                 }
1672
1673                 if (!uptodate) {
1674                         clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1675                         ClearPageUptodate(page);
1676                         SetPageError(page);
1677                 }
1678
1679                 clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1680
1681                 if (whole_page)
1682                         end_page_writeback(page);
1683                 else
1684                         check_page_writeback(tree, page);
1685         } while (bvec >= bio->bi_io_vec);
1686
1687         bio_put(bio);
1688 }
1689
1690 /*
1691  * after a readpage IO is done, we need to:
1692  * clear the uptodate bits on error
1693  * set the uptodate bits if things worked
1694  * set the page up to date if all extents in the tree are uptodate
1695  * clear the lock bit in the extent tree
1696  * unlock the page if there are no other extents locked for it
1697  *
1698  * Scheduling is not allowed, so the extent state tree is expected
1699  * to have one and only one object corresponding to this IO.
1700  */
1701 static void end_bio_extent_readpage(struct bio *bio, int err)
1702 {
1703         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1704         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1705         struct extent_io_tree *tree;
1706         u64 start;
1707         u64 end;
1708         int whole_page;
1709         int ret;
1710
1711         do {
1712                 struct page *page = bvec->bv_page;
1713                 tree = &BTRFS_I(page->mapping->host)->io_tree;
1714
1715                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1716                         bvec->bv_offset;
1717                 end = start + bvec->bv_len - 1;
1718
1719                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1720                         whole_page = 1;
1721                 else
1722                         whole_page = 0;
1723
1724                 if (--bvec >= bio->bi_io_vec)
1725                         prefetchw(&bvec->bv_page->flags);
1726
1727                 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1728                         ret = tree->ops->readpage_end_io_hook(page, start, end,
1729                                                               NULL);
1730                         if (ret)
1731                                 uptodate = 0;
1732                 }
1733                 if (!uptodate && tree->ops &&
1734                     tree->ops->readpage_io_failed_hook) {
1735                         ret = tree->ops->readpage_io_failed_hook(bio, page,
1736                                                          start, end, NULL);
1737                         if (ret == 0) {
1738                                 uptodate =
1739                                         test_bit(BIO_UPTODATE, &bio->bi_flags);
1740                                 continue;
1741                         }
1742                 }
1743
1744                 if (uptodate)
1745                         set_extent_uptodate(tree, start, end,
1746                                             GFP_ATOMIC);
1747                 unlock_extent(tree, start, end, GFP_ATOMIC);
1748
1749                 if (whole_page) {
1750                         if (uptodate) {
1751                                 SetPageUptodate(page);
1752                         } else {
1753                                 ClearPageUptodate(page);
1754                                 SetPageError(page);
1755                         }
1756                         unlock_page(page);
1757                 } else {
1758                         if (uptodate) {
1759                                 check_page_uptodate(tree, page);
1760                         } else {
1761                                 ClearPageUptodate(page);
1762                                 SetPageError(page);
1763                         }
1764                         check_page_locked(tree, page);
1765                 }
1766         } while (bvec >= bio->bi_io_vec);
1767
1768         bio_put(bio);
1769 }
1770
1771 /*
1772  * IO done from prepare_write is pretty simple: we just unlock
1773  * the structs in the extent tree when done and set the uptodate bits
1774  * as appropriate.
1775  */
1776 static void end_bio_extent_preparewrite(struct bio *bio, int err)
1777 {
1778         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1779         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1780         struct extent_io_tree *tree;
1781         u64 start;
1782         u64 end;
1783
1784         do {
1785                 struct page *page = bvec->bv_page;
1786                 tree = &BTRFS_I(page->mapping->host)->io_tree;
1787
1788                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1789                         bvec->bv_offset;
1790                 end = start + bvec->bv_len - 1;
1791
1792                 if (--bvec >= bio->bi_io_vec)
1793                         prefetchw(&bvec->bv_page->flags);
1794
1795                 if (uptodate) {
1796                         set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1797                 } else {
1798                         ClearPageUptodate(page);
1799                         SetPageError(page);
1800                 }
1801
1802                 unlock_extent(tree, start, end, GFP_ATOMIC);
1803
1804         } while (bvec >= bio->bi_io_vec);
1805
1806         bio_put(bio);
1807 }
1808
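/*
 * allocate a bio with room for nr_vecs pages.  If that fails while we are
 * a memory-reclaim task (PF_MEMALLOC), retry with progressively fewer vecs
 * so writeback can still make forward progress.
 */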
1809 static struct bio *
1810 extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1811                  gfp_t gfp_flags)
1812 {
1813         struct bio *bio;
1814
1815         bio = bio_alloc(gfp_flags, nr_vecs);
1816
1817         if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1818                 while (!bio && (nr_vecs /= 2))
1819                         bio = bio_alloc(gfp_flags, nr_vecs);
1820         }
1821
1822         if (bio) {
1823                 bio->bi_size = 0;
1824                 bio->bi_bdev = bdev;
1825                 bio->bi_sector = first_sector;
1826         }
1827         return bio;
1828 }
1829
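/*
 * hand a finished bio to the block layer.  When the tree provides a
 * submit_bio_hook the filesystem sees the bio first, presumably for things
 * like checksumming and mirror selection (an assumption based on the hook
 * arguments; the hook itself is defined elsewhere).
 */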
1830 static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1831                           unsigned long bio_flags)
1832 {
1833         int ret = 0;
1834         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1835         struct page *page = bvec->bv_page;
1836         struct extent_io_tree *tree = bio->bi_private;
1837         u64 start;
1838         u64 end;
1839
1840         start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1841         end = start + bvec->bv_len - 1;
1842
1843         bio->bi_private = NULL;
1844
1845         bio_get(bio);
1846
1847         if (tree->ops && tree->ops->submit_bio_hook)
1848                 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1849                                            mirror_num, bio_flags);
1850         else
1851                 submit_bio(rw, bio);
1852         if (bio_flagged(bio, BIO_EOPNOTSUPP))
1853                 ret = -EOPNOTSUPP;
1854         bio_put(bio);
1855         return ret;
1856 }
1857
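/*
 * add [offset, offset + size) of 'page' to the bio being built up in
 * *bio_ret.  If the new range is not contiguous with that bio, the
 * compression flags differ, merge_bio_hook rejects the merge or the bio is
 * simply full, the old bio is submitted and a fresh one is started for
 * this page.
 *
 * A sketch of the calling pattern used by the read/write loops below:
 *
 *	struct bio *bio = NULL;
 *	while (more work)
 *		submit_extent_page(rw, tree, page, sector, len, offset, bdev,
 *				   &bio, max_pages, end_io_func, mirror_num,
 *				   prev_bio_flags, bio_flags);
 *	if (bio)
 *		submit_one_bio(rw, bio, mirror_num, bio_flags);
 */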
1858 static int submit_extent_page(int rw, struct extent_io_tree *tree,
1859                               struct page *page, sector_t sector,
1860                               size_t size, unsigned long offset,
1861                               struct block_device *bdev,
1862                               struct bio **bio_ret,
1863                               unsigned long max_pages,
1864                               bio_end_io_t end_io_func,
1865                               int mirror_num,
1866                               unsigned long prev_bio_flags,
1867                               unsigned long bio_flags)
1868 {
1869         int ret = 0;
1870         struct bio *bio;
1871         int nr;
1872         int contig = 0;
1873         int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1874         int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1875         size_t page_size = min(size, PAGE_CACHE_SIZE);
1876
1877         if (bio_ret && *bio_ret) {
1878                 bio = *bio_ret;
1879                 if (old_compressed)
1880                         contig = bio->bi_sector == sector;
1881                 else
1882                         contig = bio->bi_sector + (bio->bi_size >> 9) ==
1883                                 sector;
1884
1885                 if (prev_bio_flags != bio_flags || !contig ||
1886                     (tree->ops && tree->ops->merge_bio_hook &&
1887                      tree->ops->merge_bio_hook(page, offset, page_size, bio,
1888                                                bio_flags)) ||
1889                     bio_add_page(bio, page, page_size, offset) < page_size) {
1890                         ret = submit_one_bio(rw, bio, mirror_num,
1891                                              prev_bio_flags);
1892                         bio = NULL;
1893                 } else {
1894                         return 0;
1895                 }
1896         }
1897         if (this_compressed)
1898                 nr = BIO_MAX_PAGES;
1899         else
1900                 nr = bio_get_nr_vecs(bdev);
1901
1902         bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1903         if (!bio) {
1904                 printk("failed to allocate bio nr %d\n", nr);
                        if (bio_ret)
                                *bio_ret = NULL;
                        return -ENOMEM;
1905         }
1906
1907         bio_add_page(bio, page, page_size, offset);
1908         bio->bi_end_io = end_io_func;
1909         bio->bi_private = tree;
1910
1911         if (bio_ret) {
1912                 *bio_ret = bio;
1913         } else {
1914                 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1915         }
1916
1917         return ret;
1918 }
1919
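/*
 * tag a page as belonging to the extent io code: PagePrivate is set, an
 * extra page reference is taken and EXTENT_PAGE_PRIVATE is stored in
 * page->private.  The matching release presumably happens when the private
 * state is torn down (not shown in this file).
 */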
1920 void set_page_extent_mapped(struct page *page)
1921 {
1922         if (!PagePrivate(page)) {
1923                 SetPagePrivate(page);
1924                 page_cache_get(page);
1925                 set_page_private(page, EXTENT_PAGE_PRIVATE);
1926         }
1927 }
1928
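/*
 * the first page of an extent buffer records the buffer length in
 * page->private, shifted left by two so the low bits stay free for flags
 * such as EXTENT_PAGE_PRIVATE_FIRST_PAGE.
 */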
1929 void set_page_extent_head(struct page *page, unsigned long len)
1930 {
1931         set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1932 }
1933
1934 /*
1935  * basic readpage implementation.  Locked extent state structs are inserted
1936  * into the tree and removed again when the IO is done (by the end_io
1937  * handlers)
1938  */
1939 static int __extent_read_full_page(struct extent_io_tree *tree,
1940                                    struct page *page,
1941                                    get_extent_t *get_extent,
1942                                    struct bio **bio, int mirror_num,
1943                                    unsigned long *bio_flags)
1944 {
1945         struct inode *inode = page->mapping->host;
1946         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1947         u64 page_end = start + PAGE_CACHE_SIZE - 1;
1948         u64 end;
1949         u64 cur = start;
1950         u64 extent_offset;
1951         u64 last_byte = i_size_read(inode);
1952         u64 block_start;
1953         u64 cur_end;
1954         sector_t sector;
1955         struct extent_map *em;
1956         struct block_device *bdev;
1957         int ret;
1958         int nr = 0;
1959         size_t page_offset = 0;
1960         size_t iosize;
1961         size_t disk_io_size;
1962         size_t blocksize = inode->i_sb->s_blocksize;
1963         unsigned long this_bio_flag = 0;
1964
1965         set_page_extent_mapped(page);
1966
1967         end = page_end;
1968         lock_extent(tree, start, end, GFP_NOFS);
1969
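             /*
              * if this is the last page of the file and i_size is not page
              * aligned, zero the tail past i_size so stale data is never
              * exposed beyond EOF
              */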
1970         if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1971                 char *userpage;
1972                 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
1973
1974                 if (zero_offset) {
1975                         iosize = PAGE_CACHE_SIZE - zero_offset;
1976                         userpage = kmap_atomic(page, KM_USER0);
1977                         memset(userpage + zero_offset, 0, iosize);
1978                         flush_dcache_page(page);
1979                         kunmap_atomic(userpage, KM_USER0);
1980                 }
1981         }
1982         while (cur <= end) {
1983                 if (cur >= last_byte) {
1984                         char *userpage;
1985                         iosize = PAGE_CACHE_SIZE - page_offset;
1986                         userpage = kmap_atomic(page, KM_USER0);
1987                         memset(userpage + page_offset, 0, iosize);
1988                         flush_dcache_page(page);
1989                         kunmap_atomic(userpage, KM_USER0);
1990                         set_extent_uptodate(tree, cur, cur + iosize - 1,
1991                                             GFP_NOFS);
1992                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1993                         break;
1994                 }
1995                 em = get_extent(inode, page, page_offset, cur,
1996                                 end - cur + 1, 0);
1997                 if (IS_ERR(em) || !em) {
1998                         SetPageError(page);
1999                         unlock_extent(tree, cur, end, GFP_NOFS);
2000                         break;
2001                 }
2002                 extent_offset = cur - em->start;
2003                 if (extent_map_end(em) <= cur) {
2004                         printk("bad mapping em [%Lu %Lu] cur %Lu\n", em->start, extent_map_end(em), cur);
2005                 }
2006                 BUG_ON(extent_map_end(em) <= cur);
2007                 if (end < cur) {
2008                         printk("2bad mapping end %Lu cur %Lu\n", end, cur);
2009                 }
2010                 BUG_ON(end < cur);
2011
2012                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2013                         this_bio_flag = EXTENT_BIO_COMPRESSED;
2014
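                     /*
                      * compressed extents are read as one bio covering the
                      * whole on-disk compressed chunk (em->block_len bytes
                      * at em->block_start); the compressed read path
                      * elsewhere in the fs inflates it into the pages.
                      * Regular extents just read the block aligned piece we
                      * need.
                      */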
2015                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2016                 cur_end = min(extent_map_end(em) - 1, end);
2017                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2018                 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2019                         disk_io_size = em->block_len;
2020                         sector = em->block_start >> 9;
2021                 } else {
2022                         sector = (em->block_start + extent_offset) >> 9;
2023                         disk_io_size = iosize;
2024                 }
2025                 bdev = em->bdev;
2026                 block_start = em->block_start;
2027                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2028                         block_start = EXTENT_MAP_HOLE;
2029                 free_extent_map(em);
2030                 em = NULL;
2031
2032                 /* we've found a hole, just zero and go on */
2033                 if (block_start == EXTENT_MAP_HOLE) {
2034                         char *userpage;
2035                         userpage = kmap_atomic(page, KM_USER0);
2036                         memset(userpage + page_offset, 0, iosize);
2037                         flush_dcache_page(page);
2038                         kunmap_atomic(userpage, KM_USER0);
2039
2040                         set_extent_uptodate(tree, cur, cur + iosize - 1,
2041                                             GFP_NOFS);
2042                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2043                         cur = cur + iosize;
2044                         page_offset += iosize;
2045                         continue;
2046                 }
2047                 /* the get_extent function already copied into the page */
2048                 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
2049                         check_page_uptodate(tree, page);
2050                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2051                         cur = cur + iosize;
2052                         page_offset += iosize;
2053                         continue;
2054                 }
2055                 /* we have an inline extent but it didn't get marked up
2056                  * to date.  Error out
2057                  */
2058                 if (block_start == EXTENT_MAP_INLINE) {
2059                         SetPageError(page);
2060                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2061                         cur = cur + iosize;
2062                         page_offset += iosize;
2063                         continue;
2064                 }
2065
2066                 ret = 0;
2067                 if (tree->ops && tree->ops->readpage_io_hook) {
2068                         ret = tree->ops->readpage_io_hook(page, cur,
2069                                                           cur + iosize - 1);
2070                 }
2071                 if (!ret) {
2072                         unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2073                         pnr -= page->index;
2074                         ret = submit_extent_page(READ, tree, page,
2075                                          sector, disk_io_size, page_offset,
2076                                          bdev, bio, pnr,
2077                                          end_bio_extent_readpage, mirror_num,
2078                                          *bio_flags,
2079                                          this_bio_flag);
2080                         nr++;
2081                         *bio_flags = this_bio_flag;
2082                 }
2083                 if (ret)
2084                         SetPageError(page);
2085                 cur = cur + iosize;
2086                 page_offset += iosize;
2087         }
2088         if (!nr) {
2089                 if (!PageError(page))
2090                         SetPageUptodate(page);
2091                 unlock_page(page);
2092         }
2093         return 0;
2094 }
2095
2096 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2097                             get_extent_t *get_extent)
2098 {
2099         struct bio *bio = NULL;
2100         unsigned long bio_flags = 0;
2101         int ret;
2102
2103         ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2104                                       &bio_flags);
2105         if (bio)
2106                 submit_one_bio(READ, bio, 0, bio_flags);
2107         return ret;
2108 }
2109 EXPORT_SYMBOL(extent_read_full_page);
2110
2111 /*
2112  * the writepage semantics are similar to regular writepage.  extent
2113  * records are inserted to lock ranges in the tree, and as dirty areas
2114  * are found, they are marked writeback.  Then the lock bits are removed
2115  * and the end_io handler clears the writeback ranges
2116  */
2117 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2118                               void *data)
2119 {
2120         struct inode *inode = page->mapping->host;
2121         struct extent_page_data *epd = data;
2122         struct extent_io_tree *tree = epd->tree;
2123         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2124         u64 delalloc_start;
2125         u64 page_end = start + PAGE_CACHE_SIZE - 1;
2126         u64 end;
2127         u64 cur = start;
2128         u64 extent_offset;
2129         u64 last_byte = i_size_read(inode);
2130         u64 block_start;
2131         u64 iosize;
2132         u64 unlock_start;
2133         sector_t sector;
2134         struct extent_map *em;
2135         struct block_device *bdev;
2136         int ret;
2137         int nr = 0;
2138         size_t pg_offset = 0;
2139         size_t blocksize;
2140         loff_t i_size = i_size_read(inode);
2141         unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2142         u64 nr_delalloc;
2143         u64 delalloc_end;
2144         int page_started;
2145         int compressed;
2146
2147         WARN_ON(!PageLocked(page));
2148         pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2149         if (page->index > end_index ||
2150            (page->index == end_index && !pg_offset)) {
2151                 page->mapping->a_ops->invalidatepage(page, 0);
2152                 unlock_page(page);
2153                 return 0;
2154         }
2155
2156         if (page->index == end_index) {
2157                 char *userpage;
2158
2159                 userpage = kmap_atomic(page, KM_USER0);
2160                 memset(userpage + pg_offset, 0,
2161                        PAGE_CACHE_SIZE - pg_offset);
2162                 kunmap_atomic(userpage, KM_USER0);
2163                 flush_dcache_page(page);
2164         }
2165         pg_offset = 0;
2166
2167         set_page_extent_mapped(page);
2168
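             /*
              * find and lock any delalloc ranges touching this page and ask
              * the filesystem's fill_delalloc hook to allocate extents for
              * them.  The hook sets page_started when it has already issued
              * the IO and unlocked the page itself (compressed writes are
              * believed to take that path -- an assumption, the hook body is
              * not in this file).
              */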
2169         delalloc_start = start;
2170         delalloc_end = 0;
2171         page_started = 0;
2172         while (delalloc_end < page_end) {
2173                 nr_delalloc = find_lock_delalloc_range(inode, tree,
2174                                                        page,
2175                                                        &delalloc_start,
2176                                                        &delalloc_end,
2177                                                        128 * 1024 * 1024);
2178                 if (nr_delalloc == 0) {
2179                         delalloc_start = delalloc_end + 1;
2180                         continue;
2181                 }
2182                 tree->ops->fill_delalloc(inode, page, delalloc_start,
2183                                          delalloc_end, &page_started);
2184                 delalloc_start = delalloc_end + 1;
2185         }
2186
2187         /* did the fill delalloc function already unlock and start the IO? */
2188         if (page_started) {
2189                 return 0;
2190         }
2191
2192         lock_extent(tree, start, page_end, GFP_NOFS);
2193         unlock_start = start;
2194
2195         if (tree->ops && tree->ops->writepage_start_hook) {
2196                 ret = tree->ops->writepage_start_hook(page, start,
2197                                                       page_end);
2198                 if (ret == -EAGAIN) {
2199                         unlock_extent(tree, start, page_end, GFP_NOFS);
2200                         redirty_page_for_writepage(wbc, page);
2201                         unlock_page(page);
2202                         return 0;
2203                 }
2204         }
2205
2206         end = page_end;
2207         if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
2208                 printk("found delalloc bits after lock_extent\n");
2209         }
2210
2211         if (last_byte <= start) {
2212                 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
2213                 unlock_extent(tree, start, page_end, GFP_NOFS);
2214                 if (tree->ops && tree->ops->writepage_end_io_hook)
2215                         tree->ops->writepage_end_io_hook(page, start,
2216                                                          page_end, NULL, 1);
2217                 unlock_start = page_end + 1;
2218                 goto done;
2219         }
2220
2221         set_extent_uptodate(tree, start, page_end, GFP_NOFS);
2222         blocksize = inode->i_sb->s_blocksize;
2223
2224         while (cur <= end) {
2225                 if (cur >= last_byte) {
2226                         clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
2227                         unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2228                         if (tree->ops && tree->ops->writepage_end_io_hook)
2229                                 tree->ops->writepage_end_io_hook(page, cur,
2230                                                          page_end, NULL, 1);
2231                         unlock_start = page_end + 1;
2232                         break;
2233                 }
2234                 em = epd->get_extent(inode, page, pg_offset, cur,
2235                                      end - cur + 1, 1);
2236                 if (IS_ERR(em) || !em) {
2237                         SetPageError(page);
2238                         break;
2239                 }
2240
2241                 extent_offset = cur - em->start;
2242                 BUG_ON(extent_map_end(em) <= cur);
2243                 BUG_ON(end < cur);
2244                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2245                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2246                 sector = (em->block_start + extent_offset) >> 9;
2247                 bdev = em->bdev;
2248                 block_start = em->block_start;
2249                 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2250                 free_extent_map(em);
2251                 em = NULL;
2252
2253                 /*
2254                  * compressed and inline extents are written through other
2255                  * paths in the FS
2256                  */
2257                 if (compressed || block_start == EXTENT_MAP_HOLE ||
2258                     block_start == EXTENT_MAP_INLINE) {
2259                         clear_extent_dirty(tree, cur,
2260                                            cur + iosize - 1, GFP_NOFS);
2261
2262                         unlock_extent(tree, unlock_start, cur + iosize - 1,
2263                                       GFP_NOFS);
2264
2265                         /*
2266                          * end_io notification does not happen here for
2267                          * compressed extents
2268                          */
2269                         if (!compressed && tree->ops &&
2270                             tree->ops->writepage_end_io_hook)
2271                                 tree->ops->writepage_end_io_hook(page, cur,
2272                                                          cur + iosize - 1,
2273                                                          NULL, 1);
2274                         else if (compressed) {
2275                                 /* we don't want to end_page_writeback on
2276                                  * a compressed extent.  this happens
2277                                  * elsewhere
2278                                  */
2279                                 nr++;
2280                         }
2281
2282                         cur += iosize;
2283                         pg_offset += iosize;
2284                         unlock_start = cur;
2285                         continue;
2286                 }
2287                 /* leave this out until we have a page_mkwrite call */
2288                 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2289                                    EXTENT_DIRTY, 0)) {
2290                         cur = cur + iosize;
2291                         pg_offset += iosize;
2292                         continue;
2293                 }
2294
2295                 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2296                 if (tree->ops && tree->ops->writepage_io_hook) {
2297                         ret = tree->ops->writepage_io_hook(page, cur,
2298                                                 cur + iosize - 1);
2299                 } else {
2300                         ret = 0;
2301                 }
2302                 if (ret) {
2303                         SetPageError(page);
2304                 } else {
2305                         unsigned long max_nr = end_index + 1;
2306
2307                         set_range_writeback(tree, cur, cur + iosize - 1);
2308                         if (!PageWriteback(page)) {
2309                                 printk("warning page %lu not writeback, "
2310                                        "cur %llu end %llu\n", page->index,
2311                                        (unsigned long long)cur,
2312                                        (unsigned long long)end);
2313                         }
2314
2315                         ret = submit_extent_page(WRITE, tree, page, sector,
2316                                                  iosize, pg_offset, bdev,
2317                                                  &epd->bio, max_nr,
2318                                                  end_bio_extent_writepage,
2319                                                  0, 0, 0);
2320                         if (ret)
2321                                 SetPageError(page);
2322                 }
2323                 cur = cur + iosize;
2324                 pg_offset += iosize;
2325                 nr++;
2326         }
2327 done:
2328         if (nr == 0) {
2329                 /* make sure the mapping tag for page dirty gets cleared */
2330                 set_page_writeback(page);
2331                 end_page_writeback(page);
2332         }
2333         if (unlock_start <= page_end)
2334                 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2335         unlock_page(page);
2336         return 0;
2337 }
2338
2339 /**
2340  * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2341  * @mapping: address space structure to write
2342  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2343  * @writepage: function called for each page
2344  * @data: data passed to writepage function
2345  *
2346  * If a page is already under I/O, write_cache_pages() skips it, even
2347  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
2348  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
2349  * and msync() need to guarantee that all the data which was dirty at the time
2350  * the call was made get new I/O started against them.  If wbc->sync_mode is
2351  * WB_SYNC_ALL then we were called for data integrity and we must wait for
2352  * existing IO to complete.
2353  */
2354 int extent_write_cache_pages(struct extent_io_tree *tree,
2355                              struct address_space *mapping,
2356                              struct writeback_control *wbc,
2357                              writepage_t writepage, void *data)
2358 {
2359         struct backing_dev_info *bdi = mapping->backing_dev_info;
2360         int ret = 0;
2361         int done = 0;
2362         struct pagevec pvec;
2363         int nr_pages;
2364         pgoff_t index;
2365         pgoff_t end;            /* Inclusive */
2366         int scanned = 0;
2367         int range_whole = 0;
2368
2369         if (wbc->nonblocking && bdi_write_congested(bdi)) {
2370                 wbc->encountered_congestion = 1;
2371                 return 0;
2372         }
2373
2374         pagevec_init(&pvec, 0);
2375         if (wbc->range_cyclic) {
2376                 index = mapping->writeback_index; /* Start from prev offset */
2377                 end = -1;
2378         } else {
2379                 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2380                 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2381                 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2382                         range_whole = 1;
2383                 scanned = 1;
2384         }
2385 retry:
2386         while (!done && (index <= end) &&
2387                (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2388                                               PAGECACHE_TAG_DIRTY,
2389                                               min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2390                 unsigned i;
2391
2392                 scanned = 1;
2393                 for (i = 0; i < nr_pages; i++) {
2394                         struct page *page = pvec.pages[i];
2395
2396                         /*
2397                          * At this point we hold neither mapping->tree_lock nor
2398                          * lock on the page itself: the page may be truncated or
2399                          * invalidated (changing page->mapping to NULL), or even
2400                          * swizzled back from swapper_space to tmpfs file
2401                          * mapping
2402                          */
2403                         if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2404                                 tree->ops->write_cache_pages_lock_hook(page);
2405                         else
2406                                 lock_page(page);
2407
2408                         if (unlikely(page->mapping != mapping)) {
2409                                 unlock_page(page);
2410                                 continue;
2411                         }
2412
2413                         if (!wbc->range_cyclic && page->index > end) {
2414                                 done = 1;
2415                                 unlock_page(page);
2416                                 continue;
2417                         }
2418
2419                         if (wbc->sync_mode != WB_SYNC_NONE)
2420                                 wait_on_page_writeback(page);
2421
2422                         if (PageWriteback(page) ||
2423                             !clear_page_dirty_for_io(page)) {
2424                                 unlock_page(page);
2425                                 continue;
2426                         }
2427
2428                         ret = (*writepage)(page, wbc, data);
2429
2430                         if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2431                                 unlock_page(page);
2432                                 ret = 0;
2433                         }
2434                         if (ret || (--(wbc->nr_to_write) <= 0))
2435                                 done = 1;
2436                         if (wbc->nonblocking && bdi_write_congested(bdi)) {
2437                                 wbc->encountered_congestion = 1;
2438                                 done = 1;
2439                         }
2440                 }
2441                 pagevec_release(&pvec);
2442                 cond_resched();
2443         }
2444         if (!scanned && !done) {
2445                 /*
2446                  * We hit the last page and there is more work to be done: wrap
2447                  * back to the start of the file
2448                  */
2449                 scanned = 1;
2450                 index = 0;
2451                 goto retry;
2452         }
2453         if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2454                 mapping->writeback_index = index;
2455
2456         if (wbc->range_cont)
2457                 wbc->range_start = (loff_t)index << PAGE_CACHE_SHIFT;
2458         return ret;
2459 }
2460 EXPORT_SYMBOL(extent_write_cache_pages);
2461
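/*
 * write out the given page, then opportunistically push up to 64 more
 * dirty pages that follow it in the file (WB_SYNC_NONE), presumably to get
 * larger, better clustered IO out of a single writepage call.
 */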
2462 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2463                           get_extent_t *get_extent,
2464                           struct writeback_control *wbc)
2465 {
2466         int ret;
2467         struct address_space *mapping = page->mapping;
2468         struct extent_page_data epd = {
2469                 .bio = NULL,
2470                 .tree = tree,
2471                 .get_extent = get_extent,
2472         };
2473         struct writeback_control wbc_writepages = {
2474                 .bdi            = wbc->bdi,
2475                 .sync_mode      = WB_SYNC_NONE,
2476                 .older_than_this = NULL,
2477                 .nr_to_write    = 64,
2478                 .range_start    = page_offset(page) + PAGE_CACHE_SIZE,
2479                 .range_end      = (loff_t)-1,
2480         };
2481
2482
2483         ret = __extent_writepage(page, wbc, &epd);
2484
2485         extent_write_cache_pages(tree, mapping, &wbc_writepages,
2486                                  __extent_writepage, &epd);
2487         if (epd.bio) {
2488                 submit_one_bio(WRITE, epd.bio, 0, 0);
2489         }
2490         return ret;
2491 }
2492 EXPORT_SYMBOL(extent_write_full_page);
2493
2494
2495 int extent_writepages(struct extent_io_tree *tree,
2496                       struct address_space *mapping,
2497                       get_extent_t *get_extent,
2498                       struct writeback_control *wbc)
2499 {
2500         int ret = 0;
2501         struct extent_page_data epd = {
2502                 .bio = NULL,
2503                 .tree = tree,
2504                 .get_extent = get_extent,
2505         };
2506
2507         ret = extent_write_cache_pages(tree, mapping, wbc,
2508                                        __extent_writepage, &epd);
2509         if (epd.bio) {
2510                 submit_one_bio(WRITE, epd.bio, 0, 0);
2511         }
2512         return ret;
2513 }
2514 EXPORT_SYMBOL(extent_writepages);
2515
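/*
 * readahead entry point: pull the pages off the readahead list, add them
 * to the page cache and read them via __extent_read_full_page so that a
 * single bio can be shared across neighbouring pages.
 */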
2516 int extent_readpages(struct extent_io_tree *tree,
2517                      struct address_space *mapping,
2518                      struct list_head *pages, unsigned nr_pages,
2519                      get_extent_t get_extent)
2520 {
2521         struct bio *bio = NULL;
2522         unsigned page_idx;
2523         struct pagevec pvec;
2524         unsigned long bio_flags = 0;
2525
2526         pagevec_init(&pvec, 0);
2527         for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2528                 struct page *page = list_entry(pages->prev, struct page, lru);
2529
2530                 prefetchw(&page->flags);
2531                 list_del(&page->lru);
2532                 /*
2533                  * what we want to do here is call add_to_page_cache_lru,
2534                  * but that isn't exported, so we reproduce it here
2535                  */
2536                 if (!add_to_page_cache(page, mapping,
2537                                         page->index, GFP_KERNEL)) {
2538
2539                         /* open coding of lru_cache_add, also not exported */
2540                         page_cache_get(page);
2541                         if (!pagevec_add(&pvec, page))
2542                                 __pagevec_lru_add(&pvec);
2543                         __extent_read_full_page(tree, page, get_extent,
2544                                                 &bio, 0, &bio_flags);
2545                 }
2546                 page_cache_release(page);
2547         }
2548         if (pagevec_count(&pvec))
2549                 __pagevec_lru_add(&pvec);
2550         BUG_ON(!list_empty(pages));
2551         if (bio)
2552                 submit_one_bio(READ, bio, 0, bio_flags);
2553         return 0;
2554 }
2555 EXPORT_SYMBOL(extent_readpages);
2556
2557 /*
2558  * basic invalidatepage code, this waits on any locked or writeback
2559  * ranges corresponding to the page, and then deletes any extent state
2560  * records from the tree
2561  */
2562 int extent_invalidatepage(struct extent_io_tree *tree,
2563                           struct page *page, unsigned long offset)
2564 {
2565         u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2566         u64 end = start + PAGE_CACHE_SIZE - 1;
2567         size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2568
2569         start += (offset + blocksize - 1) & ~(blocksize - 1);
2570         if (start > end)
2571                 return 0;
2572
2573         lock_extent(tree, start, end, GFP_NOFS);
2574         wait_on_extent_writeback(tree, start, end);
2575         clear_extent_bit(tree, start, end,
2576                          EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2577                          1, 1, GFP_NOFS);
2578         return 0;
2579 }
2580 EXPORT_SYMBOL(extent_invalidatepage);
2581
2582 /*
2583  * simple commit_write call: the page is marked dirty and the inode's
2584  * i_size is pushed out if the write extended past the old size
2585  */
2586 int extent_commit_write(struct extent_io_tree *tree,
2587                         struct inode *inode, struct page *page,
2588                         unsigned from, unsigned to)
2589 {
2590         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2591
2592         set_page_extent_mapped(page);
2593         set_page_dirty(page);
2594
2595         if (pos > inode->i_size) {
2596                 i_size_write(inode, pos);
2597                 mark_inode_dirty(inode);
2598         }
2599         return 0;
2600 }
2601 EXPORT_SYMBOL(extent_commit_write);
2602
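/*
 * prepare_write helper.  For every block touched by [from, to): zero the
 * parts of a freshly allocated block that the caller will not overwrite,
 * read the block from disk if it is neither new nor uptodate, or simply
 * mark it uptodate and unlock it.  Any reads issued here are waited on
 * before returning.
 */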
2603 int extent_prepare_write(struct extent_io_tree *tree,
2604                          struct inode *inode, struct page *page,
2605                          unsigned from, unsigned to, get_extent_t *get_extent)
2606 {
2607         u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2608         u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2609         u64 block_start;
2610         u64 orig_block_start;
2611         u64 block_end;
2612         u64 cur_end;
2613         struct extent_map *em;
2614         unsigned blocksize = 1 << inode->i_blkbits;
2615         size_t page_offset = 0;
2616         size_t block_off_start;
2617         size_t block_off_end;
2618         int err = 0;
2619         int iocount = 0;
2620         int ret = 0;
2621         int isnew;
2622
2623         set_page_extent_mapped(page);
2624
2625         block_start = (page_start + from) & ~((u64)blocksize - 1);
2626         block_end = (page_start + to - 1) | (blocksize - 1);
2627         orig_block_start = block_start;
2628
2629         lock_extent(tree, page_start, page_end, GFP_NOFS);
2630         while (block_start <= block_end) {
2631                 em = get_extent(inode, page, page_offset, block_start,
2632                                 block_end - block_start + 1, 1);
2633                 if (IS_ERR(em) || !em) {
2634                         goto err;
2635                 }
2636                 cur_end = min(block_end, extent_map_end(em) - 1);
2637                 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2638                 block_off_end = block_off_start + blocksize;
2639                 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2640
2641                 if (!PageUptodate(page) && isnew &&
2642                     (block_off_end > to || block_off_start < from)) {
2643                         void *kaddr;
2644
2645                         kaddr = kmap_atomic(page, KM_USER0);
2646                         if (block_off_end > to)
2647                                 memset(kaddr + to, 0, block_off_end - to);
2648                         if (block_off_start < from)
2649                                 memset(kaddr + block_off_start, 0,
2650                                        from - block_off_start);
2651                         flush_dcache_page(page);
2652                         kunmap_atomic(kaddr, KM_USER0);
2653                 }
2654                 if ((em->block_start != EXTENT_MAP_HOLE &&
2655                      em->block_start != EXTENT_MAP_INLINE) &&
2656                     !isnew && !PageUptodate(page) &&
2657                     (block_off_end > to || block_off_start < from) &&
2658                     !test_range_bit(tree, block_start, cur_end,
2659                                     EXTENT_UPTODATE, 1)) {
2660                         u64 sector;
2661                         u64 extent_offset = block_start - em->start;
2662                         size_t iosize;
2663                         sector = (em->block_start + extent_offset) >> 9;
2664                         iosize = (cur_end - block_start + blocksize) &
2665                                 ~((u64)blocksize - 1);
2666                         /*
2667                          * we've already got the extent locked, but we
2668                          * need to split the state such that our end_bio
2669                          * handler can clear the lock.
2670                          */
2671                         set_extent_bit(tree, block_start,
2672                                        block_start + iosize - 1,
2673                                        EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2674                         ret = submit_extent_page(READ, tree, page,
2675                                          sector, iosize, page_offset, em->bdev,
2676                                          NULL, 1,
2677                                          end_bio_extent_preparewrite, 0,
2678                                          0, 0);
2679                         iocount++;
2680                         block_start = block_start + iosize;
2681                 } else {
2682                         set_extent_uptodate(tree, block_start, cur_end,
2683                                             GFP_NOFS);
2684                         unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2685                         block_start = cur_end + 1;
2686                 }
2687                 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2688                 free_extent_map(em);
2689         }
2690         if (iocount) {
2691                 wait_extent_bit(tree, orig_block_start,
2692                                 block_end, EXTENT_LOCKED);
2693         }
2694         check_page_uptodate(tree, page);
2695 err:
2696         /* FIXME, zero out newly allocated blocks on error */
2697         return err;
2698 }
2699 EXPORT_SYMBOL(extent_prepare_write);
2700
2701 /*
2702  * a helper for releasepage, this tests for areas of the page that
2703  * are locked or under IO and drops the related state bits if it is safe
2704  * to drop the page.
2705  */
2706 int try_release_extent_state(struct extent_map_tree *map,
2707                              struct extent_io_tree *tree, struct page *page,
2708                              gfp_t mask)
2709 {
2710         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2711         u64 end = start + PAGE_CACHE_SIZE - 1;
2712         int ret = 1;
2713
2714         if (test_range_bit(tree, start, end,
2715                            EXTENT_IOBITS | EXTENT_ORDERED, 0))
2716                 ret = 0;
2717         else {
2718                 if ((mask & GFP_NOFS) == GFP_NOFS)
2719                         mask = GFP_NOFS;
2720                 clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
2721                                  1, 1, mask);
2722         }
2723         return ret;
2724 }
2725 EXPORT_SYMBOL(try_release_extent_state);
2726
2727 /*
2728  * a helper for releasepage.  As long as there are no locked extents
2729  * in the range corresponding to the page, both state records and extent
2730  * map records are removed
2731  */
2732 int try_release_extent_mapping(struct extent_map_tree *map,
2733                                struct extent_io_tree *tree, struct page *page,
2734                                gfp_t mask)
2735 {
2736         struct extent_map *em;
2737         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2738         u64 end = start + PAGE_CACHE_SIZE - 1;
2739
2740         if ((mask & __GFP_WAIT) &&
2741             page->mapping->host->i_size > 16 * 1024 * 1024) {
2742                 u64 len;
2743                 while (start <= end) {
2744                         len = end - start + 1;
2745                         spin_lock(&map->lock);
2746                         em = lookup_extent_mapping(map, start, len);
2747                         if (!em || IS_ERR(em)) {
2748                                 spin_unlock(&map->lock);
2749                                 break;
2750                         }
2751                         if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2752                             em->start != start) {
2753                                 spin_unlock(&map->lock);
2754                                 free_extent_map(em);
2755                                 break;
2756                         }
2757                         if (!test_range_bit(tree, em->start,
2758                                             extent_map_end(em) - 1,
2759                                             EXTENT_LOCKED | EXTENT_WRITEBACK |
2760                                             EXTENT_ORDERED,
2761                                             0)) {
2762                                 remove_extent_mapping(map, em);
2763                                 /* once for the rb tree */
2764                                 free_extent_map(em);
2765                         }
2766                         start = extent_map_end(em);
2767                         spin_unlock(&map->lock);
2768
2769                         /* once for us */
2770                         free_extent_map(em);
2771                 }
2772         }
2773         return try_release_extent_state(map, tree, page, mask);
2774 }
2775 EXPORT_SYMBOL(try_release_extent_mapping);
2776
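/*
 * bmap helper: map a logical block number to a sector on disk.  Holes and
 * other special extents (block_start above EXTENT_MAP_LAST_BYTE) are
 * reported as 0, which is what bmap callers expect for unmapped blocks.
 */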
2777 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2778                 get_extent_t *get_extent)
2779 {
2780         struct inode *inode = mapping->host;
2781         u64 start = (u64)iblock << inode->i_blkbits;
2782         sector_t sector = 0;
2783         size_t blksize = (1 << inode->i_blkbits);
2784         struct extent_map *em;
2785
2786         lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2787                     GFP_NOFS);
2788         em = get_extent(inode, NULL, 0, start, blksize, 0);
2789         unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2790                       GFP_NOFS);
2791         if (!em || IS_ERR(em))
2792                 return 0;
2793
2794         if (em->block_start > EXTENT_MAP_LAST_BYTE)
2795                 goto out;
2796
2797         sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2798 out:
2799         free_extent_map(em);
2800         return sector;
2801 }
2802
2803 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2804                                               unsigned long i)
2805 {
2806         struct page *p;
2807         struct address_space *mapping;
2808
2809         if (i == 0)
2810                 return eb->first_page;
2811         i += eb->start >> PAGE_CACHE_SHIFT;
2812         mapping = eb->first_page->mapping;
2813         if (!mapping)
2814                 return NULL;
2815
2816         /*
2817          * extent_buffer_page is only called after pinning the page
2818          * by increasing the reference count.  So we know the page must
2819          * be in the radix tree.
2820          */
2821         rcu_read_lock();
2822         p = radix_tree_lookup(&mapping->page_tree, i);
2823         rcu_read_unlock();
2824
2825         return p;
2826 }
2827
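/* how many pages are needed to cover the byte range [start, start + len) */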
2828 static inline unsigned long num_extent_pages(u64 start, u64 len)
2829 {
2830         return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2831                 (start >> PAGE_CACHE_SHIFT);
2832 }
2833
2834 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2835                                                    u64 start,
2836                                                    unsigned long len,
2837                                                    gfp_t mask)
2838 {
2839         struct extent_buffer *eb = NULL;
2840 #ifdef LEAK_DEBUG
2841         unsigned long flags;
2842 #endif
2843
2844         eb = kmem_cache_zalloc(extent_buffer_cache, mask);
             if (!eb)
                     return NULL;
2845         eb->start = start;
2846         eb->len = len;
2847         mutex_init(&eb->mutex);
2848 #ifdef LEAK_DEBUG
2849         spin_lock_irqsave(&leak_lock, flags);
2850         list_add(&eb->leak_list, &buffers);
2851         spin_unlock_irqrestore(&leak_lock, flags);
2852 #endif
2853         atomic_set(&eb->refs, 1);
2854
2855         return eb;
2856 }
2857
2858 static void __free_extent_buffer(struct extent_buffer *eb)
2859 {
2860 #ifdef LEAK_DEBUG
2861         unsigned long flags;
2862         spin_lock_irqsave(&leak_lock, flags);
2863         list_del(&eb->leak_list);
2864         spin_unlock_irqrestore(&leak_lock, flags);
2865 #endif
2866         kmem_cache_free(extent_buffer_cache, eb);
2867 }
2868
2869 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2870                                           u64 start, unsigned long len,
2871                                           struct page *page0,
2872                                           gfp_t mask)
2873 {
2874         unsigned long num_pages = num_extent_pages(start, len);
2875         unsigned long i;
2876         unsigned long index = start >> PAGE_CACHE_SHIFT;
2877         struct extent_buffer *eb;
2878         struct extent_buffer *exists = NULL;
2879         struct page *p;
2880         struct address_space *mapping = tree->mapping;
2881         int uptodate = 1;
2882
2883         spin_lock(&tree->buffer_lock);
2884         eb = buffer_search(tree, start);
2885         if (eb) {
2886                 atomic_inc(&eb->refs);
2887                 spin_unlock(&tree->buffer_lock);
2888                 mark_page_accessed(eb->first_page);
2889                 return eb;
2890         }
2891         spin_unlock(&tree->buffer_lock);
2892
2893         eb = __alloc_extent_buffer(tree, start, len, mask);
2894         if (!eb)
2895                 return NULL;
2896
2897         if (page0) {
2898                 eb->first_page = page0;
2899                 i = 1;
2900                 index++;
2901                 page_cache_get(page0);
2902                 mark_page_accessed(page0);
2903                 set_page_extent_mapped(page0);
2904                 set_page_extent_head(page0, len);
2905                 uptodate = PageUptodate(page0);
2906         } else {
2907                 i = 0;
2908         }
2909         for (; i < num_pages; i++, index++) {
2910                 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2911                 if (!p) {
2912                         WARN_ON(1);
2913                         goto free_eb;
2914                 }
2915                 set_page_extent_mapped(p);
2916                 mark_page_accessed(p);
2917                 if (i == 0) {
2918                         eb->first_page = p;
2919                         set_page_extent_head(p, len);
2920                 } else {
2921                         set_page_private(p, EXTENT_PAGE_PRIVATE);
2922                 }
2923                 if (!PageUptodate(p))
2924                         uptodate = 0;
2925                 unlock_page(p);
2926         }
2927         if (uptodate)
2928                 eb->flags |= EXTENT_UPTODATE;
2929         eb->flags |= EXTENT_BUFFER_FILLED;
2930
2931         spin_lock(&tree->buffer_lock);
2932         exists = buffer_tree_insert(tree, start, &eb->rb_node);
2933         if (exists) {
2934                 /* add one reference for the caller */
2935                 atomic_inc(&exists->refs);
2936                 spin_unlock(&tree->buffer_lock);
2937                 goto free_eb;
2938         }
2939         spin_unlock(&tree->buffer_lock);
2940
2941         /* add one reference for the tree */
2942         atomic_inc(&eb->refs);
2943         return eb;
2944
2945 free_eb:
2946         if (!atomic_dec_and_test(&eb->refs))
2947                 return exists;
2948         for (index = 1; index < i; index++)
2949                 page_cache_release(extent_buffer_page(eb, index));
             /* i == 0 means no page was ever attached, not even first_page */
             if (i > 0)
2950                 page_cache_release(extent_buffer_page(eb, 0));
2951         __free_extent_buffer(eb);
2952         return exists;
2953 }
2954 EXPORT_SYMBOL(alloc_extent_buffer);
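
     /*
      * Example (a hedged sketch, not code from this file): the usual pattern is
      * to pin a tree block by allocating (or re-finding) its extent_buffer and
      * then pulling its pages in.  "my_get_extent" stands in for whatever
      * get_extent_t hook the caller provides; error handling is minimal.
      */
     static struct extent_buffer *example_read_tree_block(struct extent_io_tree *tree,
                                                          u64 bytenr, u32 blocksize,
                                                          get_extent_t *my_get_extent)
     {
             struct extent_buffer *eb;
             int ret;

             eb = alloc_extent_buffer(tree, bytenr, blocksize, NULL, GFP_NOFS);
             if (!eb)
                     return NULL;
             /* wait == 1: block until every page is read or an error is seen */
             ret = read_extent_buffer_pages(tree, eb, 0, 1, my_get_extent, 0);
             if (ret) {
                     free_extent_buffer(eb);
                     return NULL;
             }
             return eb;
     }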
2955
2956 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
2957                                          u64 start, unsigned long len,
2958                                           gfp_t mask)
2959 {
2960         struct extent_buffer *eb;
2961
2962         spin_lock(&tree->buffer_lock);
2963         eb = buffer_search(tree, start);
2964         if (eb)
2965                 atomic_inc(&eb->refs);
2966         spin_unlock(&tree->buffer_lock);
2967
2968         if (eb)
2969                 mark_page_accessed(eb->first_page);
2970
2971         return eb;
2972 }
2973 EXPORT_SYMBOL(find_extent_buffer);
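
     /*
      * Example (a hedged sketch): find_extent_buffer only ever returns buffers
      * that are already in the tree's cache, and it takes a reference that the
      * caller must drop again with free_extent_buffer.
      */
     static int example_block_is_cached(struct extent_io_tree *tree, u64 bytenr,
                                        unsigned long blocksize)
     {
             struct extent_buffer *eb;

             eb = find_extent_buffer(tree, bytenr, blocksize, GFP_NOFS);
             if (!eb)
                     return 0;
             free_extent_buffer(eb);
             return 1;
     }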
2974
2975 void free_extent_buffer(struct extent_buffer *eb)
2976 {
2977         if (!eb)
2978                 return;
2979
2980         if (!atomic_dec_and_test(&eb->refs))
2981                 return;
2982
             /*
              * The tree holds its own reference (taken in alloc_extent_buffer)
              * and try_release_extent_buffer is the only place an extent_buffer
              * is actually freed, so the count should never hit zero here.
              */
2983         WARN_ON(1);
2984 }
2985 EXPORT_SYMBOL(free_extent_buffer);
2986
2987 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
2988                               struct extent_buffer *eb)
2989 {
2990         int set;
2991         unsigned long i;
2992         unsigned long num_pages;
2993         struct page *page;
2994
2995         u64 start = eb->start;
2996         u64 end = start + eb->len - 1;
2997
2998         set = clear_extent_dirty(tree, start, end, GFP_NOFS);
2999         num_pages = num_extent_pages(eb->start, eb->len);
3000
3001         for (i = 0; i < num_pages; i++) {
3002                 page = extent_buffer_page(eb, i);
3003                 lock_page(page);
3004                 if (i == 0)
3005                         set_page_extent_head(page, eb->len);
3006                 else
3007                         set_page_private(page, EXTENT_PAGE_PRIVATE);
3008
3009                 /*
3010                  * if we're on the last page or the first page and the
3011                  * block isn't aligned on a page boundary, do extra checks
3012                  * to make sure we don't clean a page that is partially dirty
3013                  */
3014                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3015                     ((i == num_pages - 1) &&
3016                      ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3017                         start = (u64)page->index << PAGE_CACHE_SHIFT;
3018                         end  = start + PAGE_CACHE_SIZE - 1;
3019                         if (test_range_bit(tree, start, end,
3020                                            EXTENT_DIRTY, 0)) {
3021                                 unlock_page(page);
3022                                 continue;
3023                         }
3024                 }
3025                 clear_page_dirty_for_io(page);
3026                 spin_lock_irq(&page->mapping->tree_lock);
3027                 if (!PageDirty(page)) {
3028                         radix_tree_tag_clear(&page->mapping->page_tree,
3029                                                 page_index(page),
3030                                                 PAGECACHE_TAG_DIRTY);
3031                 }
3032                 spin_unlock_irq(&page->mapping->tree_lock);
3033                 unlock_page(page);
3034         }
3035         return 0;
3036 }
3037 EXPORT_SYMBOL(clear_extent_buffer_dirty);
3038
3039 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
3040                                     struct extent_buffer *eb)
3041 {
3042         return wait_on_extent_writeback(tree, eb->start,
3043                                         eb->start + eb->len - 1);
3044 }
3045 EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
3046
3047 int set_extent_buffer_dirty(struct extent_io_tree *tree,
3048                              struct extent_buffer *eb)
3049 {
3050         unsigned long i;
3051         unsigned long num_pages;
3052
3053         num_pages = num_extent_pages(eb->start, eb->len);
3054         for (i = 0; i < num_pages; i++) {
3055                 struct page *page = extent_buffer_page(eb, i);
3056                 /* writepage may need to do something special for the
3057                  * first page, so we have to make sure page->private is
3058                  * properly set.  releasepage may drop page->private
3059                  * on us if the page isn't already dirty.
3060                  */
3061                 lock_page(page);
3062                 if (i == 0) {
3063                         set_page_extent_head(page, eb->len);
3064                 } else if (PagePrivate(page) &&
3065                            page->private != EXTENT_PAGE_PRIVATE) {
3066                         set_page_extent_mapped(page);
3067                 }
3068                 __set_page_dirty_nobuffers(page);
3069                 set_extent_dirty(tree, page_offset(page),
3070                                  page_offset(page) + PAGE_CACHE_SIZE - 1,
3071                                  GFP_NOFS);
3072                 unlock_page(page);
3073         }
3074         return 0;
3075 }
3076 EXPORT_SYMBOL(set_extent_buffer_dirty);
3077
3078 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3079                                 struct extent_buffer *eb)
3080 {
3081         unsigned long i;
3082         struct page *page;
3083         unsigned long num_pages;
3084
3085         num_pages = num_extent_pages(eb->start, eb->len);
3086         eb->flags &= ~EXTENT_UPTODATE;
3087
3088         clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3089                               GFP_NOFS);
3090         for (i = 0; i < num_pages; i++) {
3091                 page = extent_buffer_page(eb, i);
3092                 if (page)
3093                         ClearPageUptodate(page);
3094         }
3095         return 0;
3096 }
3097
3098 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3099                                 struct extent_buffer *eb)
3100 {
3101         unsigned long i;
3102         struct page *page;
3103         unsigned long num_pages;
3104
3105         num_pages = num_extent_pages(eb->start, eb->len);
3106
3107         set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3108                             GFP_NOFS);
3109         for (i = 0; i < num_pages; i++) {
3110                 page = extent_buffer_page(eb, i);
3111                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3112                     ((i == num_pages - 1) &&
3113                      ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3114                         check_page_uptodate(tree, page);
3115                         continue;
3116                 }
3117                 SetPageUptodate(page);
3118         }
3119         return 0;
3120 }
3121 EXPORT_SYMBOL(set_extent_buffer_uptodate);
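
     /*
      * Example (a hedged sketch): a freshly allocated block that the caller has
      * fully initialized never needs a read from disk, so it can be zeroed,
      * marked uptodate and dirtied in one go (memset_extent_buffer and
      * set_extent_buffer_dirty are declared in extent_io.h).
      */
     static void example_init_new_block(struct extent_io_tree *tree,
                                        struct extent_buffer *eb)
     {
             memset_extent_buffer(eb, 0, 0, eb->len);
             set_extent_buffer_uptodate(tree, eb);
             set_extent_buffer_dirty(tree, eb);
     }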
3122
3123 int extent_range_uptodate(struct extent_io_tree *tree,
3124                           u64 start, u64 end)
3125 {
3126         struct page *page;
3127         int ret;
3128         int pg_uptodate = 1;
3129         int uptodate;
3130         unsigned long index;
3131
3132         ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
3133         if (ret)
3134                 return 1;
3135         while(start <= end) {
3136                 index = start >> PAGE_CACHE_SHIFT;
3137                 page = find_get_page(tree->mapping, index);
                     if (!page) {
                             pg_uptodate = 0;
                             break;
                     }
3138                 uptodate = PageUptodate(page);
3139                 page_cache_release(page);
3140                 if (!uptodate) {
3141                         pg_uptodate = 0;
3142                         break;
3143                 }
3144                 start += PAGE_CACHE_SIZE;
3145         }
3146         return pg_uptodate;
3147 }
3148
3149 int extent_buffer_uptodate(struct extent_io_tree *tree,
3150                            struct extent_buffer *eb)
3151 {
3152         int ret = 0;
3153         unsigned long num_pages;
3154         unsigned long i;
3155         struct page *page;
3156         int pg_uptodate = 1;
3157
3158         if (eb->flags & EXTENT_UPTODATE)
3159                 return 1;
3160
3161         ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3162                            EXTENT_UPTODATE, 1);
3163         if (ret)
3164                 return ret;
3165
3166         num_pages = num_extent_pages(eb->start, eb->len);
3167         for (i = 0; i < num_pages; i++) {
3168                 page = extent_buffer_page(eb, i);
3169                 if (!PageUptodate(page)) {
3170                         pg_uptodate = 0;
3171                         break;
3172                 }
3173         }
3174         return pg_uptodate;
3175 }
3176 EXPORT_SYMBOL(extent_buffer_uptodate);
3177
3178 int read_extent_buffer_pages(struct extent_io_tree *tree,
3179                              struct extent_buffer *eb,
3180                              u64 start, int wait,
3181                              get_extent_t *get_extent, int mirror_num)
3182 {
3183         unsigned long i;
3184         unsigned long start_i;
3185         struct page *page;
3186         int err;
3187         int ret = 0;
3188         int locked_pages = 0;
3189         int all_uptodate = 1;
3190         int inc_all_pages = 0;
3191         unsigned long num_pages;
3192         struct bio *bio = NULL;
3193         unsigned long bio_flags = 0;
3194
3195         if (eb->flags & EXTENT_UPTODATE)
3196                 return 0;
3197
3198         if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3199                            EXTENT_UPTODATE, 1)) {
3200                 return 0;
3201         }
3202
3203         if (start) {
3204                 WARN_ON(start < eb->start);
3205                 start_i = (start >> PAGE_CACHE_SHIFT) -
3206                         (eb->start >> PAGE_CACHE_SHIFT);
3207         } else {
3208                 start_i = 0;
3209         }
3210
3211         num_pages = num_extent_pages(eb->start, eb->len);
3212         for (i = start_i; i < num_pages; i++) {
3213                 page = extent_buffer_page(eb, i);
3214                 if (!wait) {
3215                         if (!trylock_page(page))
3216                                 goto unlock_exit;
3217                 } else {
3218                         lock_page(page);
3219                 }
3220                 locked_pages++;
3221                 if (!PageUptodate(page)) {
3222                         all_uptodate = 0;
3223                 }
3224         }
3225         if (all_uptodate) {
3226                 if (start_i == 0)
3227                         eb->flags |= EXTENT_UPTODATE;
3228                 if (ret) {
3229                         printk(KERN_ERR "all up to date but ret is %d\n", ret);
3230                 }
3231                 goto unlock_exit;
3232         }
3233
3234         for (i = start_i; i < num_pages; i++) {
3235                 page = extent_buffer_page(eb, i);
3236                 if (inc_all_pages)
3237                         page_cache_get(page);
3238                 if (!PageUptodate(page)) {
3239                         if (start_i == 0)
3240                                 inc_all_pages = 1;
3241                         ClearPageError(page);
3242                         err = __extent_read_full_page(tree, page,
3243                                                       get_extent, &bio,
3244                                                       mirror_num, &bio_flags);
3245                         if (err) {
3246                                 ret = err;
3247                                 printk(KERN_ERR "err %d from __extent_read_full_page\n", ret);
3248                         }
3249                 } else {
3250                         unlock_page(page);
3251                 }
3252         }
3253
3254         if (bio)
3255                 submit_one_bio(READ, bio, mirror_num, bio_flags);
3256
3257         if (ret || !wait) {
3258                 if (ret)
3259                         printk(KERN_ERR "ret %d wait %d returning\n", ret, wait);
3260                 return ret;
3261         }
3262         for (i = start_i; i < num_pages; i++) {
3263                 page = extent_buffer_page(eb, i);
3264                 wait_on_page_locked(page);
3265                 if (!PageUptodate(page)) {
3266                         printk(KERN_ERR "page not uptodate after wait_on_page_locked\n");
3267                         ret = -EIO;
3268                 }
3269         }
3270         if (!ret)
3271                 eb->flags |= EXTENT_UPTODATE;
3272         return ret;
3273
3274 unlock_exit:
3275         i = start_i;
3276         while(locked_pages > 0) {
3277                 page = extent_buffer_page(eb, i);
3278                 i++;
3279                 unlock_page(page);
3280                 locked_pages--;
3281         }
3282         return ret;
3283 }
3284 EXPORT_SYMBOL(read_extent_buffer_pages);
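
     /*
      * Example (a hedged sketch): readahead-style use.  With wait == 0 the call
      * gives up if any page is already locked, otherwise it kicks off the reads
      * and returns without waiting for them to finish; the caller comes back
      * later with wait == 1 (or checks extent_buffer_uptodate) before it
      * touches the contents.
      */
     static void example_readahead_tree_block(struct extent_io_tree *tree,
                                              struct extent_buffer *eb,
                                              get_extent_t *my_get_extent)
     {
             /* the return value is only a hint here; real readers check it */
             read_extent_buffer_pages(tree, eb, 0, 0, my_get_extent, 0);
     }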
3285
3286 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3287                         unsigned long start,
3288                         unsigned long len)
3289 {
3290         size_t cur;
3291         size_t offset;
3292         struct page *page;
3293         char *kaddr;
3294         char *dst = (char *)dstv;
3295         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3296         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3297
3298         WARN_ON(start > eb->len);
3299         WARN_ON(start + len > eb->start + eb->len);
3300
3301         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3302
3303         while(len > 0) {
3304                 page = extent_buffer_page(eb, i);
3305
3306                 cur = min(len, (PAGE_CACHE_SIZE - offset));
3307                 kaddr = kmap_atomic(page, KM_USER1);
3308                 memcpy(dst, kaddr + offset, cur);
3309                 kunmap_atomic(kaddr, KM_USER1);
3310
3311                 dst += cur;
3312                 len -= cur;
3313                 offset = 0;
3314                 i++;
3315         }
3316 }
3317 EXPORT_SYMBOL(read_extent_buffer);
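
     /*
      * Example (a hedged sketch): read_extent_buffer hides the page boundaries,
      * so callers can copy structures out of the buffer as if it were flat
      * memory.  "struct example_header" is a made-up stand-in for a real
      * on-disk structure.
      */
     struct example_header {
             __le64 bytenr;
             __le64 generation;
     };

     static void example_read_header(struct extent_buffer *eb,
                                     struct example_header *hdr)
     {
             read_extent_buffer(eb, hdr, 0, sizeof(*hdr));
     }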
3318
3319 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3320                                unsigned long min_len, char **token, char **map,
3321                                unsigned long *map_start,
3322                                unsigned long *map_len, int km)
3323 {
3324         size_t offset = start & (PAGE_CACHE_SIZE - 1);
3325         char *kaddr;
3326         struct page *p;
3327         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3328         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3329         unsigned long end_i = (start_offset + start + min_len - 1) >>
3330                 PAGE_CACHE_SHIFT;
3331
3332         if (i != end_i)
3333                 return -EINVAL;
3334
3335         if (i == 0) {
3336                 offset = start_offset;
3337                 *map_start = 0;
3338         } else {
3339                 offset = 0;
3340                 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3341         }
3342         if (start + min_len > eb->len) {
3343                 printk(KERN_ERR "bad mapping eb start %Lu len %lu, wanted %lu %lu\n",
                            eb->start, eb->len, start, min_len);
3344                 WARN_ON(1);
3345         }
3346
3347         p = extent_buffer_page(eb, i);
3348         kaddr = kmap_atomic(p, km);
3349         *token = kaddr;
3350         *map = kaddr + offset;
3351         *map_len = PAGE_CACHE_SIZE - offset;
3352         return 0;
3353 }
3354 EXPORT_SYMBOL(map_private_extent_buffer);
3355
3356 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3357                       unsigned long min_len,
3358                       char **token, char **map,
3359                       unsigned long *map_start,
3360                       unsigned long *map_len, int km)
3361 {
3362         int err;
3363         int save = 0;
3364         if (eb->map_token) {
3365                 unmap_extent_buffer(eb, eb->map_token, km);
3366                 eb->map_token = NULL;
3367                 save = 1;
3368         }
3369         err = map_private_extent_buffer(eb, start, min_len, token, map,
3370                                        map_start, map_len, km);
3371         if (!err && save) {
3372                 eb->map_token = *token;
3373                 eb->kaddr = *map;
3374                 eb->map_start = *map_start;
3375                 eb->map_len = *map_len;
3376         }
3377         return err;
3378 }
3379 EXPORT_SYMBOL(map_extent_buffer);
3380
3381 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3382 {
3383         kunmap_atomic(token, km);
3384 }
3385 EXPORT_SYMBOL(unmap_extent_buffer);
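
     /*
      * Example (a hedged sketch): the map/unmap pair hands out a kmap_atomic
      * window into a single page of the buffer, so a mapping never spans a page
      * boundary (map_private_extent_buffer returns -EINVAL if min_len would).
      * The byte at buffer offset "offset" lives at map + (offset - map_start).
      */
     static u8 example_read_one_byte(struct extent_buffer *eb, unsigned long offset)
     {
             char *token;
             char *kaddr;
             unsigned long map_start;
             unsigned long map_len;
             u8 val = 0;

             if (map_extent_buffer(eb, offset, 1, &token, &kaddr,
                                   &map_start, &map_len, KM_USER0) == 0) {
                     val = *(u8 *)(kaddr + (offset - map_start));
                     unmap_extent_buffer(eb, token, KM_USER0);
             }
             return val;
     }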
3386
3387 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3388                           unsigned long start,
3389                           unsigned long len)
3390 {
3391         size_t cur;
3392         size_t offset;
3393         struct page *page;
3394         char *kaddr;
3395         char *ptr = (char *)ptrv;
3396         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3397         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3398         int ret = 0;
3399
3400         WARN_ON(start > eb->len);
3401         WARN_ON(start + len > eb->start + eb->len);
3402
3403         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3404
3405         while(len > 0) {
3406                 page = extent_buffer_page(eb, i);
3407
3408                 cur = min(len, (PAGE_CACHE_SIZE - offset));
3409
3410                 kaddr = kmap_atomic(page, KM_USER0);
3411                 ret = memcmp(ptr, kaddr + offset, cur);
3412                 kunmap_atomic(kaddr, KM_USER0);
3413                 if (ret)
3414                         break;
3415
3416                 ptr += cur;
3417                 len -= cur;
3418                 offset = 0;
3419                 i++;
3420         }
3421         return ret;
3422 }
3423 EXPORT_SYMBOL(memcmp_extent_buffer);
3424
3425 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3426                          unsigned long start, unsigned long len)
3427 {
3428         size_t cur;
3429         size_t offset;
3430         struct page *page;
3431         char *kaddr;
3432         char *src = (char *)srcv;
3433         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3434         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3435
3436         WARN_ON(start > eb->len);
3437         WARN_ON(start + len > eb->start + eb->len);
3438
3439         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3440
3441         while(len > 0) {
3442                 page = extent_buffer_page(eb, i);
3443                 WARN_ON(!PageUptodate(page));
3444
3445                 cur = min(len, PAGE_CACHE_SIZE - offset);
3446                 kaddr = kmap_atomic(page, KM_USER1);
3447                 memcpy(kaddr + offset, src, cur);
3448                 kunmap_atomic(kaddr, KM_USER1);
3449
3450                 src += cur;
3451                 len -= cur;
3452                 offset = 0;
3453                 i++;
3454         }
3455 }
3456 EXPORT_SYMBOL(write_extent_buffer);
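
     /*
      * Example (a hedged sketch): writers copy into the buffer and then mark
      * the whole extent_buffer dirty so its pages are picked up by writeback.
      */
     static void example_update_label(struct extent_io_tree *tree,
                                      struct extent_buffer *eb,
                                      const char *label, unsigned long len)
     {
             write_extent_buffer(eb, label, 0, len);
             set_extent_buffer_dirty(tree, eb);
     }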
3457
3458 void memset_extent_buffer(struct extent_buffer *eb, char c,
3459                           unsigned long start, unsigned long len)
3460 {
3461         size_t cur;
3462         size_t offset;
3463         struct page *page;
3464         char *kaddr;
3465         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3466         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3467
3468         WARN_ON(start > eb->len);
3469         WARN_ON(start + len > eb->start + eb->len);
3470
3471         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3472
3473         while(len > 0) {
3474                 page = extent_buffer_page(eb, i);
3475                 WARN_ON(!PageUptodate(page));
3476
3477                 cur = min(len, PAGE_CACHE_SIZE - offset);
3478                 kaddr = kmap_atomic(page, KM_USER0);
3479                 memset(kaddr + offset, c, cur);
3480                 kunmap_atomic(kaddr, KM_USER0);
3481
3482                 len -= cur;
3483                 offset = 0;
3484                 i++;
3485         }
3486 }
3487 EXPORT_SYMBOL(memset_extent_buffer);
3488
3489 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3490                         unsigned long dst_offset, unsigned long src_offset,
3491                         unsigned long len)
3492 {
3493         u64 dst_len = dst->len;
3494         size_t cur;
3495         size_t offset;
3496         struct page *page;
3497         char *kaddr;
3498         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3499         unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3500
3501         WARN_ON(src->len != dst_len);
3502
3503         offset = (start_offset + dst_offset) &
3504                 ((unsigned long)PAGE_CACHE_SIZE - 1);
3505
3506         while(len > 0) {
3507                 page = extent_buffer_page(dst, i);
3508                 WARN_ON(!PageUptodate(page));
3509
3510                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3511
3512                 kaddr = kmap_atomic(page, KM_USER0);
3513                 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3514                 kunmap_atomic(kaddr, KM_USER0);
3515
3516                 src_offset += cur;
3517                 len -= cur;
3518                 offset = 0;
3519                 i++;
3520         }
3521 }
3522 EXPORT_SYMBOL(copy_extent_buffer);
3523
3524 static void move_pages(struct page *dst_page, struct page *src_page,
3525                        unsigned long dst_off, unsigned long src_off,
3526                        unsigned long len)
3527 {
3528         char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3529         if (dst_page == src_page) {
3530                 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3531         } else {
3532                 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3533                 char *p = dst_kaddr + dst_off + len;
3534                 char *s = src_kaddr + src_off + len;
3535
3536                 while (len--)
3537                         *--p = *--s;
3538
3539                 kunmap_atomic(src_kaddr, KM_USER1);
3540         }
3541         kunmap_atomic(dst_kaddr, KM_USER0);
3542 }
3543
3544 static void copy_pages(struct page *dst_page, struct page *src_page,
3545                        unsigned long dst_off, unsigned long src_off,
3546                        unsigned long len)
3547 {
3548         char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3549         char *src_kaddr;
3550
3551         if (dst_page != src_page)
3552                 src_kaddr = kmap_atomic(src_page, KM_USER1);
3553         else
3554                 src_kaddr = dst_kaddr;
3555
3556         memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3557         kunmap_atomic(dst_kaddr, KM_USER0);
3558         if (dst_page != src_page)
3559                 kunmap_atomic(src_kaddr, KM_USER1);
3560 }
3561
3562 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3563                            unsigned long src_offset, unsigned long len)
3564 {
3565         size_t cur;
3566         size_t dst_off_in_page;
3567         size_t src_off_in_page;
3568         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3569         unsigned long dst_i;
3570         unsigned long src_i;
3571
3572         if (src_offset + len > dst->len) {
3573                 printk(KERN_ERR "memcpy bogus src_offset %lu move len %lu len %lu\n",
3574                        src_offset, len, dst->len);
3575                 BUG_ON(1);
3576         }
3577         if (dst_offset + len > dst->len) {
3578                 printk(KERN_ERR "memcpy bogus dst_offset %lu move len %lu len %lu\n",
3579                        dst_offset, len, dst->len);
3580                 BUG_ON(1);
3581         }
3582
3583         while(len > 0) {
3584                 dst_off_in_page = (start_offset + dst_offset) &
3585                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3586                 src_off_in_page = (start_offset + src_offset) &
3587                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3588
3589                 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3590                 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3591
3592                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3593                                                src_off_in_page));
3594                 cur = min_t(unsigned long, cur,
3595                         (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3596
3597                 copy_pages(extent_buffer_page(dst, dst_i),
3598                            extent_buffer_page(dst, src_i),
3599                            dst_off_in_page, src_off_in_page, cur);
3600
3601                 src_offset += cur;
3602                 dst_offset += cur;
3603                 len -= cur;
3604         }
3605 }
3606 EXPORT_SYMBOL(memcpy_extent_buffer);
3607
3608 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3609                            unsigned long src_offset, unsigned long len)
3610 {
3611         size_t cur;
3612         size_t dst_off_in_page;
3613         size_t src_off_in_page;
3614         unsigned long dst_end = dst_offset + len - 1;
3615         unsigned long src_end = src_offset + len - 1;
3616         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3617         unsigned long dst_i;
3618         unsigned long src_i;
3619
3620         if (src_offset + len > dst->len) {
3621                 printk(KERN_ERR "memmove bogus src_offset %lu move len %lu len %lu\n",
3622                        src_offset, len, dst->len);
3623                 BUG_ON(1);
3624         }
3625         if (dst_offset + len > dst->len) {
3626                 printk(KERN_ERR "memmove bogus dst_offset %lu move len %lu len %lu\n",
3627                        dst_offset, len, dst->len);
3628                 BUG_ON(1);
3629         }
3630         if (dst_offset < src_offset) {
3631                 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3632                 return;
3633         }
3634         while(len > 0) {
3635                 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3636                 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3637
3638                 dst_off_in_page = (start_offset + dst_end) &
3639                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3640                 src_off_in_page = (start_offset + src_end) &
3641                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3642
3643                 cur = min_t(unsigned long, len, src_off_in_page + 1);
3644                 cur = min(cur, dst_off_in_page + 1);
3645                 move_pages(extent_buffer_page(dst, dst_i),
3646                            extent_buffer_page(dst, src_i),
3647                            dst_off_in_page - cur + 1,
3648                            src_off_in_page - cur + 1, cur);
3649
3650                 dst_end -= cur;
3651                 src_end -= cur;
3652                 len -= cur;
3653         }
3654 }
3655 EXPORT_SYMBOL(memmove_extent_buffer);
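
     /*
      * Example (a hedged sketch): making room inside a buffer.  Because the
      * source and destination ranges overlap, memmove_extent_buffer (which
      * copies backwards when dst > src) must be used rather than
      * memcpy_extent_buffer.  The caller is responsible for keeping
      * offset + gap + tail_len within eb->len.
      */
     static void example_insert_gap(struct extent_buffer *eb, unsigned long offset,
                                    unsigned long gap, unsigned long tail_len)
     {
             /* shift tail_len bytes at "offset" up by "gap" bytes */
             memmove_extent_buffer(eb, offset + gap, offset, tail_len);
     }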
3656
3657 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3658 {
3659         u64 start = page_offset(page);
3660         struct extent_buffer *eb;
3661         int ret = 1;
3662         unsigned long i;
3663         unsigned long num_pages;
3664
3665         spin_lock(&tree->buffer_lock);
3666         eb = buffer_search(tree, start);
3667         if (!eb)
3668                 goto out;
3669
3670         if (atomic_read(&eb->refs) > 1) {
3671                 ret = 0;
3672                 goto out;
3673         }
3674         /* at this point we can safely release the extent buffer */
3675         num_pages = num_extent_pages(eb->start, eb->len);
3676         for (i = 0; i < num_pages; i++)
3677                 page_cache_release(extent_buffer_page(eb, i));
3678         rb_erase(&eb->rb_node, &tree->buffer);
3679         __free_extent_buffer(eb);
3680 out:
3681         spin_unlock(&tree->buffer_lock);
3682         return ret;
3683 }
3684 EXPORT_SYMBOL(try_release_extent_buffer);
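
     /*
      * Example (a hedged sketch): a metadata ->releasepage hook can use this to
      * let the VM reclaim btree pages once their extent_buffer is no longer
      * referenced.  Where the extent_io_tree pointer comes from is left to the
      * caller; it is passed in explicitly here only to keep the sketch
      * self-contained.
      */
     static int example_btree_releasepage(struct extent_io_tree *tree,
                                          struct page *page)
     {
             if (PageWriteback(page) || PageDirty(page))
                     return 0;
             return try_release_extent_buffer(tree, page);
     }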