1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 02111-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "aops.h"
39 #include "dlmglue.h"
40 #include "extent_map.h"
41 #include "inode.h"
42 #include "journal.h"
43 #include "localalloc.h"
44 #include "suballoc.h"
45 #include "sysfile.h"
46 #include "file.h"
47 #include "super.h"
48 #include "uptodate.h"
49
50 #include "buffer_head_io.h"
51
52 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
53 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
54                                          struct ocfs2_extent_block *eb);
55
56 /*
57  * Structures which describe a path through a btree, and functions to
58  * manipulate them.
59  *
60  * The idea here is to be as generic as possible with the tree
61  * manipulation code.
62  */
63 struct ocfs2_path_item {
64         struct buffer_head              *bh;
65         struct ocfs2_extent_list        *el;
66 };
67
68 #define OCFS2_MAX_PATH_DEPTH    5
69
70 struct ocfs2_path {
71         int                     p_tree_depth;
72         struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
73 };
74
75 #define path_root_bh(_path) ((_path)->p_node[0].bh)
76 #define path_root_el(_path) ((_path)->p_node[0].el)
77 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
78 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
79 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
80
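/*
 * Illustrative sketch (example only; every identifier is defined in
 * this file): p_node[0] always holds the root extent list and
 * p_node[p_tree_depth] holds the leaf, so a full path can be walked
 * with a plain loop:
 *
 *	int i;
 *
 *	for (i = 0; i < path_num_items(path); i++) {
 *		struct buffer_head *bh = path->p_node[i].bh;
 *		struct ocfs2_extent_list *el = path->p_node[i].el;
 *		...
 *	}
 *
 * which is the same pattern ocfs2_journal_access_path() below uses.
 */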
81 /*
82  * Reset the actual path elements so that we can re-use the structure
83  * to build another path. Generally, this involves freeing the buffer
84  * heads.
85  */
86 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
87 {
88         int i, start = 0, depth = 0;
89         struct ocfs2_path_item *node;
90
91         if (keep_root)
92                 start = 1;
93
94         for(i = start; i < path_num_items(path); i++) {
95                 node = &path->p_node[i];
96
97                 brelse(node->bh);
98                 node->bh = NULL;
99                 node->el = NULL;
100         }
101
102         /*
103          * Tree depth may change during truncate, or insert. If we're
104          * keeping the root extent list, then make sure that our path
105          * structure reflects the proper depth.
106          */
107         if (keep_root)
108                 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
109
110         path->p_tree_depth = depth;
111 }
112
113 static void ocfs2_free_path(struct ocfs2_path *path)
114 {
115         if (path) {
116                 ocfs2_reinit_path(path, 0);
117                 kfree(path);
118         }
119 }
120
121 /*
122  * Copy all the elements of src into dest. After this call, src could be freed
123  * without affecting dest.
124  *
125  * Both paths should have the same root. Any non-root elements of dest
126  * will be freed.
127  */
128 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
129 {
130         int i;
131
132         BUG_ON(path_root_bh(dest) != path_root_bh(src));
133         BUG_ON(path_root_el(dest) != path_root_el(src));
134
135         ocfs2_reinit_path(dest, 1);
136
137         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
138                 dest->p_node[i].bh = src->p_node[i].bh;
139                 dest->p_node[i].el = src->p_node[i].el;
140
141                 if (dest->p_node[i].bh)
142                         get_bh(dest->p_node[i].bh);
143         }
144 }
145
146 /*
147  * Make the *dest path the same as src and re-initialize src path to
148  * have a root only.
149  */
150 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
151 {
152         int i;
153
154         BUG_ON(path_root_bh(dest) != path_root_bh(src));
155
156         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
157                 brelse(dest->p_node[i].bh);
158
159                 dest->p_node[i].bh = src->p_node[i].bh;
160                 dest->p_node[i].el = src->p_node[i].el;
161
162                 src->p_node[i].bh = NULL;
163                 src->p_node[i].el = NULL;
164         }
165 }
166
167 /*
168  * Insert an extent block at the given index.
169  *
170  * This will not take an additional reference on eb_bh.
171  */
172 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
173                                         struct buffer_head *eb_bh)
174 {
175         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
176
177         /*
178          * Right now, no root bh is an extent block, so this helps
179          * catch code errors with dinode trees. The assertion can be
180          * safely removed if we ever need to insert extent block
181          * structures at the root.
182          */
183         BUG_ON(index == 0);
184
185         path->p_node[index].bh = eb_bh;
186         path->p_node[index].el = &eb->h_list;
187 }
188
189 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
190                                          struct ocfs2_extent_list *root_el)
191 {
192         struct ocfs2_path *path;
193
194         BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
195
196         path = kzalloc(sizeof(*path), GFP_NOFS);
197         if (path) {
198                 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
199                 get_bh(root_bh);
200                 path_root_bh(path) = root_bh;
201                 path_root_el(path) = root_el;
202         }
203
204         return path;
205 }
206
207 /*
208  * Allocate and initialize a new path based on a disk inode tree.
209  */
210 static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
211 {
212         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
213         struct ocfs2_extent_list *el = &di->id2.i_list;
214
215         return ocfs2_new_path(di_bh, el);
216 }
217
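/*
 * Typical path lifecycle, as an illustrative sketch (error handling
 * trimmed; ocfs2_find_path() is defined further down in this file):
 *
 *	struct ocfs2_path *path;
 *
 *	path = ocfs2_new_inode_path(di_bh);
 *	if (!path)
 *		return -ENOMEM;
 *
 *	ret = ocfs2_find_path(inode, path, cpos);
 *	if (!ret) {
 *		struct ocfs2_extent_list *el = path_leaf_el(path);
 *		... read or modify the leaf records ...
 *	}
 *
 *	ocfs2_free_path(path);
 */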
218 /*
219  * Convenience function to journal all components in a path.
220  */
221 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
222                                      struct ocfs2_path *path)
223 {
224         int i, ret = 0;
225
226         if (!path)
227                 goto out;
228
229         for(i = 0; i < path_num_items(path); i++) {
230                 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
231                                            OCFS2_JOURNAL_ACCESS_WRITE);
232                 if (ret < 0) {
233                         mlog_errno(ret);
234                         goto out;
235                 }
236         }
237
238 out:
239         return ret;
240 }
241
242 /*
243  * Return the index of the extent record which contains cluster #v_cluster.
244  * -1 is returned if it was not found.
245  *
246  * Should work fine on interior and exterior nodes.
247  */
248 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
249 {
250         int ret = -1;
251         int i;
252         struct ocfs2_extent_rec *rec;
253         u32 rec_end, rec_start, clusters;
254
255         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
256                 rec = &el->l_recs[i];
257
258                 rec_start = le32_to_cpu(rec->e_cpos);
259                 clusters = ocfs2_rec_clusters(el, rec);
260
261                 rec_end = rec_start + clusters;
262
263                 if (v_cluster >= rec_start && v_cluster < rec_end) {
264                         ret = i;
265                         break;
266                 }
267         }
268
269         return ret;
270 }
271
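/*
 * Example use of ocfs2_search_extent_list() (illustrative sketch; the
 * helpers are the same ones used elsewhere in this file): translating
 * a virtual cluster within a leaf list into a physical block:
 *
 *	int idx = ocfs2_search_extent_list(el, v_cluster);
 *	if (idx != -1) {
 *		struct ocfs2_extent_rec *rec = &el->l_recs[idx];
 *		u32 coff = v_cluster - le32_to_cpu(rec->e_cpos);
 *		u64 p_blkno = le64_to_cpu(rec->e_blkno) +
 *			ocfs2_clusters_to_blocks(inode->i_sb, coff);
 *		...
 *	}
 */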
272 enum ocfs2_contig_type {
273         CONTIG_NONE = 0,
274         CONTIG_LEFT,
275         CONTIG_RIGHT,
276         CONTIG_LEFTRIGHT,
277 };
278
279
280 /*
281  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
282  * ocfs2_extent_contig() only work properly against leaf nodes!
283  */
284 static int ocfs2_block_extent_contig(struct super_block *sb,
285                                      struct ocfs2_extent_rec *ext,
286                                      u64 blkno)
287 {
288         u64 blk_end = le64_to_cpu(ext->e_blkno);
289
290         blk_end += ocfs2_clusters_to_blocks(sb,
291                                     le16_to_cpu(ext->e_leaf_clusters));
292
293         return blkno == blk_end;
294 }
295
296 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
297                                   struct ocfs2_extent_rec *right)
298 {
299         u32 left_range;
300
301         left_range = le32_to_cpu(left->e_cpos) +
302                 le16_to_cpu(left->e_leaf_clusters);
303
304         return (left_range == le32_to_cpu(right->e_cpos));
305 }
306
307 static enum ocfs2_contig_type
308         ocfs2_extent_contig(struct inode *inode,
309                             struct ocfs2_extent_rec *ext,
310                             struct ocfs2_extent_rec *insert_rec)
311 {
312         u64 blkno = le64_to_cpu(insert_rec->e_blkno);
313
314         /*
315          * Refuse to coalesce extent records with different flag
316          * fields - we don't want to mix unwritten extents with user
317          * data.
318          */
319         if (ext->e_flags != insert_rec->e_flags)
320                 return CONTIG_NONE;
321
322         if (ocfs2_extents_adjacent(ext, insert_rec) &&
323             ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
324                         return CONTIG_RIGHT;
325
326         blkno = le64_to_cpu(ext->e_blkno);
327         if (ocfs2_extents_adjacent(insert_rec, ext) &&
328             ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
329                 return CONTIG_LEFT;
330
331         return CONTIG_NONE;
332 }
333
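/*
 * Worked example of the contiguity rules above (illustrative numbers,
 * assuming 8 blocks per cluster): an existing leaf record
 * { e_cpos = 10, e_leaf_clusters = 2, e_blkno = 800 } and an insert
 * record { e_cpos = 12, e_blkno = 816 } with matching e_flags are
 * CONTIG_RIGHT - the logical ranges touch (10 + 2 == 12) and so do the
 * physical ranges (800 + 2 * 8 == 816). If either check fails, or the
 * flags differ, the result is CONTIG_NONE.
 */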
334 /*
335  * NOTE: We can have pretty much any combination of contiguousness and
336  * appending.
337  *
338  * The usefulness of APPEND_TAIL is more in that it lets us know that
339  * we'll have to update the path to that leaf.
340  */
341 enum ocfs2_append_type {
342         APPEND_NONE = 0,
343         APPEND_TAIL,
344 };
345
346 enum ocfs2_split_type {
347         SPLIT_NONE = 0,
348         SPLIT_LEFT,
349         SPLIT_RIGHT,
350 };
351
352 struct ocfs2_insert_type {
353         enum ocfs2_split_type   ins_split;
354         enum ocfs2_append_type  ins_appending;
355         enum ocfs2_contig_type  ins_contig;
356         int                     ins_contig_index;
357         int                     ins_tree_depth;
358 };
359
360 struct ocfs2_merge_ctxt {
361         enum ocfs2_contig_type  c_contig_type;
362         int                     c_has_empty_extent;
363         int                     c_split_covers_rec;
364 };
365
366 /*
367  * How many free extents have we got before we need more metadata?
368  */
369 int ocfs2_num_free_extents(struct ocfs2_super *osb,
370                            struct inode *inode,
371                            struct buffer_head *bh)
372 {
373         int retval;
374         struct ocfs2_extent_list *el;
375         struct ocfs2_extent_block *eb;
376         struct buffer_head *eb_bh = NULL;
377         struct ocfs2_dinode *fe = (struct ocfs2_dinode *)bh->b_data;
378
379         mlog_entry_void();
380
381         if (!OCFS2_IS_VALID_DINODE(fe)) {
382                 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
383                 retval = -EIO;
384                 goto bail;
385         }
386
387         if (fe->i_last_eb_blk) {
388                 retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
389                                           &eb_bh, OCFS2_BH_CACHED, inode);
390                 if (retval < 0) {
391                         mlog_errno(retval);
392                         goto bail;
393                 }
394                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
395                 el = &eb->h_list;
396         } else
397                 el = &fe->id2.i_list;
398
399         BUG_ON(el->l_tree_depth != 0);
400
401         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
402 bail:
403         if (eb_bh)
404                 brelse(eb_bh);
405
406         mlog_exit(retval);
407         return retval;
408 }
409
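/*
 * Example (illustrative numbers): with a rightmost leaf where
 * l_count == 16 and l_next_free_rec == 14, ocfs2_num_free_extents()
 * returns 2 - two more extent records fit before the tree has to grow
 * and more metadata is needed.
 */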
410 /* expects array to already be allocated
411  *
412  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
413  * l_count for you
414  */
415 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
416                                      handle_t *handle,
417                                      struct inode *inode,
418                                      int wanted,
419                                      struct ocfs2_alloc_context *meta_ac,
420                                      struct buffer_head *bhs[])
421 {
422         int count, status, i;
423         u16 suballoc_bit_start;
424         u32 num_got;
425         u64 first_blkno;
426         struct ocfs2_extent_block *eb;
427
428         mlog_entry_void();
429
430         count = 0;
431         while (count < wanted) {
432                 status = ocfs2_claim_metadata(osb,
433                                               handle,
434                                               meta_ac,
435                                               wanted - count,
436                                               &suballoc_bit_start,
437                                               &num_got,
438                                               &first_blkno);
439                 if (status < 0) {
440                         mlog_errno(status);
441                         goto bail;
442                 }
443
444                 for(i = count;  i < (num_got + count); i++) {
445                         bhs[i] = sb_getblk(osb->sb, first_blkno);
446                         if (bhs[i] == NULL) {
447                                 status = -EIO;
448                                 mlog_errno(status);
449                                 goto bail;
450                         }
451                         ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
452
453                         status = ocfs2_journal_access(handle, inode, bhs[i],
454                                                       OCFS2_JOURNAL_ACCESS_CREATE);
455                         if (status < 0) {
456                                 mlog_errno(status);
457                                 goto bail;
458                         }
459
460                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
461                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
462                         /* Ok, setup the minimal stuff here. */
463                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
464                         eb->h_blkno = cpu_to_le64(first_blkno);
465                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
466                         eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
467                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
468                         eb->h_list.l_count =
469                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
470
471                         suballoc_bit_start++;
472                         first_blkno++;
473
474                         /* We'll also be dirtied by the caller, so
475                          * this isn't absolutely necessary. */
476                         status = ocfs2_journal_dirty(handle, bhs[i]);
477                         if (status < 0) {
478                                 mlog_errno(status);
479                                 goto bail;
480                         }
481                 }
482
483                 count += num_got;
484         }
485
486         status = 0;
487 bail:
488         if (status < 0) {
489                 for(i = 0; i < wanted; i++) {
490                         if (bhs[i])
491                                 brelse(bhs[i]);
492                         bhs[i] = NULL;
493                 }
494         }
495         mlog_exit(status);
496         return status;
497 }
498
499 /*
500  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
501  *
502  * Returns the sum of the rightmost extent rec logical offset and
503  * cluster count.
504  *
505  * ocfs2_add_branch() uses this to determine what logical cluster
506  * value should be populated into the leftmost new branch records.
507  *
508  * ocfs2_shift_tree_depth() uses this to determine the # clusters
509  * value for the new topmost tree record.
510  */
511 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
512 {
513         int i;
514
515         i = le16_to_cpu(el->l_next_free_rec) - 1;
516
517         return le32_to_cpu(el->l_recs[i].e_cpos) +
518                 ocfs2_rec_clusters(el, &el->l_recs[i]);
519 }
520
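/*
 * Example (illustrative numbers): if the rightmost record of 'el' is
 * { e_cpos = 100, clusters = 8 }, ocfs2_sum_rightmost_rec() returns
 * 108 - the first logical cluster past the current tree, which is the
 * cpos that new (empty) branch records and new topmost records start
 * from.
 */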
521 /*
522  * Add an entire tree branch to our inode. eb_bh is the extent block
523  * to start at, if we don't want to start the branch at the dinode
524  * structure.
525  *
526  * last_eb_bh is required as we have to update its next_leaf pointer
527  * for the new last extent block.
528  *
529  * The new branch will be 'empty' in the sense that every block will
530  * contain a single record with cluster count == 0.
531  */
532 static int ocfs2_add_branch(struct ocfs2_super *osb,
533                             handle_t *handle,
534                             struct inode *inode,
535                             struct buffer_head *fe_bh,
536                             struct buffer_head *eb_bh,
537                             struct buffer_head **last_eb_bh,
538                             struct ocfs2_alloc_context *meta_ac)
539 {
540         int status, new_blocks, i;
541         u64 next_blkno, new_last_eb_blk;
542         struct buffer_head *bh;
543         struct buffer_head **new_eb_bhs = NULL;
544         struct ocfs2_dinode *fe;
545         struct ocfs2_extent_block *eb;
546         struct ocfs2_extent_list  *eb_el;
547         struct ocfs2_extent_list  *el;
548         u32 new_cpos;
549
550         mlog_entry_void();
551
552         BUG_ON(!last_eb_bh || !*last_eb_bh);
553
554         fe = (struct ocfs2_dinode *) fe_bh->b_data;
555
556         if (eb_bh) {
557                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
558                 el = &eb->h_list;
559         } else
560                 el = &fe->id2.i_list;
561
562         /* we never add a branch to a leaf. */
563         BUG_ON(!el->l_tree_depth);
564
565         new_blocks = le16_to_cpu(el->l_tree_depth);
566
567         /* allocate the number of new eb blocks we need */
568         new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
569                              GFP_KERNEL);
570         if (!new_eb_bhs) {
571                 status = -ENOMEM;
572                 mlog_errno(status);
573                 goto bail;
574         }
575
576         status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
577                                            meta_ac, new_eb_bhs);
578         if (status < 0) {
579                 mlog_errno(status);
580                 goto bail;
581         }
582
583         eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
584         new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
585
586         /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
587          * linked with the rest of the tree.
588          * Conversely, new_eb_bhs[0] is the new bottommost leaf.
589          *
590          * When we leave the loop, new_last_eb_blk will point to the
591          * newest leaf, and next_blkno will point to the topmost extent
592          * block. */
593         next_blkno = new_last_eb_blk = 0;
594         for(i = 0; i < new_blocks; i++) {
595                 bh = new_eb_bhs[i];
596                 eb = (struct ocfs2_extent_block *) bh->b_data;
597                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
598                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
599                         status = -EIO;
600                         goto bail;
601                 }
602                 eb_el = &eb->h_list;
603
604                 status = ocfs2_journal_access(handle, inode, bh,
605                                               OCFS2_JOURNAL_ACCESS_CREATE);
606                 if (status < 0) {
607                         mlog_errno(status);
608                         goto bail;
609                 }
610
611                 eb->h_next_leaf_blk = 0;
612                 eb_el->l_tree_depth = cpu_to_le16(i);
613                 eb_el->l_next_free_rec = cpu_to_le16(1);
614                 /*
615                   * This actually counts as an empty extent as
616                   * the cluster count == 0
617                  */
618                 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
619                 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
620                 /*
621                  * eb_el isn't always an interior node, but even leaf
622                  * nodes want the flags and reserved fields zeroed, so
623                  * this gets the whole 32 bits regardless of use.
624                  */
625                 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
626                 if (!eb_el->l_tree_depth)
627                         new_last_eb_blk = le64_to_cpu(eb->h_blkno);
628
629                 status = ocfs2_journal_dirty(handle, bh);
630                 if (status < 0) {
631                         mlog_errno(status);
632                         goto bail;
633                 }
634
635                 next_blkno = le64_to_cpu(eb->h_blkno);
636         }
637
638         /* This is a bit hairy. We want to update up to three blocks
639          * here without leaving any of them in an inconsistent state
640          * in case of error. We don't have to worry about
641          * journal_dirty failing, as it won't fail unless we've aborted
642          * the handle (in which case we would never be here), so
643          * reserving the write with journal_access is all we need to do. */
644         status = ocfs2_journal_access(handle, inode, *last_eb_bh,
645                                       OCFS2_JOURNAL_ACCESS_WRITE);
646         if (status < 0) {
647                 mlog_errno(status);
648                 goto bail;
649         }
650         status = ocfs2_journal_access(handle, inode, fe_bh,
651                                       OCFS2_JOURNAL_ACCESS_WRITE);
652         if (status < 0) {
653                 mlog_errno(status);
654                 goto bail;
655         }
656         if (eb_bh) {
657                 status = ocfs2_journal_access(handle, inode, eb_bh,
658                                               OCFS2_JOURNAL_ACCESS_WRITE);
659                 if (status < 0) {
660                         mlog_errno(status);
661                         goto bail;
662                 }
663         }
664
665         /* Link the new branch into the rest of the tree (el will
666          * either be on the fe, or the extent block passed in). */
667         i = le16_to_cpu(el->l_next_free_rec);
668         el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
669         el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
670         el->l_recs[i].e_int_clusters = 0;
671         le16_add_cpu(&el->l_next_free_rec, 1);
672
673         /* fe needs a new last extent block pointer, as does the
674          * next_leaf on the previously last-extent-block. */
675         fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
676
677         eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
678         eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
679
680         status = ocfs2_journal_dirty(handle, *last_eb_bh);
681         if (status < 0)
682                 mlog_errno(status);
683         status = ocfs2_journal_dirty(handle, fe_bh);
684         if (status < 0)
685                 mlog_errno(status);
686         if (eb_bh) {
687                 status = ocfs2_journal_dirty(handle, eb_bh);
688                 if (status < 0)
689                         mlog_errno(status);
690         }
691
692         /*
693          * Some callers want to track the rightmost leaf so pass it
694          * back here.
695          */
696         brelse(*last_eb_bh);
697         get_bh(new_eb_bhs[0]);
698         *last_eb_bh = new_eb_bhs[0];
699
700         status = 0;
701 bail:
702         if (new_eb_bhs) {
703                 for (i = 0; i < new_blocks; i++)
704                         if (new_eb_bhs[i])
705                                 brelse(new_eb_bhs[i]);
706                 kfree(new_eb_bhs);
707         }
708
709         mlog_exit(status);
710         return status;
711 }
712
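/*
 * Shape of the result, as an illustrative sketch for a tree of depth 2
 * (el is the list ocfs2_add_branch() was pointed at):
 *
 *	el (depth 2) --> new eb (depth 1) --> new eb (depth 0, leaf)
 *
 * Each new block carries a single record with e_cpos == new_cpos and a
 * cluster count of 0, so the branch stays "empty" until an insert
 * lands in the new leaf. fe->i_last_eb_blk and the old rightmost
 * leaf's h_next_leaf_blk both end up pointing at the new leaf.
 */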
713 /*
714  * Adds another level to the allocation tree.
715  * Returns the new extent block so you can add a branch to it
716  * after this call.
717  */
718 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
719                                   handle_t *handle,
720                                   struct inode *inode,
721                                   struct buffer_head *fe_bh,
722                                   struct ocfs2_alloc_context *meta_ac,
723                                   struct buffer_head **ret_new_eb_bh)
724 {
725         int status, i;
726         u32 new_clusters;
727         struct buffer_head *new_eb_bh = NULL;
728         struct ocfs2_dinode *fe;
729         struct ocfs2_extent_block *eb;
730         struct ocfs2_extent_list  *fe_el;
731         struct ocfs2_extent_list  *eb_el;
732
733         mlog_entry_void();
734
735         status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
736                                            &new_eb_bh);
737         if (status < 0) {
738                 mlog_errno(status);
739                 goto bail;
740         }
741
742         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
743         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
744                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
745                 status = -EIO;
746                 goto bail;
747         }
748
749         eb_el = &eb->h_list;
750         fe = (struct ocfs2_dinode *) fe_bh->b_data;
751         fe_el = &fe->id2.i_list;
752
753         status = ocfs2_journal_access(handle, inode, new_eb_bh,
754                                       OCFS2_JOURNAL_ACCESS_CREATE);
755         if (status < 0) {
756                 mlog_errno(status);
757                 goto bail;
758         }
759
760         /* copy the fe data into the new extent block */
761         eb_el->l_tree_depth = fe_el->l_tree_depth;
762         eb_el->l_next_free_rec = fe_el->l_next_free_rec;
763         for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
764                 eb_el->l_recs[i] = fe_el->l_recs[i];
765
766         status = ocfs2_journal_dirty(handle, new_eb_bh);
767         if (status < 0) {
768                 mlog_errno(status);
769                 goto bail;
770         }
771
772         status = ocfs2_journal_access(handle, inode, fe_bh,
773                                       OCFS2_JOURNAL_ACCESS_WRITE);
774         if (status < 0) {
775                 mlog_errno(status);
776                 goto bail;
777         }
778
779         new_clusters = ocfs2_sum_rightmost_rec(eb_el);
780
781         /* update fe now */
782         le16_add_cpu(&fe_el->l_tree_depth, 1);
783         fe_el->l_recs[0].e_cpos = 0;
784         fe_el->l_recs[0].e_blkno = eb->h_blkno;
785         fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
786         for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
787                 memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
788         fe_el->l_next_free_rec = cpu_to_le16(1);
789
790         /* If this is our 1st tree depth shift, then last_eb_blk
791          * becomes the allocated extent block */
792         if (fe_el->l_tree_depth == cpu_to_le16(1))
793                 fe->i_last_eb_blk = eb->h_blkno;
794
795         status = ocfs2_journal_dirty(handle, fe_bh);
796         if (status < 0) {
797                 mlog_errno(status);
798                 goto bail;
799         }
800
801         *ret_new_eb_bh = new_eb_bh;
802         new_eb_bh = NULL;
803         status = 0;
804 bail:
805         if (new_eb_bh)
806                 brelse(new_eb_bh);
807
808         mlog_exit(status);
809         return status;
810 }
811
812 /*
813  * Should only be called when there is no space left in any of the
814  * leaf nodes. What we want to do is find the lowest tree depth
815  * non-leaf extent block with room for new records. There are three
816  * valid results of this search:
817  *
818  * 1) a lowest extent block is found, then we pass it back in
819  *    *target_bh and return '0'
820  *
821  * 2) the search fails to find anything, but the dinode has room. We
822  *    pass NULL back in *target_bh, but still return '0'
823  *
824  * 3) the search fails to find anything AND the dinode is full, in
825  *    which case we return > 0
826  *
827  * return status < 0 indicates an error.
828  */
829 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
830                                     struct inode *inode,
831                                     struct buffer_head *fe_bh,
832                                     struct buffer_head **target_bh)
833 {
834         int status = 0, i;
835         u64 blkno;
836         struct ocfs2_dinode *fe;
837         struct ocfs2_extent_block *eb;
838         struct ocfs2_extent_list  *el;
839         struct buffer_head *bh = NULL;
840         struct buffer_head *lowest_bh = NULL;
841
842         mlog_entry_void();
843
844         *target_bh = NULL;
845
846         fe = (struct ocfs2_dinode *) fe_bh->b_data;
847         el = &fe->id2.i_list;
848
849         while(le16_to_cpu(el->l_tree_depth) > 1) {
850                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
851                         ocfs2_error(inode->i_sb, "Dinode %llu has empty "
852                                     "extent list (next_free_rec == 0)",
853                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
854                         status = -EIO;
855                         goto bail;
856                 }
857                 i = le16_to_cpu(el->l_next_free_rec) - 1;
858                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
859                 if (!blkno) {
860                         ocfs2_error(inode->i_sb, "Dinode %llu has extent "
861                                     "list where extent # %d has no physical "
862                                     "block start",
863                                     (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
864                         status = -EIO;
865                         goto bail;
866                 }
867
868                 if (bh) {
869                         brelse(bh);
870                         bh = NULL;
871                 }
872
873                 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
874                                           inode);
875                 if (status < 0) {
876                         mlog_errno(status);
877                         goto bail;
878                 }
879
880                 eb = (struct ocfs2_extent_block *) bh->b_data;
881                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
882                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
883                         status = -EIO;
884                         goto bail;
885                 }
886                 el = &eb->h_list;
887
888                 if (le16_to_cpu(el->l_next_free_rec) <
889                     le16_to_cpu(el->l_count)) {
890                         if (lowest_bh)
891                                 brelse(lowest_bh);
892                         lowest_bh = bh;
893                         get_bh(lowest_bh);
894                 }
895         }
896
897         /* If we didn't find one and the fe doesn't have any room,
898          * then return '1' */
899         if (!lowest_bh
900             && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
901                 status = 1;
902
903         *target_bh = lowest_bh;
904 bail:
905         if (bh)
906                 brelse(bh);
907
908         mlog_exit(status);
909         return status;
910 }
911
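/*
 * Example of interpreting the result (illustrative sketch; compare
 * with ocfs2_grow_tree() below):
 *
 *	shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
 *	if (shift < 0)
 *		... error ...
 *	else if (shift > 0)
 *		... everything is full: shift the tree depth first ...
 *	else if (bh)
 *		... add the branch under the block returned in bh ...
 *	else
 *		... no extent block has room, but the dinode list does ...
 */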
912 /*
913  * Grow a b-tree so that it has more records.
914  *
915  * We might shift the tree depth in which case existing paths should
916  * be considered invalid.
917  *
918  * Tree depth after the grow is returned via *final_depth.
919  *
920  * *last_eb_bh will be updated by ocfs2_add_branch().
921  */
922 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
923                            struct buffer_head *di_bh, int *final_depth,
924                            struct buffer_head **last_eb_bh,
925                            struct ocfs2_alloc_context *meta_ac)
926 {
927         int ret, shift;
928         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
929         int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
930         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
931         struct buffer_head *bh = NULL;
932
933         BUG_ON(meta_ac == NULL);
934
935         shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
936         if (shift < 0) {
937                 ret = shift;
938                 mlog_errno(ret);
939                 goto out;
940         }
941
942         /* We traveled all the way to the bottom of the allocation tree
943          * and didn't find room for any more extents - we need to add
944          * another tree level */
945         if (shift) {
946                 BUG_ON(bh);
947                 mlog(0, "need to shift tree depth (current = %d)\n", depth);
948
949                 /* ocfs2_shift_tree_depth will return us a buffer with
950                  * the new extent block (so we can pass that to
951                  * ocfs2_add_branch). */
952                 ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
953                                              meta_ac, &bh);
954                 if (ret < 0) {
955                         mlog_errno(ret);
956                         goto out;
957                 }
958                 depth++;
959                 if (depth == 1) {
960                         /*
961                          * Special case: we have room now if we shifted from
962                          * tree_depth 0, so no more work needs to be done.
963                          *
964                          * We won't be calling add_branch, so pass
965                          * back *last_eb_bh as the new leaf. At depth
966                          * zero, it should always be null so there's
967                          * no reason to brelse.
968                          */
969                         BUG_ON(*last_eb_bh);
970                         get_bh(bh);
971                         *last_eb_bh = bh;
972                         goto out;
973                 }
974         }
975
976         /* call ocfs2_add_branch to add the final part of the tree with
977          * the new data. */
978         mlog(0, "add branch. bh = %p\n", bh);
979         ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
980                                meta_ac);
981         if (ret < 0) {
982                 mlog_errno(ret);
983                 goto out;
984         }
985
986 out:
987         if (final_depth)
988                 *final_depth = depth;
989         brelse(bh);
990         return ret;
991 }
992
993 /*
994  * Shift every record one slot to the right, making room at index 0.
995  */
996 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
997 {
998         int next_free = le16_to_cpu(el->l_next_free_rec);
999         int count = le16_to_cpu(el->l_count);
1000         unsigned int num_bytes;
1001
1002         BUG_ON(!next_free);
1003         /* This will cause us to go off the end of our extent list. */
1004         BUG_ON(next_free >= count);
1005
1006         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1007
1008         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1009 }
1010
1011 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1012                               struct ocfs2_extent_rec *insert_rec)
1013 {
1014         int i, insert_index, next_free, has_empty, num_bytes;
1015         u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1016         struct ocfs2_extent_rec *rec;
1017
1018         next_free = le16_to_cpu(el->l_next_free_rec);
1019         has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1020
1021         BUG_ON(!next_free);
1022
1023         /* The tree code before us didn't allow enough room in the leaf. */
1024         BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1025
1026         /*
1027          * The easiest way to approach this is to just remove the
1028          * empty extent and temporarily decrement next_free.
1029          */
1030         if (has_empty) {
1031                 /*
1032                  * If next_free was 1 (only an empty extent), this
1033                  * loop won't execute, which is fine. We still want
1034                  * the decrement below to happen.
1035                  */
1036                 for(i = 0; i < (next_free - 1); i++)
1037                         el->l_recs[i] = el->l_recs[i+1];
1038
1039                 next_free--;
1040         }
1041
1042         /*
1043          * Figure out what the new record index should be.
1044          */
1045         for(i = 0; i < next_free; i++) {
1046                 rec = &el->l_recs[i];
1047
1048                 if (insert_cpos < le32_to_cpu(rec->e_cpos))
1049                         break;
1050         }
1051         insert_index = i;
1052
1053         mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
1054              insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
1055
1056         BUG_ON(insert_index < 0);
1057         BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1058         BUG_ON(insert_index > next_free);
1059
1060         /*
1061          * No need to memmove if we're just adding to the tail.
1062          */
1063         if (insert_index != next_free) {
1064                 BUG_ON(next_free >= le16_to_cpu(el->l_count));
1065
1066                 num_bytes = next_free - insert_index;
1067                 num_bytes *= sizeof(struct ocfs2_extent_rec);
1068                 memmove(&el->l_recs[insert_index + 1],
1069                         &el->l_recs[insert_index],
1070                         num_bytes);
1071         }
1072
1073         /*
1074          * Either we had an empty extent and need to re-increment, or
1075          * there was no empty extent on a non-full rightmost leaf node,
1076          * in which case we still need to increment.
1077          */
1078         next_free++;
1079         el->l_next_free_rec = cpu_to_le16(next_free);
1080         /*
1081          * Make sure none of the math above just messed up our tree.
1082          */
1083         BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1084
1085         el->l_recs[insert_index] = *insert_rec;
1086
1087 }
1088
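/*
 * Worked example for ocfs2_rotate_leaf() (illustrative): a leaf that
 * looks like
 *
 *	[ empty ] [ cpos 8 ] [ cpos 16 ]	l_next_free_rec = 3
 *
 * receiving an insert record with cpos 12 becomes
 *
 *	[ cpos 8 ] [ cpos 12 ] [ cpos 16 ]	l_next_free_rec = 3
 *
 * The empty extent is consumed, the records to the right of the insert
 * point are shifted over, and l_next_free_rec ends up unchanged.
 */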
1089 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1090 {
1091         int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1092
1093         BUG_ON(num_recs == 0);
1094
1095         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1096                 num_recs--;
1097                 size = num_recs * sizeof(struct ocfs2_extent_rec);
1098                 memmove(&el->l_recs[0], &el->l_recs[1], size);
1099                 memset(&el->l_recs[num_recs], 0,
1100                        sizeof(struct ocfs2_extent_rec));
1101                 el->l_next_free_rec = cpu_to_le16(num_recs);
1102         }
1103 }
1104
1105 /*
1106  * Create an empty extent record.
1107  *
1108  * l_next_free_rec may be updated.
1109  *
1110  * If an empty extent already exists do nothing.
1111  */
1112 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1113 {
1114         int next_free = le16_to_cpu(el->l_next_free_rec);
1115
1116         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1117
1118         if (next_free == 0)
1119                 goto set_and_inc;
1120
1121         if (ocfs2_is_empty_extent(&el->l_recs[0]))
1122                 return;
1123
1124         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1125                         "Asked to create an empty extent in a full list:\n"
1126                         "count = %u, tree depth = %u",
1127                         le16_to_cpu(el->l_count),
1128                         le16_to_cpu(el->l_tree_depth));
1129
1130         ocfs2_shift_records_right(el);
1131
1132 set_and_inc:
1133         le16_add_cpu(&el->l_next_free_rec, 1);
1134         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1135 }
1136
1137 /*
1138  * For a rotation which involves two leaf nodes, the "root node" is
1139  * the lowest level tree node which contains a path to both leaves. The
1140  * resulting set of information can be used to form a complete "subtree".
1141  *
1142  * This function is passed two full paths from the dinode down to a
1143  * pair of adjacent leaves. Its task is to figure out which path
1144  * index contains the subtree root - this can be the root index itself
1145  * in a worst-case rotation.
1146  *
1147  * The array index of the subtree root is passed back.
1148  */
1149 static int ocfs2_find_subtree_root(struct inode *inode,
1150                                    struct ocfs2_path *left,
1151                                    struct ocfs2_path *right)
1152 {
1153         int i = 0;
1154
1155         /*
1156          * Check that the caller passed in two paths from the same tree.
1157          */
1158         BUG_ON(path_root_bh(left) != path_root_bh(right));
1159
1160         do {
1161                 i++;
1162
1163                 /*
1164                  * The caller didn't pass two adjacent paths.
1165                  */
1166                 mlog_bug_on_msg(i > left->p_tree_depth,
1167                                 "Inode %lu, left depth %u, right depth %u\n"
1168                                 "left leaf blk %llu, right leaf blk %llu\n",
1169                                 inode->i_ino, left->p_tree_depth,
1170                                 right->p_tree_depth,
1171                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1172                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1173         } while (left->p_node[i].bh->b_blocknr ==
1174                  right->p_node[i].bh->b_blocknr);
1175
1176         return i - 1;
1177 }
1178
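/*
 * Example (illustrative): for two adjacent paths in a tree of depth 3
 * that share the same blocks at indexes 0 and 1 and first differ at
 * index 2, ocfs2_find_subtree_root() returns 1 - the lowest node which
 * still contains both leaves, i.e. the root of the subtree the
 * rotation will operate on.
 */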
1179 typedef void (path_insert_t)(void *, struct buffer_head *);
1180
1181 /*
1182  * Traverse a btree path in search of cpos, starting at root_el.
1183  *
1184  * This code can be called with a cpos larger than the tree, in which
1185  * case it will return the rightmost path.
1186  */
1187 static int __ocfs2_find_path(struct inode *inode,
1188                              struct ocfs2_extent_list *root_el, u32 cpos,
1189                              path_insert_t *func, void *data)
1190 {
1191         int i, ret = 0;
1192         u32 range;
1193         u64 blkno;
1194         struct buffer_head *bh = NULL;
1195         struct ocfs2_extent_block *eb;
1196         struct ocfs2_extent_list *el;
1197         struct ocfs2_extent_rec *rec;
1198         struct ocfs2_inode_info *oi = OCFS2_I(inode);
1199
1200         el = root_el;
1201         while (el->l_tree_depth) {
1202                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1203                         ocfs2_error(inode->i_sb,
1204                                     "Inode %llu has empty extent list at "
1205                                     "depth %u\n",
1206                                     (unsigned long long)oi->ip_blkno,
1207                                     le16_to_cpu(el->l_tree_depth));
1208                         ret = -EROFS;
1209                         goto out;
1210
1211                 }
1212
1213                 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1214                         rec = &el->l_recs[i];
1215
1216                         /*
1217                          * In the case that cpos is off the allocation
1218                          * tree, this should just wind up returning the
1219                          * rightmost record.
1220                          */
1221                         range = le32_to_cpu(rec->e_cpos) +
1222                                 ocfs2_rec_clusters(el, rec);
1223                         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1224                             break;
1225                 }
1226
1227                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1228                 if (blkno == 0) {
1229                         ocfs2_error(inode->i_sb,
1230                                     "Inode %llu has bad blkno in extent list "
1231                                     "at depth %u (index %d)\n",
1232                                     (unsigned long long)oi->ip_blkno,
1233                                     le16_to_cpu(el->l_tree_depth), i);
1234                         ret = -EROFS;
1235                         goto out;
1236                 }
1237
1238                 brelse(bh);
1239                 bh = NULL;
1240                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
1241                                        &bh, OCFS2_BH_CACHED, inode);
1242                 if (ret) {
1243                         mlog_errno(ret);
1244                         goto out;
1245                 }
1246
1247                 eb = (struct ocfs2_extent_block *) bh->b_data;
1248                 el = &eb->h_list;
1249                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1250                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1251                         ret = -EIO;
1252                         goto out;
1253                 }
1254
1255                 if (le16_to_cpu(el->l_next_free_rec) >
1256                     le16_to_cpu(el->l_count)) {
1257                         ocfs2_error(inode->i_sb,
1258                                     "Inode %llu has bad count in extent list "
1259                                     "at block %llu (next free=%u, count=%u)\n",
1260                                     (unsigned long long)oi->ip_blkno,
1261                                     (unsigned long long)bh->b_blocknr,
1262                                     le16_to_cpu(el->l_next_free_rec),
1263                                     le16_to_cpu(el->l_count));
1264                         ret = -EROFS;
1265                         goto out;
1266                 }
1267
1268                 if (func)
1269                         func(data, bh);
1270         }
1271
1272 out:
1273         /*
1274          * Catch any trailing bh that the loop didn't handle.
1275          */
1276         brelse(bh);
1277
1278         return ret;
1279 }
1280
1281 /*
1282  * Given an initialized path (that is, it has a valid root extent
1283  * list), this function will traverse the btree in search of the path
1284  * which would contain cpos.
1285  *
1286  * The path traveled is recorded in the path structure.
1287  *
1288  * Note that this will not do any comparisons on leaf node extent
1289  * records, so it will work fine in the case that we just added a tree
1290  * branch.
1291  */
1292 struct find_path_data {
1293         int index;
1294         struct ocfs2_path *path;
1295 };
1296 static void find_path_ins(void *data, struct buffer_head *bh)
1297 {
1298         struct find_path_data *fp = data;
1299
1300         get_bh(bh);
1301         ocfs2_path_insert_eb(fp->path, fp->index, bh);
1302         fp->index++;
1303 }
1304 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1305                            u32 cpos)
1306 {
1307         struct find_path_data data;
1308
1309         data.index = 1;
1310         data.path = path;
1311         return __ocfs2_find_path(inode, path_root_el(path), cpos,
1312                                  find_path_ins, &data);
1313 }
1314
1315 static void find_leaf_ins(void *data, struct buffer_head *bh)
1316 {
1317         struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1318         struct ocfs2_extent_list *el = &eb->h_list;
1319         struct buffer_head **ret = data;
1320
1321         /* We want to retain only the leaf block. */
1322         if (le16_to_cpu(el->l_tree_depth) == 0) {
1323                 get_bh(bh);
1324                 *ret = bh;
1325         }
1326 }
1327 /*
1328  * Find the leaf block in the tree which would contain cpos. No
1329  * checking of the actual leaf is done.
1330  *
1331  * Some paths want to call this instead of allocating a path structure
1332  * and calling ocfs2_find_path().
1333  *
1334  * This function doesn't handle non-btree extent lists.
1335  */
1336 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1337                     u32 cpos, struct buffer_head **leaf_bh)
1338 {
1339         int ret;
1340         struct buffer_head *bh = NULL;
1341
1342         ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1343         if (ret) {
1344                 mlog_errno(ret);
1345                 goto out;
1346         }
1347
1348         *leaf_bh = bh;
1349 out:
1350         return ret;
1351 }
1352
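/*
 * Example use of ocfs2_find_leaf() (illustrative sketch; assumes the
 * inode's list is an actual btree, i.e. tree depth > 0, and trims
 * error handling):
 *
 *	struct buffer_head *leaf_bh = NULL;
 *	struct ocfs2_extent_list *el;
 *
 *	ret = ocfs2_find_leaf(inode, &di->id2.i_list, cpos, &leaf_bh);
 *	if (!ret) {
 *		el = &((struct ocfs2_extent_block *) leaf_bh->b_data)->h_list;
 *		... e.g. ocfs2_search_extent_list(el, cpos) ...
 *		brelse(leaf_bh);
 *	}
 */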
1353 /*
1354  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1355  *
1356  * Basically, we've moved stuff around at the bottom of the tree and
1357  * we need to fix up the extent records above the changes to reflect
1358  * them.
1359  *
1360  * left_rec: the record on the left.
1361  * left_child_el: the child list pointed to by left_rec
1362  * right_rec: the record to the right of left_rec
1363  * right_child_el: the child list pointed to by right_rec
1364  *
1365  * By definition, this only works on interior nodes.
1366  */
1367 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1368                                   struct ocfs2_extent_list *left_child_el,
1369                                   struct ocfs2_extent_rec *right_rec,
1370                                   struct ocfs2_extent_list *right_child_el)
1371 {
1372         u32 left_clusters, right_end;
1373
1374         /*
1375          * Interior nodes never have holes. Their cpos is the cpos of
1376          * the leftmost record in their child list. Their cluster
1377          * count covers the full theoretical range of their child list
1378          * - the range between their cpos and the cpos of the record
1379          * immediately to their right.
1380          */
1381         left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1382         if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1383                 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1384                 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1385         }
1386         left_clusters -= le32_to_cpu(left_rec->e_cpos);
1387         left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1388
1389         /*
1390          * Calculate the rightmost cluster count boundary before
1391          * moving cpos - we will need to adjust clusters after
1392          * updating e_cpos to keep the same highest cluster count.
1393          */
1394         right_end = le32_to_cpu(right_rec->e_cpos);
1395         right_end += le32_to_cpu(right_rec->e_int_clusters);
1396
1397         right_rec->e_cpos = left_rec->e_cpos;
1398         le32_add_cpu(&right_rec->e_cpos, left_clusters);
1399
1400         right_end -= le32_to_cpu(right_rec->e_cpos);
1401         right_rec->e_int_clusters = cpu_to_le32(right_end);
1402 }
1403
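/*
 * Worked example for ocfs2_adjust_adjacent_records() (illustrative
 * numbers): say left_rec is { e_cpos = 0 } and right_rec is
 * { e_cpos = 20, e_int_clusters = 30 }, so the right child used to
 * cover clusters 20-49, and after the leaf rotation the right child's
 * first real record now starts at cpos 24. Then:
 *
 *	left_rec:  e_int_clusters = 24 - 0  = 24
 *	right_rec: e_cpos         = 0 + 24  = 24
 *	           e_int_clusters = 50 - 24 = 26
 *
 * The overall range covered (clusters 0 through 49) is unchanged; only
 * the boundary between the two interior records has moved.
 */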
1404 /*
1405  * Adjust the adjacent root node records involved in a
1406  * rotation. left_el_blkno is passed in as a key so that we can easily
1407  * find its index in the root list.
1408  */
1409 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1410                                       struct ocfs2_extent_list *left_el,
1411                                       struct ocfs2_extent_list *right_el,
1412                                       u64 left_el_blkno)
1413 {
1414         int i;
1415
1416         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1417                le16_to_cpu(left_el->l_tree_depth));
1418
1419         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1420                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1421                         break;
1422         }
1423
1424         /*
1425          * The path walking code should have never returned a root and
1426          * two paths which are not adjacent.
1427          */
1428         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1429
1430         ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1431                                       &root_el->l_recs[i + 1], right_el);
1432 }
1433
1434 /*
1435  * We've changed a leaf block (in right_path) and need to reflect that
1436  * change back up the subtree.
1437  *
1438  * This happens in multiple places:
1439  *   - When we've moved an extent record from the left path leaf to the right
1440  *     path leaf to make room for an empty extent in the left path leaf.
1441  *   - When our insert into the right path leaf is at the leftmost edge
1442  *     and requires an update of the path immediately to its left. This
1443  *     can occur at the end of some types of rotation and appending inserts.
1444  *   - When we've adjusted the last extent record in the left path leaf and the
1445  *     1st extent record in the right path leaf during cross extent block merge.
1446  */
1447 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1448                                        struct ocfs2_path *left_path,
1449                                        struct ocfs2_path *right_path,
1450                                        int subtree_index)
1451 {
1452         int ret, i, idx;
1453         struct ocfs2_extent_list *el, *left_el, *right_el;
1454         struct ocfs2_extent_rec *left_rec, *right_rec;
1455         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1456
1457         /*
1458          * Update the counts and position values within all the
1459          * interior nodes to reflect the leaf rotation we just did.
1460          *
1461          * The root node is handled below the loop.
1462          *
1463          * We begin the loop with right_el and left_el pointing to the
1464          * leaf lists and work our way up.
1465          *
1466          * NOTE: within this loop, left_el and right_el always refer
1467          * to the *child* lists.
1468          */
1469         left_el = path_leaf_el(left_path);
1470         right_el = path_leaf_el(right_path);
1471         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1472                 mlog(0, "Adjust records at index %u\n", i);
1473
1474                 /*
1475                  * One nice property of knowing that all of these
1476                  * nodes are below the root is that we only deal with
1477                  * the leftmost right node record and the rightmost
1478                  * left node record.
1479                  */
1480                 el = left_path->p_node[i].el;
1481                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1482                 left_rec = &el->l_recs[idx];
1483
1484                 el = right_path->p_node[i].el;
1485                 right_rec = &el->l_recs[0];
1486
1487                 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1488                                               right_el);
1489
1490                 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1491                 if (ret)
1492                         mlog_errno(ret);
1493
1494                 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1495                 if (ret)
1496                         mlog_errno(ret);
1497
1498                 /*
1499                  * Set up our list pointers now so that the current
1500                  * parents become children in the next iteration.
1501                  */
1502                 left_el = left_path->p_node[i].el;
1503                 right_el = right_path->p_node[i].el;
1504         }
1505
1506         /*
1507          * At the root node, adjust the two adjacent records which
1508          * begin our path to the leaves.
1509          */
1510
1511         el = left_path->p_node[subtree_index].el;
1512         left_el = left_path->p_node[subtree_index + 1].el;
1513         right_el = right_path->p_node[subtree_index + 1].el;
1514
1515         ocfs2_adjust_root_records(el, left_el, right_el,
1516                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
1517
1518         root_bh = left_path->p_node[subtree_index].bh;
1519
1520         ret = ocfs2_journal_dirty(handle, root_bh);
1521         if (ret)
1522                 mlog_errno(ret);
1523 }
1524
1525 static int ocfs2_rotate_subtree_right(struct inode *inode,
1526                                       handle_t *handle,
1527                                       struct ocfs2_path *left_path,
1528                                       struct ocfs2_path *right_path,
1529                                       int subtree_index)
1530 {
1531         int ret, i;
1532         struct buffer_head *right_leaf_bh;
1533         struct buffer_head *left_leaf_bh = NULL;
1534         struct buffer_head *root_bh;
1535         struct ocfs2_extent_list *right_el, *left_el;
1536         struct ocfs2_extent_rec move_rec;
1537
1538         left_leaf_bh = path_leaf_bh(left_path);
1539         left_el = path_leaf_el(left_path);
1540
1541         if (left_el->l_next_free_rec != left_el->l_count) {
1542                 ocfs2_error(inode->i_sb,
1543                             "Inode %llu has non-full interior leaf node %llu "
1544                             "(next free = %u)",
1545                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
1546                             (unsigned long long)left_leaf_bh->b_blocknr,
1547                             le16_to_cpu(left_el->l_next_free_rec));
1548                 return -EROFS;
1549         }
1550
1551         /*
1552          * This extent block may already have an empty record, so we
1553          * return early if so.
1554          */
1555         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1556                 return 0;
1557
1558         root_bh = left_path->p_node[subtree_index].bh;
1559         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1560
1561         ret = ocfs2_journal_access(handle, inode, root_bh,
1562                                    OCFS2_JOURNAL_ACCESS_WRITE);
1563         if (ret) {
1564                 mlog_errno(ret);
1565                 goto out;
1566         }
1567
1568         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1569                 ret = ocfs2_journal_access(handle, inode,
1570                                            right_path->p_node[i].bh,
1571                                            OCFS2_JOURNAL_ACCESS_WRITE);
1572                 if (ret) {
1573                         mlog_errno(ret);
1574                         goto out;
1575                 }
1576
1577                 ret = ocfs2_journal_access(handle, inode,
1578                                            left_path->p_node[i].bh,
1579                                            OCFS2_JOURNAL_ACCESS_WRITE);
1580                 if (ret) {
1581                         mlog_errno(ret);
1582                         goto out;
1583                 }
1584         }
1585
1586         right_leaf_bh = path_leaf_bh(right_path);
1587         right_el = path_leaf_el(right_path);
1588
1589         /* This is a code error, not a disk corruption. */
1590         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1591                         "because rightmost leaf block %llu is empty\n",
1592                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
1593                         (unsigned long long)right_leaf_bh->b_blocknr);
1594
1595         ocfs2_create_empty_extent(right_el);
1596
1597         ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1598         if (ret) {
1599                 mlog_errno(ret);
1600                 goto out;
1601         }
1602
1603         /* Do the copy now. */
1604         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1605         move_rec = left_el->l_recs[i];
1606         right_el->l_recs[0] = move_rec;
1607
1608         /*
1609          * Clear out the record we just copied and shift everything
1610          * over, leaving an empty extent in the left leaf.
1611          *
1612          * We temporarily subtract from next_free_rec so that the
1613          * shift will lose the tail record (which is now defunct).
1614          */
1615         le16_add_cpu(&left_el->l_next_free_rec, -1);
1616         ocfs2_shift_records_right(left_el);
1617         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1618         le16_add_cpu(&left_el->l_next_free_rec, 1);
1619
1620         ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1621         if (ret) {
1622                 mlog_errno(ret);
1623                 goto out;
1624         }
1625
1626         ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1627                                 subtree_index);
1628
1629 out:
1630         return ret;
1631 }
1632
1633 /*
1634  * Given a full path, determine what cpos value would return us a path
1635  * containing the leaf immediately to the left of the current one.
1636  *
1637  * Will return zero if the path passed in is already the leftmost path.
1638  */
1639 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1640                                          struct ocfs2_path *path, u32 *cpos)
1641 {
1642         int i, j, ret = 0;
1643         u64 blkno;
1644         struct ocfs2_extent_list *el;
1645
1646         BUG_ON(path->p_tree_depth == 0);
1647
1648         *cpos = 0;
1649
1650         blkno = path_leaf_bh(path)->b_blocknr;
1651
1652         /* Start at the tree node just above the leaf and work our way up. */
1653         i = path->p_tree_depth - 1;
1654         while (i >= 0) {
1655                 el = path->p_node[i].el;
1656
1657                 /*
1658                  * Find the extent record just before the one in our
1659                  * path.
1660                  */
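                /*
                 * Illustrative note: the cpos handed back below is the
                 * last cluster covered by that previous record - e.g. a
                 * record starting at cpos 64 and spanning 16 clusters
                 * gives 64 + 16 - 1 = 79, which is guaranteed to land
                 * in the leaf to our left.
                 */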
1661                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1662                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1663                                 if (j == 0) {
1664                                         if (i == 0) {
1665                                                 /*
1666                                                  * We've determined that the
1667                                                  * path specified is already
1668                                                  * the leftmost one - return a
1669                                                  * cpos of zero.
1670                                                  */
1671                                                 goto out;
1672                                         }
1673                                         /*
1674                                          * The leftmost record points to our
1675                                          * leaf - we need to travel up the
1676                                          * tree one level.
1677                                          */
1678                                         goto next_node;
1679                                 }
1680
1681                                 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
1682                                 *cpos = *cpos + ocfs2_rec_clusters(el,
1683                                                            &el->l_recs[j - 1]);
1684                                 *cpos = *cpos - 1;
1685                                 goto out;
1686                         }
1687                 }
1688
1689                 /*
1690                  * If we got here, we never found a valid node where
1691                  * the tree indicated one should be.
1692                  */
1693                 ocfs2_error(sb,
1694                             "Invalid extent tree at extent block %llu\n",
1695                             (unsigned long long)blkno);
1696                 ret = -EROFS;
1697                 goto out;
1698
1699 next_node:
1700                 blkno = path->p_node[i].bh->b_blocknr;
1701                 i--;
1702         }
1703
1704 out:
1705         return ret;
1706 }
1707
1708 /*
1709  * Extend the transaction by enough credits to complete the rotation,
1710  * and still leave at least the original number of credits allocated
1711  * to this transaction.
1712  */
1713 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1714                                            int op_credits,
1715                                            struct ocfs2_path *path)
1716 {
1717         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
1718
1719         if (handle->h_buffer_credits < credits)
1720                 return ocfs2_extend_trans(handle, credits);
1721
1722         return 0;
1723 }
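/*
 * A rough reading of the arithmetic above (illustrative): with a tree
 * depth of 3 and a subtree root at depth 1, a rotation can dirty two
 * node buffers per level below the subtree root (one on the left path,
 * one on the right) plus the shared root itself, so we want at least
 * (3 - 1) * 2 + 1 = 5 credits on top of op_credits.
 */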
1724
1725 /*
1726  * Trap the case where we're inserting into the theoretical range past
1727  * the _actual_ left leaf range. Otherwise, we'll rotate a record
1728  * whose cpos is less than ours into the right leaf.
1729  *
1730  * It's only necessary to look at the rightmost record of the left
1731  * leaf because the logic that calls us should ensure that the
1732  * theoretical ranges in the path components above the leaves are
1733  * correct.
1734  */
1735 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1736                                                  u32 insert_cpos)
1737 {
1738         struct ocfs2_extent_list *left_el;
1739         struct ocfs2_extent_rec *rec;
1740         int next_free;
1741
1742         left_el = path_leaf_el(left_path);
1743         next_free = le16_to_cpu(left_el->l_next_free_rec);
1744         rec = &left_el->l_recs[next_free - 1];
1745
1746         if (insert_cpos > le32_to_cpu(rec->e_cpos))
1747                 return 1;
1748         return 0;
1749 }
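/*
 * For example (illustrative): if the last record in the left leaf
 * starts at cpos 100 and insert_cpos is 160, then 160 > 100 and we
 * return 1 - rotating that record rightward would put an extent with a
 * smaller cpos than ours into the right leaf, so the caller stops
 * rotating and leaves the final adjustment to ocfs2_insert_path().
 */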
1750
1751 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
1752 {
1753         int next_free = le16_to_cpu(el->l_next_free_rec);
1754         unsigned int range;
1755         struct ocfs2_extent_rec *rec;
1756
1757         if (next_free == 0)
1758                 return 0;
1759
1760         rec = &el->l_recs[0];
1761         if (ocfs2_is_empty_extent(rec)) {
1762                 /* Empty list. */
1763                 if (next_free == 1)
1764                         return 0;
1765                 rec = &el->l_recs[1];
1766         }
1767
1768         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1769         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1770                 return 1;
1771         return 0;
1772 }
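/*
 * e.g. (illustrative): if the first non-empty record starts at cpos 32
 * and covers 8 clusters, cpos values 32 through 39 return 1, while
 * cpos 40 falls past the record and returns 0.
 */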
1773
1774 /*
1775  * Rotate all the records in a btree right one record, starting at insert_cpos.
1776  *
1777  * The path to the rightmost leaf should be passed in.
1778  *
1779  * The array is assumed to be large enough to hold an entire path (tree depth).
1780  *
1781  * Upon successful return from this function:
1782  *
1783  * - The 'right_path' array will contain a path to the leaf block
1784  *   whose range contains e_cpos.
1785  * - That leaf block will have a single empty extent in list index 0.
1786  * - In the case that the rotation requires a post-insert update,
1787  *   *ret_left_path will contain a valid path which can be passed to
1788  *   ocfs2_insert_path().
1789  */
1790 static int ocfs2_rotate_tree_right(struct inode *inode,
1791                                    handle_t *handle,
1792                                    enum ocfs2_split_type split,
1793                                    u32 insert_cpos,
1794                                    struct ocfs2_path *right_path,
1795                                    struct ocfs2_path **ret_left_path)
1796 {
1797         int ret, start, orig_credits = handle->h_buffer_credits;
1798         u32 cpos;
1799         struct ocfs2_path *left_path = NULL;
1800
1801         *ret_left_path = NULL;
1802
1803         left_path = ocfs2_new_path(path_root_bh(right_path),
1804                                    path_root_el(right_path));
1805         if (!left_path) {
1806                 ret = -ENOMEM;
1807                 mlog_errno(ret);
1808                 goto out;
1809         }
1810
1811         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
1812         if (ret) {
1813                 mlog_errno(ret);
1814                 goto out;
1815         }
1816
1817         mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
1818
1819         /*
1820          * What we want to do here is:
1821          *
1822          * 1) Start with the rightmost path.
1823          *
1824          * 2) Determine a path to the leaf block directly to the left
1825          *    of that leaf.
1826          *
1827          * 3) Determine the 'subtree root' - the lowest level tree node
1828          *    which contains a path to both leaves.
1829          *
1830          * 4) Rotate the subtree.
1831          *
1832          * 5) Find the next subtree by considering the left path to be
1833          *    the new right path.
1834          *
1835          * The check at the top of this while loop also accepts
1836          * insert_cpos == cpos because cpos is only a _theoretical_
1837          * value to get us the left path - insert_cpos might very well
1838          * be filling that hole.
1839          *
1840          * Stop at a cpos of '0' because we either started at the
1841          * leftmost branch (i.e., a tree with one branch and a
1842          * rotation inside of it), or we've gone as far as we can in
1843          * rotating subtrees.
1844          */
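        /*
         * Illustrative trace: inserting at cpos 50 into a depth-2 tree,
         * the first pass might find a left leaf cpos of 63; 50 <= 63,
         * so that subtree is rotated and the left path becomes the new
         * right path. If the next left leaf cpos comes back as 31,
         * 50 > 31 and the loop ends.
         */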
1845         while (cpos && insert_cpos <= cpos) {
1846                 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
1847                      insert_cpos, cpos);
1848
1849                 ret = ocfs2_find_path(inode, left_path, cpos);
1850                 if (ret) {
1851                         mlog_errno(ret);
1852                         goto out;
1853                 }
1854
1855                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
1856                                 path_leaf_bh(right_path),
1857                                 "Inode %lu: error during insert of %u "
1858                                 "(left path cpos %u) results in two identical "
1859                                 "paths ending at %llu\n",
1860                                 inode->i_ino, insert_cpos, cpos,
1861                                 (unsigned long long)
1862                                 path_leaf_bh(left_path)->b_blocknr);
1863
1864                 if (split == SPLIT_NONE &&
1865                     ocfs2_rotate_requires_path_adjustment(left_path,
1866                                                           insert_cpos)) {
1867
1868                         /*
1869                          * We've rotated the tree as much as we
1870                          * should. The rest is up to
1871                          * ocfs2_insert_path() to complete, after the
1872                          * record insertion. We indicate this
1873                          * situation by returning the left path.
1874                          *
1875                          * The reason we don't adjust the records here
1876                          * before the record insert is that an error
1877                          * later might break the rule where a parent
1878                          * record e_cpos will reflect the actual
1879                          * e_cpos of the 1st nonempty record of the
1880                          * child list.
1881                          */
1882                         *ret_left_path = left_path;
1883                         goto out_ret_path;
1884                 }
1885
1886                 start = ocfs2_find_subtree_root(inode, left_path, right_path);
1887
1888                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
1889                      start,
1890                      (unsigned long long) right_path->p_node[start].bh->b_blocknr,
1891                      right_path->p_tree_depth);
1892
1893                 ret = ocfs2_extend_rotate_transaction(handle, start,
1894                                                       orig_credits, right_path);
1895                 if (ret) {
1896                         mlog_errno(ret);
1897                         goto out;
1898                 }
1899
1900                 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
1901                                                  right_path, start);
1902                 if (ret) {
1903                         mlog_errno(ret);
1904                         goto out;
1905                 }
1906
1907                 if (split != SPLIT_NONE &&
1908                     ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
1909                                                 insert_cpos)) {
1910                         /*
1911                          * A rotate moves the rightmost left leaf
1912                          * record over to the leftmost right leaf
1913                          * slot. If we're doing an extent split
1914                          * instead of a real insert, then we have to
1915                          * check that the extent to be split wasn't
1916                          * just moved over. If it was, then we can
1917                          * exit here, passing left_path back -
1918                          * ocfs2_split_extent() is smart enough to
1919                          * search both leaves.
1920                          */
1921                         *ret_left_path = left_path;
1922                         goto out_ret_path;
1923                 }
1924
1925                 /*
1926                  * There is no need to re-read the next right path
1927                  * as we know that it'll be our current left
1928                  * path. Optimize by copying values instead.
1929                  */
1930                 ocfs2_mv_path(right_path, left_path);
1931
1932                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1933                                                     &cpos);
1934                 if (ret) {
1935                         mlog_errno(ret);
1936                         goto out;
1937                 }
1938         }
1939
1940 out:
1941         ocfs2_free_path(left_path);
1942
1943 out_ret_path:
1944         return ret;
1945 }
1946
1947 static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
1948                                       struct ocfs2_path *path)
1949 {
1950         int i, idx;
1951         struct ocfs2_extent_rec *rec;
1952         struct ocfs2_extent_list *el;
1953         struct ocfs2_extent_block *eb;
1954         u32 range;
1955
1956         /* Path should always be rightmost. */
1957         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
1958         BUG_ON(eb->h_next_leaf_blk != 0ULL);
1959
1960         el = &eb->h_list;
1961         BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1962         idx = le16_to_cpu(el->l_next_free_rec) - 1;
1963         rec = &el->l_recs[idx];
1964         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1965
1966         for (i = 0; i < path->p_tree_depth; i++) {
1967                 el = path->p_node[i].el;
1968                 idx = le16_to_cpu(el->l_next_free_rec) - 1;
1969                 rec = &el->l_recs[idx];
1970
1971                 rec->e_int_clusters = cpu_to_le32(range);
1972                 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
1973
1974                 ocfs2_journal_dirty(handle, path->p_node[i].bh);
1975         }
1976 }
1977
1978 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
1979                               struct ocfs2_cached_dealloc_ctxt *dealloc,
1980                               struct ocfs2_path *path, int unlink_start)
1981 {
1982         int ret, i;
1983         struct ocfs2_extent_block *eb;
1984         struct ocfs2_extent_list *el;
1985         struct buffer_head *bh;
1986
1987         for(i = unlink_start; i < path_num_items(path); i++) {
1988                 bh = path->p_node[i].bh;
1989
1990                 eb = (struct ocfs2_extent_block *)bh->b_data;
1991                 /*
1992                  * Not all nodes might have had their final count
1993                  * decremented by the caller - handle this here.
1994                  */
1995                 el = &eb->h_list;
1996                 if (le16_to_cpu(el->l_next_free_rec) > 1) {
1997                         mlog(ML_ERROR,
1998                              "Inode %llu, attempted to remove extent block "
1999                              "%llu with %u records\n",
2000                              (unsigned long long)OCFS2_I(inode)->ip_blkno,
2001                              (unsigned long long)le64_to_cpu(eb->h_blkno),
2002                              le16_to_cpu(el->l_next_free_rec));
2003
2004                         ocfs2_journal_dirty(handle, bh);
2005                         ocfs2_remove_from_cache(inode, bh);
2006                         continue;
2007                 }
2008
2009                 el->l_next_free_rec = 0;
2010                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2011
2012                 ocfs2_journal_dirty(handle, bh);
2013
2014                 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2015                 if (ret)
2016                         mlog_errno(ret);
2017
2018                 ocfs2_remove_from_cache(inode, bh);
2019         }
2020 }
2021
2022 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2023                                  struct ocfs2_path *left_path,
2024                                  struct ocfs2_path *right_path,
2025                                  int subtree_index,
2026                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
2027 {
2028         int i;
2029         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2030         struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2031         struct ocfs2_extent_list *el;
2032         struct ocfs2_extent_block *eb;
2033
2034         el = path_leaf_el(left_path);
2035
2036         eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2037
2038         for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2039                 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2040                         break;
2041
2042         BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2043
2044         memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2045         le16_add_cpu(&root_el->l_next_free_rec, -1);
2046
2047         eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2048         eb->h_next_leaf_blk = 0;
2049
2050         ocfs2_journal_dirty(handle, root_bh);
2051         ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2052
2053         ocfs2_unlink_path(inode, handle, dealloc, right_path,
2054                           subtree_index + 1);
2055 }
2056
2057 static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2058                                      struct ocfs2_path *left_path,
2059                                      struct ocfs2_path *right_path,
2060                                      int subtree_index,
2061                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
2062                                      int *deleted)
2063 {
2064         int ret, i, del_right_subtree = 0, right_has_empty = 0;
2065         struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
2066         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2067         struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2068         struct ocfs2_extent_block *eb;
2069
2070         *deleted = 0;
2071
2072         right_leaf_el = path_leaf_el(right_path);
2073         left_leaf_el = path_leaf_el(left_path);
2074         root_bh = left_path->p_node[subtree_index].bh;
2075         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2076
2077         if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2078                 return 0;
2079
2080         eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2081         if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2082                 /*
2083                  * It's legal for us to proceed if the right leaf is
2084                  * the rightmost one and it has an empty extent. There
2085                  * are two cases to handle - whether the leaf will be
2086                  * empty after removal or not. If the leaf isn't empty
2087                  * then just remove the empty extent up front. The
2088                  * next block will handle empty leaves by flagging
2089                  * them for unlink.
2090                  *
2091          * Non-rightmost leaves will return -EAGAIN and the
2092                  * caller can manually move the subtree and retry.
2093                  */
2094
2095                 if (eb->h_next_leaf_blk != 0ULL)
2096                         return -EAGAIN;
2097
2098                 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2099                         ret = ocfs2_journal_access(handle, inode,
2100                                                    path_leaf_bh(right_path),
2101                                                    OCFS2_JOURNAL_ACCESS_WRITE);
2102                         if (ret) {
2103                                 mlog_errno(ret);
2104                                 goto out;
2105                         }
2106
2107                         ocfs2_remove_empty_extent(right_leaf_el);
2108                 } else
2109                         right_has_empty = 1;
2110         }
2111
2112         if (eb->h_next_leaf_blk == 0ULL &&
2113             le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2114                 /*
2115                  * We have to update i_last_eb_blk during the meta
2116                  * data delete.
2117                  */
2118                 ret = ocfs2_journal_access(handle, inode, di_bh,
2119                                            OCFS2_JOURNAL_ACCESS_WRITE);
2120                 if (ret) {
2121                         mlog_errno(ret);
2122                         goto out;
2123                 }
2124
2125                 del_right_subtree = 1;
2126         }
2127
2128         /*
2129          * Getting here with an empty extent in the right path implies
2130          * that it's the rightmost path and will be deleted.
2131          */
2132         BUG_ON(right_has_empty && !del_right_subtree);
2133
2134         ret = ocfs2_journal_access(handle, inode, root_bh,
2135                                    OCFS2_JOURNAL_ACCESS_WRITE);
2136         if (ret) {
2137                 mlog_errno(ret);
2138                 goto out;
2139         }
2140
2141         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2142                 ret = ocfs2_journal_access(handle, inode,
2143                                            right_path->p_node[i].bh,
2144                                            OCFS2_JOURNAL_ACCESS_WRITE);
2145                 if (ret) {
2146                         mlog_errno(ret);
2147                         goto out;
2148                 }
2149
2150                 ret = ocfs2_journal_access(handle, inode,
2151                                            left_path->p_node[i].bh,
2152                                            OCFS2_JOURNAL_ACCESS_WRITE);
2153                 if (ret) {
2154                         mlog_errno(ret);
2155                         goto out;
2156                 }
2157         }
2158
2159         if (!right_has_empty) {
2160                 /*
2161                  * Only do this if we're moving a real
2162                  * record. Otherwise, the action is delayed until
2163                  * after removal of the right path in which case we
2164                  * can do a simple shift to remove the empty extent.
2165                  */
2166                 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2167                 memset(&right_leaf_el->l_recs[0], 0,
2168                        sizeof(struct ocfs2_extent_rec));
2169         }
2170         if (eb->h_next_leaf_blk == 0ULL) {
2171                 /*
2172                  * Move recs over to get rid of empty extent, decrease
2173                  * next_free. This is allowed to remove the last
2174                  * extent in our leaf (setting l_next_free_rec to
2175                  * zero) - the delete code below won't care.
2176                  */
2177                 ocfs2_remove_empty_extent(right_leaf_el);
2178         }
2179
2180         ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2181         if (ret)
2182                 mlog_errno(ret);
2183         ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2184         if (ret)
2185                 mlog_errno(ret);
2186
2187         if (del_right_subtree) {
2188                 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2189                                      subtree_index, dealloc);
2190                 ocfs2_update_edge_lengths(inode, handle, left_path);
2191
2192                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2193                 di->i_last_eb_blk = eb->h_blkno;
2194
2195                 /*
2196                  * Removal of the extent in the left leaf was skipped
2197                  * above so we could delete the right path
2198                  * 1st.
2199                  */
2200                 if (right_has_empty)
2201                         ocfs2_remove_empty_extent(left_leaf_el);
2202
2203                 ret = ocfs2_journal_dirty(handle, di_bh);
2204                 if (ret)
2205                         mlog_errno(ret);
2206
2207                 *deleted = 1;
2208         } else
2209                 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2210                                            subtree_index);
2211
2212 out:
2213         return ret;
2214 }
2215
2216 /*
2217  * Given a full path, determine what cpos value would return us a path
2218  * containing the leaf immediately to the right of the current one.
2219  *
2220  * Will return zero if the path passed in is already the rightmost path.
2221  *
2222  * This looks similar, but is subtly different to
2223  * ocfs2_find_cpos_for_left_leaf().
2224  */
2225 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2226                                           struct ocfs2_path *path, u32 *cpos)
2227 {
2228         int i, j, ret = 0;
2229         u64 blkno;
2230         struct ocfs2_extent_list *el;
2231
2232         *cpos = 0;
2233
2234         if (path->p_tree_depth == 0)
2235                 return 0;
2236
2237         blkno = path_leaf_bh(path)->b_blocknr;
2238
2239         /* Start at the tree node just above the leaf and work our way up. */
2240         i = path->p_tree_depth - 1;
2241         while (i >= 0) {
2242                 int next_free;
2243
2244                 el = path->p_node[i].el;
2245
2246                 /*
2247                  * Find the extent record just after the one in our
2248                  * path.
2249                  */
2250                 next_free = le16_to_cpu(el->l_next_free_rec);
2251                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2252                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2253                                 if (j == (next_free - 1)) {
2254                                         if (i == 0) {
2255                                                 /*
2256                                                  * We've determined that the
2257                                                  * path specified is already
2258                                                  * the rightmost one - return a
2259                                                  * cpos of zero.
2260                                                  */
2261                                                 goto out;
2262                                         }
2263                                         /*
2264                                          * The rightmost record points to our
2265                                          * leaf - we need to travel up the
2266                                          * tree one level.
2267                                          */
2268                                         goto next_node;
2269                                 }
2270
2271                                 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2272                                 goto out;
2273                         }
2274                 }
2275
2276                 /*
2277                  * If we got here, we never found a valid node where
2278                  * the tree indicated one should be.
2279                  */
2280                 ocfs2_error(sb,
2281                             "Invalid extent tree at extent block %llu\n",
2282                             (unsigned long long)blkno);
2283                 ret = -EROFS;
2284                 goto out;
2285
2286 next_node:
2287                 blkno = path->p_node[i].bh->b_blocknr;
2288                 i--;
2289         }
2290
2291 out:
2292         return ret;
2293 }
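/*
 * Note (illustrative): unlike the left-leaf variant, no arithmetic is
 * needed here - the e_cpos of the record just after ours is, by
 * definition, the first cluster mapped by the leaf to our right.
 */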
2294
2295 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2296                                             handle_t *handle,
2297                                             struct buffer_head *bh,
2298                                             struct ocfs2_extent_list *el)
2299 {
2300         int ret;
2301
2302         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2303                 return 0;
2304
2305         ret = ocfs2_journal_access(handle, inode, bh,
2306                                    OCFS2_JOURNAL_ACCESS_WRITE);
2307         if (ret) {
2308                 mlog_errno(ret);
2309                 goto out;
2310         }
2311
2312         ocfs2_remove_empty_extent(el);
2313
2314         ret = ocfs2_journal_dirty(handle, bh);
2315         if (ret)
2316                 mlog_errno(ret);
2317
2318 out:
2319         return ret;
2320 }
2321
2322 static int __ocfs2_rotate_tree_left(struct inode *inode,
2323                                     handle_t *handle, int orig_credits,
2324                                     struct ocfs2_path *path,
2325                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2326                                     struct ocfs2_path **empty_extent_path)
2327 {
2328         int ret, subtree_root, deleted;
2329         u32 right_cpos;
2330         struct ocfs2_path *left_path = NULL;
2331         struct ocfs2_path *right_path = NULL;
2332
2333         BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2334
2335         *empty_extent_path = NULL;
2336
2337         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2338                                              &right_cpos);
2339         if (ret) {
2340                 mlog_errno(ret);
2341                 goto out;
2342         }
2343
2344         left_path = ocfs2_new_path(path_root_bh(path),
2345                                    path_root_el(path));
2346         if (!left_path) {
2347                 ret = -ENOMEM;
2348                 mlog_errno(ret);
2349                 goto out;
2350         }
2351
2352         ocfs2_cp_path(left_path, path);
2353
2354         right_path = ocfs2_new_path(path_root_bh(path),
2355                                     path_root_el(path));
2356         if (!right_path) {
2357                 ret = -ENOMEM;
2358                 mlog_errno(ret);
2359                 goto out;
2360         }
2361
2362         while (right_cpos) {
2363                 ret = ocfs2_find_path(inode, right_path, right_cpos);
2364                 if (ret) {
2365                         mlog_errno(ret);
2366                         goto out;
2367                 }
2368
2369                 subtree_root = ocfs2_find_subtree_root(inode, left_path,
2370                                                        right_path);
2371
2372                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2373                      subtree_root,
2374                      (unsigned long long)
2375                      right_path->p_node[subtree_root].bh->b_blocknr,
2376                      right_path->p_tree_depth);
2377
2378                 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2379                                                       orig_credits, left_path);
2380                 if (ret) {
2381                         mlog_errno(ret);
2382                         goto out;
2383                 }
2384
2385                 /*
2386                  * Caller might still want to make changes to the
2387                  * tree root, so re-add it to the journal here.
2388                  */
2389                 ret = ocfs2_journal_access(handle, inode,
2390                                            path_root_bh(left_path),
2391                                            OCFS2_JOURNAL_ACCESS_WRITE);
2392                 if (ret) {
2393                         mlog_errno(ret);
2394                         goto out;
2395                 }
2396
2397                 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2398                                                 right_path, subtree_root,
2399                                                 dealloc, &deleted);
2400                 if (ret == -EAGAIN) {
2401                         /*
2402                          * The rotation has to temporarily stop due to
2403                          * the right subtree having an empty
2404                          * extent. Pass it back to the caller for a
2405                          * fixup.
2406                          */
2407                         *empty_extent_path = right_path;
2408                         right_path = NULL;
2409                         goto out;
2410                 }
2411                 if (ret) {
2412                         mlog_errno(ret);
2413                         goto out;
2414                 }
2415
2416                 /*
2417                  * The subtree rotate might have removed records on
2418                  * the rightmost edge. If so, then rotation is
2419                  * complete.
2420                  */
2421                 if (deleted)
2422                         break;
2423
2424                 ocfs2_mv_path(left_path, right_path);
2425
2426                 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2427                                                      &right_cpos);
2428                 if (ret) {
2429                         mlog_errno(ret);
2430                         goto out;
2431                 }
2432         }
2433
2434 out:
2435         ocfs2_free_path(right_path);
2436         ocfs2_free_path(left_path);
2437
2438         return ret;
2439 }
2440
2441 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2442                                        struct ocfs2_path *path,
2443                                        struct ocfs2_cached_dealloc_ctxt *dealloc)
2444 {
2445         int ret, subtree_index;
2446         u32 cpos;
2447         struct ocfs2_path *left_path = NULL;
2448         struct ocfs2_dinode *di;
2449         struct ocfs2_extent_block *eb;
2450         struct ocfs2_extent_list *el;
2451
2452         /*
2453          * XXX: This code assumes that the root is an inode, which is
2454          * true for now but may change as tree code gets generic.
2455          */
2456         di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
2457         if (!OCFS2_IS_VALID_DINODE(di)) {
2458                 ret = -EIO;
2459                 ocfs2_error(inode->i_sb,
2460                             "Inode %llu has invalid path root",
2461                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
2462                 goto out;
2463         }
2464
2465         /*
2466          * There are two ways we handle this, depending on
2467          * whether path is the only existing one.
2468          */
2469         ret = ocfs2_extend_rotate_transaction(handle, 0,
2470                                               handle->h_buffer_credits,
2471                                               path);
2472         if (ret) {
2473                 mlog_errno(ret);
2474                 goto out;
2475         }
2476
2477         ret = ocfs2_journal_access_path(inode, handle, path);
2478         if (ret) {
2479                 mlog_errno(ret);
2480                 goto out;
2481         }
2482
2483         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2484         if (ret) {
2485                 mlog_errno(ret);
2486                 goto out;
2487         }
2488
2489         if (cpos) {
2490                 /*
2491                  * We have a path to the left of this one - it needs
2492                  * an update too.
2493                  */
2494                 left_path = ocfs2_new_path(path_root_bh(path),
2495                                            path_root_el(path));
2496                 if (!left_path) {
2497                         ret = -ENOMEM;
2498                         mlog_errno(ret);
2499                         goto out;
2500                 }
2501
2502                 ret = ocfs2_find_path(inode, left_path, cpos);
2503                 if (ret) {
2504                         mlog_errno(ret);
2505                         goto out;
2506                 }
2507
2508                 ret = ocfs2_journal_access_path(inode, handle, left_path);
2509                 if (ret) {
2510                         mlog_errno(ret);
2511                         goto out;
2512                 }
2513
2514                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2515
2516                 ocfs2_unlink_subtree(inode, handle, left_path, path,
2517                                      subtree_index, dealloc);
2518                 ocfs2_update_edge_lengths(inode, handle, left_path);
2519
2520                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2521                 di->i_last_eb_blk = eb->h_blkno;
2522         } else {
2523                 /*
2524                  * 'path' is also the leftmost path which
2525                  * means it must be the only one. This gets
2526                  * handled differently because we want to
2527                  * revert the inode back to having extents
2528                  * in-line.
2529                  */
2530                 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2531
2532                 el = &di->id2.i_list;
2533                 el->l_tree_depth = 0;
2534                 el->l_next_free_rec = 0;
2535                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2536
2537                 di->i_last_eb_blk = 0;
2538         }
2539
2540         ocfs2_journal_dirty(handle, path_root_bh(path));
2541
2542 out:
2543         ocfs2_free_path(left_path);
2544         return ret;
2545 }
2546
2547 /*
2548  * Left rotation of btree records.
2549  *
2550  * In many ways, this is (unsurprisingly) the opposite of right
2551  * rotation. We start at some non-rightmost path containing an empty
2552  * extent in the leaf block. The code works its way to the rightmost
2553  * path by rotating records to the left in every subtree.
2554  *
2555  * This is used by any code which reduces the number of extent records
2556  * in a leaf. After removal, an empty record should be placed in the
2557  * leftmost list position.
2558  *
2559  * This won't handle a length update of the rightmost path records if
2560  * the rightmost tree leaf record is removed so the caller is
2561  * responsible for detecting and correcting that.
2562  */
2563 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2564                                   struct ocfs2_path *path,
2565                                   struct ocfs2_cached_dealloc_ctxt *dealloc)
2566 {
2567         int ret, orig_credits = handle->h_buffer_credits;
2568         struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2569         struct ocfs2_extent_block *eb;
2570         struct ocfs2_extent_list *el;
2571
2572         el = path_leaf_el(path);
2573         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2574                 return 0;
2575
2576         if (path->p_tree_depth == 0) {
2577 rightmost_no_delete:
2578                 /*
2579                  * In-inode extents. This is trivially handled, so do
2580                  * it up front.
2581                  */
2582                 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2583                                                        path_leaf_bh(path),
2584                                                        path_leaf_el(path));
2585                 if (ret)
2586                         mlog_errno(ret);
2587                 goto out;
2588         }
2589
2590         /*
2591          * Handle the rightmost branch now. There are several cases:
2592          *  1) simple rotation leaving records in there. That's trivial.
2593          *  2) rotation requiring a branch delete - there's no more
2594          *     records left. Two cases of this:
2595          *     a) There are branches to the left.
2596          *     b) This is also the leftmost (the only) branch.
2597          *
2598          *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
2599          *  2a) we need the left branch so that we can update it with the unlink
2600          *  2b) we need to bring the inode back to inline extents.
2601          */
2602
2603         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2604         el = &eb->h_list;
2605         if (eb->h_next_leaf_blk == 0) {
2606                 /*
2607                  * This gets a bit tricky if we're going to delete the
2608                  * rightmost path. Get the other cases out of the way
2609                  * 1st.
2610                  */
2611                 if (le16_to_cpu(el->l_next_free_rec) > 1)
2612                         goto rightmost_no_delete;
2613
2614                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
2615                         ret = -EIO;
2616                         ocfs2_error(inode->i_sb,
2617                                     "Inode %llu has empty extent block at %llu",
2618                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
2619                                     (unsigned long long)le64_to_cpu(eb->h_blkno));
2620                         goto out;
2621                 }
2622
2623                 /*
2624                  * XXX: The caller can not trust "path" any more after
2625                  * this as it will have been deleted. What do we do?
2626                  *
2627                  * In theory the rotate-for-merge code will never get
2628                  * here because it'll always ask for a rotate in a
2629                  * nonempty list.
2630                  */
2631
2632                 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2633                                                   dealloc);
2634                 if (ret)
2635                         mlog_errno(ret);
2636                 goto out;
2637         }
2638
2639         /*
2640          * Now we can loop, remembering the path we get from -EAGAIN
2641          * and restarting from there.
2642          */
2643 try_rotate:
2644         ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2645                                        dealloc, &restart_path);
2646         if (ret && ret != -EAGAIN) {
2647                 mlog_errno(ret);
2648                 goto out;
2649         }
2650
2651         while (ret == -EAGAIN) {
2652                 tmp_path = restart_path;
2653                 restart_path = NULL;
2654
2655                 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2656                                                tmp_path, dealloc,
2657                                                &restart_path);
2658                 if (ret && ret != -EAGAIN) {
2659                         mlog_errno(ret);
2660                         goto out;
2661                 }
2662
2663                 ocfs2_free_path(tmp_path);
2664                 tmp_path = NULL;
2665
2666                 if (ret == 0)
2667                         goto try_rotate;
2668         }
2669
2670 out:
2671         ocfs2_free_path(tmp_path);
2672         ocfs2_free_path(restart_path);
2673         return ret;
2674 }
2675
2676 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2677                                 int index)
2678 {
2679         struct ocfs2_extent_rec *rec = &el->l_recs[index];
2680         unsigned int size;
2681
2682         if (rec->e_leaf_clusters == 0) {
2683                 /*
2684                  * We consumed all of the merged-from record. An empty
2685                  * extent cannot exist anywhere but the 1st array
2686                  * position, so move things over if the merged-from
2687                  * record doesn't occupy that position.
2688                  *
2689                  * This creates a new empty extent so the caller
2690                  * should be smart enough to have removed any existing
2691                  * ones.
2692                  */
2693                 if (index > 0) {
2694                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2695                         size = index * sizeof(struct ocfs2_extent_rec);
2696                         memmove(&el->l_recs[1], &el->l_recs[0], size);
2697                 }
2698
2699                 /*
2700                  * Always memset - the caller doesn't check whether it
2701                  * created an empty extent, so there could be junk in
2702                  * the other fields.
2703                  */
2704                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2705         }
2706 }
2707
2708 static int ocfs2_get_right_path(struct inode *inode,
2709                                 struct ocfs2_path *left_path,
2710                                 struct ocfs2_path **ret_right_path)
2711 {
2712         int ret;
2713         u32 right_cpos;
2714         struct ocfs2_path *right_path = NULL;
2715         struct ocfs2_extent_list *left_el;
2716
2717         *ret_right_path = NULL;
2718
2719         /* This function shouldn't be called for non-trees. */
2720         BUG_ON(left_path->p_tree_depth == 0);
2721
2722         left_el = path_leaf_el(left_path);
2723         BUG_ON(left_el->l_next_free_rec != left_el->l_count);
2724
2725         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2726                                              &right_cpos);
2727         if (ret) {
2728                 mlog_errno(ret);
2729                 goto out;
2730         }
2731
2732         /* This function shouldn't be called for the rightmost leaf. */
2733         BUG_ON(right_cpos == 0);
2734
2735         right_path = ocfs2_new_path(path_root_bh(left_path),
2736                                     path_root_el(left_path));
2737         if (!right_path) {
2738                 ret = -ENOMEM;
2739                 mlog_errno(ret);
2740                 goto out;
2741         }
2742
2743         ret = ocfs2_find_path(inode, right_path, right_cpos);
2744         if (ret) {
2745                 mlog_errno(ret);
2746                 goto out;
2747         }
2748
2749         *ret_right_path = right_path;
2750 out:
2751         if (ret)
2752                 ocfs2_free_path(right_path);
2753         return ret;
2754 }
2755
2756 /*
2757  * Remove split_rec clusters from the record at index and merge them
2758  * onto the beginning of the record "next" to it.
2759  * For index < l_count - 1, the next means the extent rec at index + 1.
2760  * For index == l_count - 1, the "next" means the 1st extent rec of the
2761  * next extent block.
2762  */
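/*
 * For example (illustrative): merging a 4-cluster split_rec off the
 * tail of the record at index moves the boundary left by 4 clusters -
 * left_rec shrinks by 4 while right_rec's e_cpos and e_blkno are
 * pulled back by 4 clusters and its e_leaf_clusters grows by 4.
 */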
2763 static int ocfs2_merge_rec_right(struct inode *inode,
2764                                  struct ocfs2_path *left_path,
2765                                  handle_t *handle,
2766                                  struct ocfs2_extent_rec *split_rec,
2767                                  int index)
2768 {
2769         int ret, next_free, i;
2770         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2771         struct ocfs2_extent_rec *left_rec;
2772         struct ocfs2_extent_rec *right_rec;
2773         struct ocfs2_extent_list *right_el;
2774         struct ocfs2_path *right_path = NULL;
2775         int subtree_index = 0;
2776         struct ocfs2_extent_list *el = path_leaf_el(left_path);
2777         struct buffer_head *bh = path_leaf_bh(left_path);
2778         struct buffer_head *root_bh = NULL;
2779
2780         BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
2781         left_rec = &el->l_recs[index];
2782
2783         if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
2784             le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
2785                 /* We are dealing with a cross extent block merge. */
2786                 ret = ocfs2_get_right_path(inode, left_path, &right_path);
2787                 if (ret) {
2788                         mlog_errno(ret);
2789                         goto out;
2790                 }
2791
2792                 right_el = path_leaf_el(right_path);
2793                 next_free = le16_to_cpu(right_el->l_next_free_rec);
2794                 BUG_ON(next_free <= 0);
2795                 right_rec = &right_el->l_recs[0];
2796                 if (ocfs2_is_empty_extent(right_rec)) {
2797                         BUG_ON(next_free <= 1);
2798                         right_rec = &right_el->l_recs[1];
2799                 }
2800
2801                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
2802                        le16_to_cpu(left_rec->e_leaf_clusters) !=
2803                        le32_to_cpu(right_rec->e_cpos));
2804
2805                 subtree_index = ocfs2_find_subtree_root(inode,
2806                                                         left_path, right_path);
2807
2808                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
2809                                                       handle->h_buffer_credits,
2810                                                       right_path);
2811                 if (ret) {
2812                         mlog_errno(ret);
2813                         goto out;
2814                 }
2815
2816                 root_bh = left_path->p_node[subtree_index].bh;
2817                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2818
2819                 ret = ocfs2_journal_access(handle, inode, root_bh,
2820                                            OCFS2_JOURNAL_ACCESS_WRITE);
2821                 if (ret) {
2822                         mlog_errno(ret);
2823                         goto out;
2824                 }
2825
2826                 for (i = subtree_index + 1;
2827                      i < path_num_items(right_path); i++) {
2828                         ret = ocfs2_journal_access(handle, inode,
2829                                                    right_path->p_node[i].bh,
2830                                                    OCFS2_JOURNAL_ACCESS_WRITE);
2831                         if (ret) {
2832                                 mlog_errno(ret);
2833                                 goto out;
2834                         }
2835
2836                         ret = ocfs2_journal_access(handle, inode,
2837                                                    left_path->p_node[i].bh,
2838                                                    OCFS2_JOURNAL_ACCESS_WRITE);
2839                         if (ret) {
2840                                 mlog_errno(ret);
2841                                 goto out;
2842                         }
2843                 }
2844
2845         } else {
2846                 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
2847                 right_rec = &el->l_recs[index + 1];
2848         }
2849
2850         ret = ocfs2_journal_access(handle, inode, bh,
2851                                    OCFS2_JOURNAL_ACCESS_WRITE);
2852         if (ret) {
2853                 mlog_errno(ret);
2854                 goto out;
2855         }
2856
2857         le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
2858
2859         le32_add_cpu(&right_rec->e_cpos, -split_clusters);
2860         le64_add_cpu(&right_rec->e_blkno,
2861                      -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2862         le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
2863
2864         ocfs2_cleanup_merge(el, index);
2865
2866         ret = ocfs2_journal_dirty(handle, bh);
2867         if (ret)
2868                 mlog_errno(ret);
2869
2870         if (right_path) {
2871                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2872                 if (ret)
2873                         mlog_errno(ret);
2874
2875                 ocfs2_complete_edge_insert(inode, handle, left_path,
2876                                            right_path, subtree_index);
2877         }
2878 out:
2879         if (right_path)
2880                 ocfs2_free_path(right_path);
2881         return ret;
2882 }
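
/*
 * Editorial sketch, not part of alloc.c: the record arithmetic that
 * ocfs2_merge_rec_right() performs on the two leaf records, written
 * with plain host-order integers instead of the on-disk little-endian
 * fields.  struct demo_rec, demo_merge_right() and "bpc" (blocks per
 * cluster, standing in for ocfs2_clusters_to_blocks()) are hypothetical
 * names used only for illustration.
 */
#if 0
struct demo_rec {
	u32 cpos;	/* first logical cluster covered by the record */
	u64 blkno;	/* first physical block of the extent          */
	u16 clusters;	/* length of the extent in clusters            */
};

static void demo_merge_right(struct demo_rec *left, struct demo_rec *right,
			     u16 split_clusters, u32 bpc)
{
	/* Clusters leave the tail of the left record... */
	left->clusters -= split_clusters;

	/*
	 * ...and are grafted onto the head of the right record, so the
	 * right record's logical and physical start both move back by
	 * the same amount.  The BUG_ON() in the real function checks
	 * that the two records were adjacent to begin with.
	 */
	right->cpos -= split_clusters;
	right->blkno -= (u64)split_clusters * bpc;
	right->clusters += split_clusters;
}
#endif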
2883
2884 static int ocfs2_get_left_path(struct inode *inode,
2885                                struct ocfs2_path *right_path,
2886                                struct ocfs2_path **ret_left_path)
2887 {
2888         int ret;
2889         u32 left_cpos;
2890         struct ocfs2_path *left_path = NULL;
2891
2892         *ret_left_path = NULL;
2893
2894         /* This function shouldn't be called for non-trees. */
2895         BUG_ON(right_path->p_tree_depth == 0);
2896
2897         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
2898                                             right_path, &left_cpos);
2899         if (ret) {
2900                 mlog_errno(ret);
2901                 goto out;
2902         }
2903
2904         /* This function shouldn't be called for the leftmost leaf. */
2905         BUG_ON(left_cpos == 0);
2906
2907         left_path = ocfs2_new_path(path_root_bh(right_path),
2908                                    path_root_el(right_path));
2909         if (!left_path) {
2910                 ret = -ENOMEM;
2911                 mlog_errno(ret);
2912                 goto out;
2913         }
2914
2915         ret = ocfs2_find_path(inode, left_path, left_cpos);
2916         if (ret) {
2917                 mlog_errno(ret);
2918                 goto out;
2919         }
2920
2921         *ret_left_path = left_path;
2922 out:
2923         if (ret)
2924                 ocfs2_free_path(left_path);
2925         return ret;
2926 }
2927
2928 /*
2929  * Remove split_rec clusters from the record at index and merge them
2930  * onto the tail of the record "before" it.
2931  * For index > 0, the "before" means the extent rec at index - 1.
2932  *
2933  * For index == 0, the "before" means the last record of the previous
2934  * extent block. There is also a case where we may need to remove
2935  * the rightmost leaf extent block in the right_path and update
2936  * right_path to point at the new rightmost path.
2937  */
2938 static int ocfs2_merge_rec_left(struct inode *inode,
2939                                 struct ocfs2_path *right_path,
2940                                 handle_t *handle,
2941                                 struct ocfs2_extent_rec *split_rec,
2942                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
2943                                 int index)
2944 {
2945         int ret, i, subtree_index = 0, has_empty_extent = 0;
2946         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2947         struct ocfs2_extent_rec *left_rec;
2948         struct ocfs2_extent_rec *right_rec;
2949         struct ocfs2_extent_list *el = path_leaf_el(right_path);
2950         struct buffer_head *bh = path_leaf_bh(right_path);
2951         struct buffer_head *root_bh = NULL;
2952         struct ocfs2_path *left_path = NULL;
2953         struct ocfs2_extent_list *left_el;
2954
2955         BUG_ON(index < 0);
2956
2957         right_rec = &el->l_recs[index];
2958         if (index == 0) {
2959                 /* This merge crosses an extent block boundary. */
2960                 ret = ocfs2_get_left_path(inode, right_path, &left_path);
2961                 if (ret) {
2962                         mlog_errno(ret);
2963                         goto out;
2964                 }
2965
2966                 left_el = path_leaf_el(left_path);
2967                 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
2968                        le16_to_cpu(left_el->l_count));
2969
2970                 left_rec = &left_el->l_recs[
2971                                 le16_to_cpu(left_el->l_next_free_rec) - 1];
2972                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
2973                        le16_to_cpu(left_rec->e_leaf_clusters) !=
2974                        le32_to_cpu(split_rec->e_cpos));
2975
2976                 subtree_index = ocfs2_find_subtree_root(inode,
2977                                                         left_path, right_path);
2978
2979                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
2980                                                       handle->h_buffer_credits,
2981                                                       left_path);
2982                 if (ret) {
2983                         mlog_errno(ret);
2984                         goto out;
2985                 }
2986
2987                 root_bh = left_path->p_node[subtree_index].bh;
2988                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2989
2990                 ret = ocfs2_journal_access(handle, inode, root_bh,
2991                                            OCFS2_JOURNAL_ACCESS_WRITE);
2992                 if (ret) {
2993                         mlog_errno(ret);
2994                         goto out;
2995                 }
2996
2997                 for (i = subtree_index + 1;
2998                      i < path_num_items(right_path); i++) {
2999                         ret = ocfs2_journal_access(handle, inode,
3000                                                    right_path->p_node[i].bh,
3001                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3002                         if (ret) {
3003                                 mlog_errno(ret);
3004                                 goto out;
3005                         }
3006
3007                         ret = ocfs2_journal_access(handle, inode,
3008                                                    left_path->p_node[i].bh,
3009                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3010                         if (ret) {
3011                                 mlog_errno(ret);
3012                                 goto out;
3013                         }
3014                 }
3015         } else {
3016                 left_rec = &el->l_recs[index - 1];
3017                 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3018                         has_empty_extent = 1;
3019         }
3020
3021         ret = ocfs2_journal_access(handle, inode, bh,
3022                                    OCFS2_JOURNAL_ACCESS_WRITE);
3023         if (ret) {
3024                 mlog_errno(ret);
3025                 goto out;
3026         }
3027
3028         if (has_empty_extent && index == 1) {
3029                 /*
3030                  * The easy case - we can just plop the record right in.
3031                  */
3032                 *left_rec = *split_rec;
3033
3034                 has_empty_extent = 0;
3035         } else
3036                 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3037
3038         le32_add_cpu(&right_rec->e_cpos, split_clusters);
3039         le64_add_cpu(&right_rec->e_blkno,
3040                      ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3041         le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3042
3043         ocfs2_cleanup_merge(el, index);
3044
3045         ret = ocfs2_journal_dirty(handle, bh);
3046         if (ret)
3047                 mlog_errno(ret);
3048
3049         if (left_path) {
3050                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3051                 if (ret)
3052                         mlog_errno(ret);
3053
3054                 /*
3055                  * If right_rec is now empty and it was the only record in
3056                  * its extent block, ocfs2_complete_edge_insert() can't
3057                  * handle it and we need to delete the right extent block.
3058                  */
3059                 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3060                     le16_to_cpu(el->l_next_free_rec) == 1) {
3061
3062                         ret = ocfs2_remove_rightmost_path(inode, handle,
3063                                                           right_path, dealloc);
3064                         if (ret) {
3065                                 mlog_errno(ret);
3066                                 goto out;
3067                         }
3068
3069                         /* Now the rightmost extent block has been deleted.
3070                          * So we use the new rightmost path.
3071                          */
3072                         ocfs2_mv_path(right_path, left_path);
3073                         left_path = NULL;
3074                 } else
3075                         ocfs2_complete_edge_insert(inode, handle, left_path,
3076                                                    right_path, subtree_index);
3077         }
3078 out:
3079         if (left_path)
3080                 ocfs2_free_path(left_path);
3081         return ret;
3082 }
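
/*
 * Editorial sketch, not part of alloc.c: the mirror-image arithmetic of
 * ocfs2_merge_rec_left(), reusing the hypothetical struct demo_rec and
 * "bpc" from the sketch after ocfs2_merge_rec_right().  The only twist
 * is the shortcut for a leaf whose first record is the empty one: the
 * whole split record simply replaces that empty slot.
 */
#if 0
static void demo_merge_left(struct demo_rec *left, struct demo_rec *right,
			    const struct demo_rec *split, int left_is_empty,
			    u32 bpc)
{
	if (left_is_empty)
		*left = *split;		/* "plop the record right in" */
	else
		left->clusters += split->clusters;

	/* The right record shrinks from its head by the same amount. */
	right->cpos += split->clusters;
	right->blkno += (u64)split->clusters * bpc;
	right->clusters -= split->clusters;
}
#endif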
3083
3084 static int ocfs2_try_to_merge_extent(struct inode *inode,
3085                                      handle_t *handle,
3086                                      struct ocfs2_path *path,
3087                                      int split_index,
3088                                      struct ocfs2_extent_rec *split_rec,
3089                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
3090                                      struct ocfs2_merge_ctxt *ctxt)
3091
3092 {
3093         int ret = 0;
3094         struct ocfs2_extent_list *el = path_leaf_el(path);
3095         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3096
3097         BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3098
3099         if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3100                 /*
3101                  * The merge code will need to create an empty
3102                  * extent to take the place of the newly
3103                  * emptied slot. Remove any pre-existing empty
3104                  * extents - having more than one in a leaf is
3105                  * illegal.
3106                  */
3107                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3108                                              dealloc);
3109                 if (ret) {
3110                         mlog_errno(ret);
3111                         goto out;
3112                 }
3113                 split_index--;
3114                 rec = &el->l_recs[split_index];
3115         }
3116
3117         if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3118                 /*
3119                  * Left-right contig implies this.
3120                  */
3121                 BUG_ON(!ctxt->c_split_covers_rec);
3122
3123                 /*
3124                  * Since the leftright insert always covers the entire
3125                  * extent, this call will delete the insert record
3126                  * entirely, resulting in an empty extent record added to
3127                  * the extent block.
3128                  *
3129                  * Since the adding of an empty extent shifts
3130                  * everything back to the right, there's no need to
3131                  * update split_index here.
3132                  *
3133                  * When the split_index is zero, we need to merge the record
3134                  * into the previous extent block. It is more efficient and
3135                  * easier if we do merge_right first and merge_left later.
3136                  */
3137                 ret = ocfs2_merge_rec_right(inode, path,
3138                                             handle, split_rec,
3139                                             split_index);
3140                 if (ret) {
3141                         mlog_errno(ret);
3142                         goto out;
3143                 }
3144
3145                 /*
3146                  * We can only get this from logic error above.
3147                  */
3148                 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3149
3150                 /* The merge left us with an empty extent, remove it. */
3151                 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
3152                 if (ret) {
3153                         mlog_errno(ret);
3154                         goto out;
3155                 }
3156
3157                 rec = &el->l_recs[split_index];
3158
3159                 /*
3160                  * Note that we don't pass split_rec here on purpose -
3161                  * we've merged it into the rec already.
3162                  */
3163                 ret = ocfs2_merge_rec_left(inode, path,
3164                                            handle, rec,
3165                                            dealloc,
3166                                            split_index);
3167
3168                 if (ret) {
3169                         mlog_errno(ret);
3170                         goto out;
3171                 }
3172
3173                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3174                                              dealloc);
3175                 /*
3176                  * Error from this last rotate is not critical, so
3177                  * print but don't bubble it up.
3178                  */
3179                 if (ret)
3180                         mlog_errno(ret);
3181                 ret = 0;
3182         } else {
3183                 /*
3184                  * Merge a record to the left or right.
3185                  *
3186                  * 'contig_type' is relative to the existing record,
3187                  * so for example, if we're "right contig", it's to
3188                  * the record on the left (hence the left merge).
3189                  */
3190                 if (ctxt->c_contig_type == CONTIG_RIGHT) {
3191                         ret = ocfs2_merge_rec_left(inode,
3192                                                    path,
3193                                                    handle, split_rec,
3194                                                    dealloc,
3195                                                    split_index);
3196                         if (ret) {
3197                                 mlog_errno(ret);
3198                                 goto out;
3199                         }
3200                 } else {
3201                         ret = ocfs2_merge_rec_right(inode,
3202                                                     path,
3203                                                     handle, split_rec,
3204                                                     split_index);
3205                         if (ret) {
3206                                 mlog_errno(ret);
3207                                 goto out;
3208                         }
3209                 }
3210
3211                 if (ctxt->c_split_covers_rec) {
3212                         /*
3213                          * The merge may have left an empty extent in
3214                          * our leaf. Try to rotate it away.
3215                          */
3216                         ret = ocfs2_rotate_tree_left(inode, handle, path,
3217                                                      dealloc);
3218                         if (ret)
3219                                 mlog_errno(ret);
3220                         ret = 0;
3221                 }
3222         }
3223
3224 out:
3225         return ret;
3226 }
3227
3228 static void ocfs2_subtract_from_rec(struct super_block *sb,
3229                                     enum ocfs2_split_type split,
3230                                     struct ocfs2_extent_rec *rec,
3231                                     struct ocfs2_extent_rec *split_rec)
3232 {
3233         u64 len_blocks;
3234
3235         len_blocks = ocfs2_clusters_to_blocks(sb,
3236                                 le16_to_cpu(split_rec->e_leaf_clusters));
3237
3238         if (split == SPLIT_LEFT) {
3239                 /*
3240                  * Region is on the left edge of the existing
3241                  * record.
3242                  */
3243                 le32_add_cpu(&rec->e_cpos,
3244                              le16_to_cpu(split_rec->e_leaf_clusters));
3245                 le64_add_cpu(&rec->e_blkno, len_blocks);
3246                 le16_add_cpu(&rec->e_leaf_clusters,
3247                              -le16_to_cpu(split_rec->e_leaf_clusters));
3248         } else {
3249                 /*
3250                  * Region is on the right edge of the existing
3251                  * record.
3252                  */
3253                 le16_add_cpu(&rec->e_leaf_clusters,
3254                              -le16_to_cpu(split_rec->e_leaf_clusters));
3255         }
3256 }
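
/*
 * Editorial sketch, not part of alloc.c: ocfs2_subtract_from_rec()
 * boiled down to plain integers.  For example, with a record covering
 * clusters [100, 108), a 3-cluster SPLIT_LEFT leaves [103, 108) and
 * advances the physical start, while a 3-cluster SPLIT_RIGHT leaves
 * [100, 105) untouched at the front.  demo_subtract() and "bpc" are
 * hypothetical names used only for illustration.
 */
#if 0
static void demo_subtract(u32 *cpos, u64 *blkno, u16 *clusters,
			  u16 split_clusters, int split_left, u32 bpc)
{
	if (split_left) {
		/* The split region comes off the record's left edge. */
		*cpos += split_clusters;
		*blkno += (u64)split_clusters * bpc;
	}
	/* Either way the record gets shorter. */
	*clusters -= split_clusters;
}
#endif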
3257
3258 /*
3259  * Do the final bits of extent record insertion at the target leaf
3260  * list. If this leaf is part of an allocation tree, it is assumed
3261  * that the tree above has been prepared.
3262  */
3263 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3264                                  struct ocfs2_extent_list *el,
3265                                  struct ocfs2_insert_type *insert,
3266                                  struct inode *inode)
3267 {
3268         int i = insert->ins_contig_index;
3269         unsigned int range;
3270         struct ocfs2_extent_rec *rec;
3271
3272         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3273
3274         if (insert->ins_split != SPLIT_NONE) {
3275                 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3276                 BUG_ON(i == -1);
3277                 rec = &el->l_recs[i];
3278                 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3279                                         insert_rec);
3280                 goto rotate;
3281         }
3282
3283         /*
3284          * Contiguous insert - either left or right.
3285          */
3286         if (insert->ins_contig != CONTIG_NONE) {
3287                 rec = &el->l_recs[i];
3288                 if (insert->ins_contig == CONTIG_LEFT) {
3289                         rec->e_blkno = insert_rec->e_blkno;
3290                         rec->e_cpos = insert_rec->e_cpos;
3291                 }
3292                 le16_add_cpu(&rec->e_leaf_clusters,
3293                              le16_to_cpu(insert_rec->e_leaf_clusters));
3294                 return;
3295         }
3296
3297         /*
3298          * Handle insert into an empty leaf.
3299          */
3300         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3301             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3302              ocfs2_is_empty_extent(&el->l_recs[0]))) {
3303                 el->l_recs[0] = *insert_rec;
3304                 el->l_next_free_rec = cpu_to_le16(1);
3305                 return;
3306         }
3307
3308         /*
3309          * Appending insert.
3310          */
3311         if (insert->ins_appending == APPEND_TAIL) {
3312                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3313                 rec = &el->l_recs[i];
3314                 range = le32_to_cpu(rec->e_cpos)
3315                         + le16_to_cpu(rec->e_leaf_clusters);
3316                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3317
3318                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3319                                 le16_to_cpu(el->l_count),
3320                                 "inode %lu, depth %u, count %u, next free %u, "
3321                                 "rec.cpos %u, rec.clusters %u, "
3322                                 "insert.cpos %u, insert.clusters %u\n",
3323                                 inode->i_ino,
3324                                 le16_to_cpu(el->l_tree_depth),
3325                                 le16_to_cpu(el->l_count),
3326                                 le16_to_cpu(el->l_next_free_rec),
3327                                 le32_to_cpu(el->l_recs[i].e_cpos),
3328                                 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3329                                 le32_to_cpu(insert_rec->e_cpos),
3330                                 le16_to_cpu(insert_rec->e_leaf_clusters));
3331                 i++;
3332                 el->l_recs[i] = *insert_rec;
3333                 le16_add_cpu(&el->l_next_free_rec, 1);
3334                 return;
3335         }
3336
3337 rotate:
3338         /*
3339          * Ok, we have to rotate.
3340          *
3341          * At this point, it is safe to assume that inserting into an
3342          * empty leaf and appending to a leaf have both been handled
3343          * above.
3344          *
3345          * This leaf needs to have space, either by the empty 1st
3346          * extent record, or by virtue of l_next_free_rec < l_count.
3347          */
3348         ocfs2_rotate_leaf(el, insert_rec);
3349 }
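
/*
 * Editorial sketch, not part of alloc.c: the contiguous-insert case of
 * ocfs2_insert_at_leaf() in isolation, reusing the hypothetical
 * struct demo_rec from the sketch after ocfs2_merge_rec_right().  A
 * CONTIG_LEFT insert sits immediately before the existing record, so
 * the record inherits the new logical and physical start; in either
 * direction the lengths simply add.
 */
#if 0
static void demo_absorb_contig(struct demo_rec *rec,
			       const struct demo_rec *ins, int contig_left)
{
	if (contig_left) {
		rec->cpos = ins->cpos;
		rec->blkno = ins->blkno;
	}
	rec->clusters += ins->clusters;
}
#endif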
3350
3351 static inline void ocfs2_update_dinode_clusters(struct inode *inode,
3352                                                 struct ocfs2_dinode *di,
3353                                                 u32 clusters)
3354 {
3355         le32_add_cpu(&di->i_clusters, clusters);
3356         spin_lock(&OCFS2_I(inode)->ip_lock);
3357         OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
3358         spin_unlock(&OCFS2_I(inode)->ip_lock);
3359 }
3360
3361 static void ocfs2_adjust_rightmost_records(struct inode *inode,
3362                                            handle_t *handle,
3363                                            struct ocfs2_path *path,
3364                                            struct ocfs2_extent_rec *insert_rec)
3365 {
3366         int ret, i, next_free;
3367         struct buffer_head *bh;
3368         struct ocfs2_extent_list *el;
3369         struct ocfs2_extent_rec *rec;
3370
3371         /*
3372          * Update everything except the leaf block.
3373          */
3374         for (i = 0; i < path->p_tree_depth; i++) {
3375                 bh = path->p_node[i].bh;
3376                 el = path->p_node[i].el;
3377
3378                 next_free = le16_to_cpu(el->l_next_free_rec);
3379                 if (next_free == 0) {
3380                         ocfs2_error(inode->i_sb,
3381                                     "Dinode %llu has a bad extent list",
3382                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
3383                         ret = -EIO;
3384                         return;
3385                 }
3386
3387                 rec = &el->l_recs[next_free - 1];
3388
3389                 rec->e_int_clusters = insert_rec->e_cpos;
3390                 le32_add_cpu(&rec->e_int_clusters,
3391                              le16_to_cpu(insert_rec->e_leaf_clusters));
3392                 le32_add_cpu(&rec->e_int_clusters,
3393                              -le32_to_cpu(rec->e_cpos));
3394
3395                 ret = ocfs2_journal_dirty(handle, bh);
3396                 if (ret)
3397                         mlog_errno(ret);
3398
3399         }
3400 }
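
/*
 * Editorial sketch, not part of alloc.c: interior (non-leaf) records
 * track their span in e_int_clusters rather than e_leaf_clusters, so
 * after an appending insert the rightmost interior record at each level
 * must be stretched to cover the new logical end.  The helper name is
 * hypothetical; the arithmetic matches the three add/sub operations in
 * ocfs2_adjust_rightmost_records() above.
 */
#if 0
static u32 demo_new_int_clusters(u32 rec_cpos, u32 ins_cpos, u16 ins_clusters)
{
	/* new span = (end of the inserted extent) - (record's own cpos) */
	return ins_cpos + ins_clusters - rec_cpos;
}
#endif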
3401
3402 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3403                                     struct ocfs2_extent_rec *insert_rec,
3404                                     struct ocfs2_path *right_path,
3405                                     struct ocfs2_path **ret_left_path)
3406 {
3407         int ret, next_free;
3408         struct ocfs2_extent_list *el;
3409         struct ocfs2_path *left_path = NULL;
3410
3411         *ret_left_path = NULL;
3412
3413         /*
3414          * This shouldn't happen for non-trees. The extent rec cluster
3415          * count manipulation below only works for interior nodes.
3416          */
3417         BUG_ON(right_path->p_tree_depth == 0);
3418
3419         /*
3420          * If our appending insert is at the leftmost edge of a leaf,
3421          * then we might need to update the rightmost records of the
3422          * neighboring path.
3423          */
3424         el = path_leaf_el(right_path);
3425         next_free = le16_to_cpu(el->l_next_free_rec);
3426         if (next_free == 0 ||
3427             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3428                 u32 left_cpos;
3429
3430                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
3431                                                     &left_cpos);
3432                 if (ret) {
3433                         mlog_errno(ret);
3434                         goto out;
3435                 }
3436
3437                 mlog(0, "Append may need a left path update. cpos: %u, "
3438                      "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
3439                      left_cpos);
3440
3441                 /*
3442                  * No need to worry if the append is already in the
3443                  * leftmost leaf.
3444                  */
3445                 if (left_cpos) {
3446                         left_path = ocfs2_new_path(path_root_bh(right_path),
3447                                                    path_root_el(right_path));
3448                         if (!left_path) {
3449                                 ret = -ENOMEM;
3450                                 mlog_errno(ret);
3451                                 goto out;
3452                         }
3453
3454                         ret = ocfs2_find_path(inode, left_path, left_cpos);
3455                         if (ret) {
3456                                 mlog_errno(ret);
3457                                 goto out;
3458                         }
3459
3460                         /*
3461                          * ocfs2_insert_path() will pass the left_path to the
3462                          * journal for us.
3463                          */
3464                 }
3465         }
3466
3467         ret = ocfs2_journal_access_path(inode, handle, right_path);
3468         if (ret) {
3469                 mlog_errno(ret);
3470                 goto out;
3471         }
3472
3473         ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
3474
3475         *ret_left_path = left_path;
3476         ret = 0;
3477 out:
3478         if (ret != 0)
3479                 ocfs2_free_path(left_path);
3480
3481         return ret;
3482 }
3483
3484 static void ocfs2_split_record(struct inode *inode,
3485                                struct ocfs2_path *left_path,
3486                                struct ocfs2_path *right_path,
3487                                struct ocfs2_extent_rec *split_rec,
3488                                enum ocfs2_split_type split)
3489 {
3490         int index;
3491         u32 cpos = le32_to_cpu(split_rec->e_cpos);
3492         struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3493         struct ocfs2_extent_rec *rec, *tmprec;
3494
3495         right_el = path_leaf_el(right_path);
3496         if (left_path)
3497                 left_el = path_leaf_el(left_path);
3498
3499         el = right_el;
3500         insert_el = right_el;
3501         index = ocfs2_search_extent_list(el, cpos);
3502         if (index != -1) {
3503                 if (index == 0 && left_path) {
3504                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3505
3506                         /*
3507                          * This typically means that the record
3508                          * started in the left path but moved to the
3509                          * right as a result of rotation. We either
3510                          * move the existing record to the left, or we
3511                          * do the later insert there.
3512                          *
3513                          * In this case, the left path should always
3514                          * exist as the rotate code will have passed
3515                          * it back for a post-insert update.
3516                          */
3517
3518                         if (split == SPLIT_LEFT) {
3519                                 /*
3520                                  * It's a left split. Since we know
3521                                  * that the rotate code gave us an
3522                                  * empty extent in the left path, we
3523                                  * can just do the insert there.
3524                                  */
3525                                 insert_el = left_el;
3526                         } else {
3527                                 /*
3528                                  * Right split - we have to move the
3529                                  * existing record over to the left
3530                                  * leaf. The insert will be into the
3531                                  * newly created empty extent in the
3532                                  * right leaf.
3533                                  */
3534                                 tmprec = &right_el->l_recs[index];
3535                                 ocfs2_rotate_leaf(left_el, tmprec);
3536                                 el = left_el;
3537
3538                                 memset(tmprec, 0, sizeof(*tmprec));
3539                                 index = ocfs2_search_extent_list(left_el, cpos);
3540                                 BUG_ON(index == -1);
3541                         }
3542                 }
3543         } else {
3544                 BUG_ON(!left_path);
3545                 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3546                 /*
3547                  * Left path is easy - we can just allow the insert to
3548                  * happen.
3549                  */
3550                 el = left_el;
3551                 insert_el = left_el;
3552                 index = ocfs2_search_extent_list(el, cpos);
3553                 BUG_ON(index == -1);
3554         }
3555
3556         rec = &el->l_recs[index];
3557         ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3558         ocfs2_rotate_leaf(insert_el, split_rec);
3559 }
3560
3561 /*
3562  * This function only does inserts on an allocation b-tree. For dinode
3563  * lists, ocfs2_insert_at_leaf() is called directly.
3564  *
3565  * right_path is the path we want to do the actual insert
3566  * in. left_path should only be passed in if we need to update that
3567  * portion of the tree after an edge insert.
3568  */
3569 static int ocfs2_insert_path(struct inode *inode,
3570                              handle_t *handle,
3571                              struct ocfs2_path *left_path,
3572                              struct ocfs2_path *right_path,
3573                              struct ocfs2_extent_rec *insert_rec,
3574                              struct ocfs2_insert_type *insert)
3575 {
3576         int ret, subtree_index;
3577         struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3578
3579         if (left_path) {
3580                 int credits = handle->h_buffer_credits;
3581
3582                 /*
3583                  * There's a chance that left_path got passed back to
3584                  * us without being accounted for in the
3585                  * journal. Extend our transaction here to be sure we
3586                  * can change those blocks.
3587                  */
3588                 credits += left_path->p_tree_depth;
3589
3590                 ret = ocfs2_extend_trans(handle, credits);
3591                 if (ret < 0) {
3592                         mlog_errno(ret);
3593                         goto out;
3594                 }
3595
3596                 ret = ocfs2_journal_access_path(inode, handle, left_path);
3597                 if (ret < 0) {
3598                         mlog_errno(ret);
3599                         goto out;
3600                 }
3601         }
3602
3603         /*
3604          * Pass both paths to the journal. The majority of inserts
3605          * will be touching all components anyway.
3606          */
3607         ret = ocfs2_journal_access_path(inode, handle, right_path);
3608         if (ret < 0) {
3609                 mlog_errno(ret);
3610                 goto out;
3611         }
3612
3613         if (insert->ins_split != SPLIT_NONE) {
3614                 /*
3615                  * We could call ocfs2_insert_at_leaf() for some types
3616                  * of splits, but it's easier to just let one separate
3617                  * function sort it all out.
3618                  */
3619                 ocfs2_split_record(inode, left_path, right_path,
3620                                    insert_rec, insert->ins_split);
3621
3622                 /*
3623                  * Split might have modified either leaf and we don't
3624                  * have a guarantee that the later edge insert will
3625                  * dirty this for us.
3626                  */
3627                 if (left_path)
3628                         ret = ocfs2_journal_dirty(handle,
3629                                                   path_leaf_bh(left_path));
3630                 if (ret)
3631                         mlog_errno(ret);
3632         } else
3633                 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3634                                      insert, inode);
3635
3636         ret = ocfs2_journal_dirty(handle, leaf_bh);
3637         if (ret)
3638                 mlog_errno(ret);
3639
3640         if (left_path) {
3641                 /*
3642                  * The rotate code has indicated that we need to fix
3643                  * up portions of the tree after the insert.
3644                  *
3645                  * XXX: Should we extend the transaction here?
3646                  */
3647                 subtree_index = ocfs2_find_subtree_root(inode, left_path,
3648                                                         right_path);
3649                 ocfs2_complete_edge_insert(inode, handle, left_path,
3650                                            right_path, subtree_index);
3651         }
3652
3653         ret = 0;
3654 out:
3655         return ret;
3656 }
3657
3658 static int ocfs2_do_insert_extent(struct inode *inode,
3659                                   handle_t *handle,
3660                                   struct buffer_head *di_bh,
3661                                   struct ocfs2_extent_rec *insert_rec,
3662                                   struct ocfs2_insert_type *type)
3663 {
3664         int ret, rotate = 0;
3665         u32 cpos;
3666         struct ocfs2_path *right_path = NULL;
3667         struct ocfs2_path *left_path = NULL;
3668         struct ocfs2_dinode *di;
3669         struct ocfs2_extent_list *el;
3670
3671         di = (struct ocfs2_dinode *) di_bh->b_data;
3672         el = &di->id2.i_list;
3673
3674         ret = ocfs2_journal_access(handle, inode, di_bh,
3675                                    OCFS2_JOURNAL_ACCESS_WRITE);
3676         if (ret) {
3677                 mlog_errno(ret);
3678                 goto out;
3679         }
3680
3681         if (le16_to_cpu(el->l_tree_depth) == 0) {
3682                 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
3683                 goto out_update_clusters;
3684         }
3685
3686         right_path = ocfs2_new_inode_path(di_bh);
3687         if (!right_path) {
3688                 ret = -ENOMEM;
3689                 mlog_errno(ret);
3690                 goto out;
3691         }
3692
3693         /*
3694          * Determine the path to start with. Rotations need the
3695          * rightmost path, everything else can go directly to the
3696          * target leaf.
3697          */
3698         cpos = le32_to_cpu(insert_rec->e_cpos);
3699         if (type->ins_appending == APPEND_NONE &&
3700             type->ins_contig == CONTIG_NONE) {
3701                 rotate = 1;
3702                 cpos = UINT_MAX;
3703         }
3704
3705         ret = ocfs2_find_path(inode, right_path, cpos);
3706         if (ret) {
3707                 mlog_errno(ret);
3708                 goto out;
3709         }
3710
3711         /*
3712          * Rotations and appends need special treatment - they modify
3713  * parts of the tree above them.
3714  *
3715  * Both might pass back a path immediately to the left of the
3716  * one being inserted into. This will cause
3717          * ocfs2_insert_path() to modify the rightmost records of
3718          * left_path to account for an edge insert.
3719          *
3720          * XXX: When modifying this code, keep in mind that an insert
3721          * can wind up skipping both of these two special cases...
3722          */
3723         if (rotate) {
3724                 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
3725                                               le32_to_cpu(insert_rec->e_cpos),
3726                                               right_path, &left_path);
3727                 if (ret) {
3728                         mlog_errno(ret);
3729                         goto out;
3730                 }
3731
3732                 /*
3733                  * ocfs2_rotate_tree_right() might have extended the
3734                  * transaction without re-journaling our tree root.
3735                  */
3736                 ret = ocfs2_journal_access(handle, inode, di_bh,
3737                                            OCFS2_JOURNAL_ACCESS_WRITE);
3738                 if (ret) {
3739                         mlog_errno(ret);
3740                         goto out;
3741                 }
3742         } else if (type->ins_appending == APPEND_TAIL
3743                    && type->ins_contig != CONTIG_LEFT) {
3744                 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
3745                                                right_path, &left_path);
3746                 if (ret) {
3747                         mlog_errno(ret);
3748                         goto out;
3749                 }
3750         }
3751
3752         ret = ocfs2_insert_path(inode, handle, left_path, right_path,
3753                                 insert_rec, type);
3754         if (ret) {
3755                 mlog_errno(ret);
3756                 goto out;
3757         }
3758
3759 out_update_clusters:
3760         if (type->ins_split == SPLIT_NONE)
3761                 ocfs2_update_dinode_clusters(inode, di,
3762                                              le16_to_cpu(insert_rec->e_leaf_clusters));
3763
3764         ret = ocfs2_journal_dirty(handle, di_bh);
3765         if (ret)
3766                 mlog_errno(ret);
3767
3768 out:
3769         ocfs2_free_path(left_path);
3770         ocfs2_free_path(right_path);
3771
3772         return ret;
3773 }
3774
3775 static enum ocfs2_contig_type
3776 ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
3777                                struct ocfs2_extent_list *el, int index,
3778                                struct ocfs2_extent_rec *split_rec)
3779 {
3780         int status;
3781         enum ocfs2_contig_type ret = CONTIG_NONE;
3782         u32 left_cpos, right_cpos;
3783         struct ocfs2_extent_rec *rec = NULL;
3784         struct ocfs2_extent_list *new_el;
3785         struct ocfs2_path *left_path = NULL, *right_path = NULL;
3786         struct buffer_head *bh;
3787         struct ocfs2_extent_block *eb;
3788
3789         if (index > 0) {
3790                 rec = &el->l_recs[index - 1];
3791         } else if (path->p_tree_depth > 0) {
3792                 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3793                                                        path, &left_cpos);
3794                 if (status)
3795                         goto out;
3796
3797                 if (left_cpos != 0) {
3798                         left_path = ocfs2_new_path(path_root_bh(path),
3799                                                    path_root_el(path));
3800                         if (!left_path)
3801                                 goto out;
3802
3803                         status = ocfs2_find_path(inode, left_path, left_cpos);
3804                         if (status)
3805                                 goto out;
3806
3807                         new_el = path_leaf_el(left_path);
3808
3809                         if (le16_to_cpu(new_el->l_next_free_rec) !=
3810                             le16_to_cpu(new_el->l_count)) {
3811                                 bh = path_leaf_bh(left_path);
3812                                 eb = (struct ocfs2_extent_block *)bh->b_data;
3813                                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
3814                                                                  eb);
3815                                 goto out;
3816                         }
3817                         rec = &new_el->l_recs[
3818                                 le16_to_cpu(new_el->l_next_free_rec) - 1];
3819                 }
3820         }
3821
3822         /*
3823          * We're careful to check for an empty extent record here -
3824          * the merge code will know what to do if it sees one.
3825          */
3826         if (rec) {
3827                 if (index == 1 && ocfs2_is_empty_extent(rec)) {
3828                         if (split_rec->e_cpos == el->l_recs[index].e_cpos)
3829                                 ret = CONTIG_RIGHT;
3830                 } else {
3831                         ret = ocfs2_extent_contig(inode, rec, split_rec);
3832                 }
3833         }
3834
3835         rec = NULL;
3836         if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
3837                 rec = &el->l_recs[index + 1];
3838         else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
3839                  path->p_tree_depth > 0) {
3840                 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
3841                                                         path, &right_cpos);
3842                 if (status)
3843                         goto out;
3844
3845                 if (right_cpos == 0)
3846                         goto out;
3847
3848                 right_path = ocfs2_new_path(path_root_bh(path),
3849                                             path_root_el(path));
3850                 if (!right_path)
3851                         goto out;
3852
3853                 status = ocfs2_find_path(inode, right_path, right_cpos);
3854                 if (status)
3855                         goto out;
3856
3857                 new_el = path_leaf_el(right_path);
3858                 rec = &new_el->l_recs[0];
3859                 if (ocfs2_is_empty_extent(rec)) {
3860                         if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
3861                                 bh = path_leaf_bh(right_path);
3862                                 eb = (struct ocfs2_extent_block *)bh->b_data;
3863                                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
3864                                                                  eb);
3865                                 goto out;
3866                         }
3867                         rec = &new_el->l_recs[1];
3868                 }
3869         }
3870
3871         if (rec) {
3872                 enum ocfs2_contig_type contig_type;
3873
3874                 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
3875
3876                 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
3877                         ret = CONTIG_LEFTRIGHT;
3878                 else if (ret == CONTIG_NONE)
3879                         ret = contig_type;
3880         }
3881
3882 out:
3883         if (left_path)
3884                 ocfs2_free_path(left_path);
3885         if (right_path)
3886                 ocfs2_free_path(right_path);
3887
3888         return ret;
3889 }
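
/*
 * Editorial sketch, not part of alloc.c: how the two neighbour checks
 * in ocfs2_figure_merge_contig_type() combine.  Contiguity with the
 * record before split_rec comes back as CONTIG_RIGHT (split_rec lies to
 * its right), contiguity with the record after comes back as
 * CONTIG_LEFT; when both hold we report CONTIG_LEFTRIGHT.  The enum
 * values are the real ones, demo_combine() is hypothetical.
 */
#if 0
static enum ocfs2_contig_type
demo_combine(enum ocfs2_contig_type before, enum ocfs2_contig_type after)
{
	if (before == CONTIG_RIGHT && after == CONTIG_LEFT)
		return CONTIG_LEFTRIGHT;
	return (before != CONTIG_NONE) ? before : after;
}
#endif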
3890
3891 static void ocfs2_figure_contig_type(struct inode *inode,
3892                                      struct ocfs2_insert_type *insert,
3893                                      struct ocfs2_extent_list *el,
3894                                      struct ocfs2_extent_rec *insert_rec)
3895 {
3896         int i;
3897         enum ocfs2_contig_type contig_type = CONTIG_NONE;
3898
3899         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3900
3901         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
3902                 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
3903                                                   insert_rec);
3904                 if (contig_type != CONTIG_NONE) {
3905                         insert->ins_contig_index = i;
3906                         break;
3907                 }
3908         }
3909         insert->ins_contig = contig_type;
3910 }
3911
3912 /*
3913  * This should only be called against the rightmost leaf extent list.
3914  *
3915  * ocfs2_figure_appending_type() will figure out whether we'll have to
3916  * insert at the tail of the rightmost leaf.
3917  *
3918  * This should also work against the dinode list for trees with 0
3919  * depth. If we consider the dinode list to be the rightmost leaf node
3920  * then the logic here makes sense.
3921  */
3922 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
3923                                         struct ocfs2_extent_list *el,
3924                                         struct ocfs2_extent_rec *insert_rec)
3925 {
3926         int i;
3927         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
3928         struct ocfs2_extent_rec *rec;
3929
3930         insert->ins_appending = APPEND_NONE;
3931
3932         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3933
3934         if (!el->l_next_free_rec)
3935                 goto set_tail_append;
3936
3937         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
3938                 /* Were all records empty? */
3939                 if (le16_to_cpu(el->l_next_free_rec) == 1)
3940                         goto set_tail_append;
3941         }
3942
3943         i = le16_to_cpu(el->l_next_free_rec) - 1;
3944         rec = &el->l_recs[i];
3945
3946         if (cpos >=
3947             (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
3948                 goto set_tail_append;
3949
3950         return;
3951
3952 set_tail_append:
3953         insert->ins_appending = APPEND_TAIL;
3954 }
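
/*
 * Editorial sketch, not part of alloc.c: the core of the test above.
 * Besides the empty-leaf special cases, an insert is classified as a
 * tail append when it starts at or past the end of the last used record
 * in the rightmost leaf.  demo_is_tail_append() is a hypothetical name
 * used only for illustration.
 */
#if 0
static int demo_is_tail_append(u32 ins_cpos, u32 last_cpos, u16 last_clusters)
{
	return ins_cpos >= last_cpos + last_clusters;
}
#endif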
3955
3956 /*
3957  * Helper function called at the beginning of an insert.
3958  *
3959  * This computes a few things that are commonly used in the process of
3960  * inserting into the btree:
3961  *   - Whether the new extent is contiguous with an existing one.
3962  *   - The current tree depth.
3963  *   - Whether the insert is an appending one.
3964  *   - The total # of free records in the tree.
3965  *
3966  * All of the information is stored on the ocfs2_insert_type
3967  * structure.
3968  */
3969 static int ocfs2_figure_insert_type(struct inode *inode,
3970                                     struct buffer_head *di_bh,
3971                                     struct buffer_head **last_eb_bh,
3972                                     struct ocfs2_extent_rec *insert_rec,
3973                                     int *free_records,
3974                                     struct ocfs2_insert_type *insert)
3975 {
3976         int ret;
3977         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3978         struct ocfs2_extent_block *eb;
3979         struct ocfs2_extent_list *el;
3980         struct ocfs2_path *path = NULL;
3981         struct buffer_head *bh = NULL;
3982
3983         insert->ins_split = SPLIT_NONE;
3984
3985         el = &di->id2.i_list;
3986         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
3987
3988         if (el->l_tree_depth) {
3989                 /*
3990                  * If we have tree depth, we read in the
3991                  * rightmost extent block ahead of time as
3992                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
3993                  * may want it later.
3994                  */
3995                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
3996                                        le64_to_cpu(di->i_last_eb_blk), &bh,
3997                                        OCFS2_BH_CACHED, inode);
3998                 if (ret) {
3999                         mlog_errno(ret);
4000                         goto out;
4001                 }
4002                 eb = (struct ocfs2_extent_block *) bh->b_data;
4003                 el = &eb->h_list;
4004         }
4005
4006         /*
4007          * Unless we have a contiguous insert, we'll need to know if
4008          * there is room left in our allocation tree for another
4009          * extent record.
4010          *
4011          * XXX: This test is simplistic, we can search for empty
4012          * extent records too.
4013          */
4014         *free_records = le16_to_cpu(el->l_count) -
4015                 le16_to_cpu(el->l_next_free_rec);
4016
4017         if (!insert->ins_tree_depth) {
4018                 ocfs2_figure_contig_type(inode, insert, el, insert_rec);
4019                 ocfs2_figure_appending_type(insert, el, insert_rec);
4020                 return 0;
4021         }
4022
4023         path = ocfs2_new_inode_path(di_bh);
4024         if (!path) {
4025                 ret = -ENOMEM;
4026                 mlog_errno(ret);
4027                 goto out;
4028         }
4029
4030         /*
4031          * In the case that we're inserting past what the tree
4032          * currently accounts for, ocfs2_find_path() will return for
4033          * us the rightmost tree path. This is accounted for below in
4034          * the appending code.
4035          */
4036         ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
4037         if (ret) {
4038                 mlog_errno(ret);
4039                 goto out;
4040         }
4041
4042         el = path_leaf_el(path);
4043
4044         /*
4045          * Now that we have the path, there's two things we want to determine:
4046          * 1) Contiguousness (also set contig_index if this is so)
4047          *
4048          * 2) Are we doing an append? We can trivially break this up
4049          *     into two types of appends: simple record append, or a
4050          *     rotate inside the tail leaf.
4051          */
4052         ocfs2_figure_contig_type(inode, insert, el, insert_rec);
4053
4054         /*
4055          * The insert code isn't quite ready to deal with all cases of
4056          * left contiguousness. Specifically, if it's an insert into
4057          * the 1st record in a leaf, it will require the adjustment of
4058  * cluster count on the last record of the path directly to its
4059          * left. For now, just catch that case and fool the layers
4060          * above us. This works just fine for tree_depth == 0, which
4061          * is why we allow that above.
4062          */
4063         if (insert->ins_contig == CONTIG_LEFT &&
4064             insert->ins_contig_index == 0)
4065                 insert->ins_contig = CONTIG_NONE;
4066
4067         /*
4068          * Ok, so we can simply compare against last_eb to figure out
4069          * whether the path doesn't exist. This will only happen in
4070          * the case that we're doing a tail append, so maybe we can
4071          * take advantage of that information somehow.
4072          */
4073         if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
4074                 /*
4075                  * Ok, ocfs2_find_path() returned us the rightmost
4076                  * tree path. This might be an appending insert. There are
4077                  * two cases:
4078                  *    1) We're doing a true append at the tail:
4079                  *      -This might even be off the end of the leaf
4080                  *    2) We're "appending" by rotating in the tail
4081                  */
4082                 ocfs2_figure_appending_type(insert, el, insert_rec);
4083         }
4084
4085 out:
4086         ocfs2_free_path(path);
4087
4088         if (ret == 0)
4089                 *last_eb_bh = bh;
4090         else
4091                 brelse(bh);
4092         return ret;
4093 }
4094
4095 /*
4096  * Insert an extent into an inode btree.
4097  *
4098  * The caller needs to update fe->i_clusters
4099  */
4100 int ocfs2_insert_extent(struct ocfs2_super *osb,
4101                         handle_t *handle,
4102                         struct inode *inode,
4103                         struct buffer_head *fe_bh,
4104                         u32 cpos,
4105                         u64 start_blk,
4106                         u32 new_clusters,
4107                         u8 flags,
4108                         struct ocfs2_alloc_context *meta_ac)
4109 {
4110         int status;
4111         int uninitialized_var(free_records);
4112         struct buffer_head *last_eb_bh = NULL;
4113         struct ocfs2_insert_type insert = {0, };
4114         struct ocfs2_extent_rec rec;
4115
4116         BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
4117
4118         mlog(0, "add %u clusters at position %u to inode %llu\n",
4119              new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4120
4121         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
4122                         (OCFS2_I(inode)->ip_clusters != cpos),
4123                         "Device %s, asking for sparse allocation: inode %llu, "
4124                         "cpos %u, clusters %u\n",
4125                         osb->dev_str,
4126                         (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
4127                         OCFS2_I(inode)->ip_clusters);
4128
4129         memset(&rec, 0, sizeof(rec));
4130         rec.e_cpos = cpu_to_le32(cpos);
4131         rec.e_blkno = cpu_to_le64(start_blk);
4132         rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4133         rec.e_flags = flags;
4134
4135         status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
4136                                           &free_records, &insert);
4137         if (status < 0) {
4138                 mlog_errno(status);
4139                 goto bail;
4140         }
4141
4142         mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4143              "Insert.contig_index: %d, Insert.free_records: %d, "
4144              "Insert.tree_depth: %d\n",
4145              insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4146              free_records, insert.ins_tree_depth);
4147
4148         if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4149                 status = ocfs2_grow_tree(inode, handle, fe_bh,
4150                                          &insert.ins_tree_depth, &last_eb_bh,
4151                                          meta_ac);
4152                 if (status) {
4153                         mlog_errno(status);
4154                         goto bail;
4155                 }
4156         }
4157
4158         /* Finally, we can add clusters. This might rotate the tree for us. */
4159         status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
4160         if (status < 0)
4161                 mlog_errno(status);
4162         else
4163                 ocfs2_extent_map_insert_rec(inode, &rec);
4164
4165 bail:
4166         if (last_eb_bh)
4167                 brelse(last_eb_bh);
4168
4169         mlog_exit(status);
4170         return status;
4171 }
4172
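/*
 * Minimal caller sketch (illustrative only, not part of this file),
 * loosely modelled on the file-extend path: example_append_clusters and
 * its parameter names are made up.  It assumes the caller holds the
 * inode's cluster lock, has called ocfs2_journal_access() on fe_bh, and
 * has reserved meta_ac; error logging is trimmed for brevity.
 */
static int example_append_clusters(struct ocfs2_super *osb,
                                   handle_t *handle,
                                   struct inode *inode,
                                   struct buffer_head *fe_bh,
                                   u32 bit_off, u32 num_bits, u8 flags,
                                   struct ocfs2_alloc_context *meta_ac)
{
        int status;
        u64 block = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
        struct ocfs2_dinode *fe = (struct ocfs2_dinode *)fe_bh->b_data;

        /* Append the newly claimed clusters at the current end. */
        status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
                                     le32_to_cpu(fe->i_clusters), block,
                                     num_bits, flags, meta_ac);
        if (status < 0)
                return status;

        /* ocfs2_insert_extent() leaves i_clusters to the caller. */
        le32_add_cpu(&fe->i_clusters, num_bits);
        spin_lock(&OCFS2_I(inode)->ip_lock);
        OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
        spin_unlock(&OCFS2_I(inode)->ip_lock);

        return ocfs2_journal_dirty(handle, fe_bh);
}
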
4173 static void ocfs2_make_right_split_rec(struct super_block *sb,
4174                                        struct ocfs2_extent_rec *split_rec,
4175                                        u32 cpos,
4176                                        struct ocfs2_extent_rec *rec)
4177 {
4178         u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4179         u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4180
4181         memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4182
4183         split_rec->e_cpos = cpu_to_le32(cpos);
4184         split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4185
4186         split_rec->e_blkno = rec->e_blkno;
4187         le64_add_cpu(&split_rec->e_blkno,
4188                      ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4189
4190         split_rec->e_flags = rec->e_flags;
4191 }
4192
4193 static int ocfs2_split_and_insert(struct inode *inode,
4194                                   handle_t *handle,
4195                                   struct ocfs2_path *path,
4196                                   struct buffer_head *di_bh,
4197                                   struct buffer_head **last_eb_bh,
4198                                   int split_index,
4199                                   struct ocfs2_extent_rec *orig_split_rec,
4200                                   struct ocfs2_alloc_context *meta_ac)
4201 {
4202         int ret = 0, depth;
4203         unsigned int insert_range, rec_range, do_leftright = 0;
4204         struct ocfs2_extent_rec tmprec;
4205         struct ocfs2_extent_list *rightmost_el;
4206         struct ocfs2_extent_rec rec;
4207         struct ocfs2_extent_rec split_rec = *orig_split_rec;
4208         struct ocfs2_insert_type insert;
4209         struct ocfs2_extent_block *eb;
4210         struct ocfs2_dinode *di;
4211
4212 leftright:
4213         /*
4214          * Store a copy of the record on the stack - it might move
4215          * around as the tree is manipulated below.
4216          */
4217         rec = path_leaf_el(path)->l_recs[split_index];
4218
4219         di = (struct ocfs2_dinode *)di_bh->b_data;
4220         rightmost_el = &di->id2.i_list;
4221
4222         depth = le16_to_cpu(rightmost_el->l_tree_depth);
4223         if (depth) {
4224                 BUG_ON(!(*last_eb_bh));
4225                 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4226                 rightmost_el = &eb->h_list;
4227         }
4228
4229         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4230             le16_to_cpu(rightmost_el->l_count)) {
4231                 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
4232                                       meta_ac);
4233                 if (ret) {
4234                         mlog_errno(ret);
4235                         goto out;
4236                 }
4237         }
4238
4239         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4240         insert.ins_appending = APPEND_NONE;
4241         insert.ins_contig = CONTIG_NONE;
4242         insert.ins_tree_depth = depth;
4243
4244         insert_range = le32_to_cpu(split_rec.e_cpos) +
4245                 le16_to_cpu(split_rec.e_leaf_clusters);
4246         rec_range = le32_to_cpu(rec.e_cpos) +
4247                 le16_to_cpu(rec.e_leaf_clusters);
4248
4249         if (split_rec.e_cpos == rec.e_cpos) {
4250                 insert.ins_split = SPLIT_LEFT;
4251         } else if (insert_range == rec_range) {
4252                 insert.ins_split = SPLIT_RIGHT;
4253         } else {
4254                 /*
4255                  * Left/right split. We fake this as a right split
4256                  * first and then make a second pass as a left split.
4257                  */
4258                 insert.ins_split = SPLIT_RIGHT;
4259
4260                 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
4261                                            &rec);
4262
4263                 split_rec = tmprec;
4264
4265                 BUG_ON(do_leftright);
4266                 do_leftright = 1;
4267         }
4268
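        /*
         * Worked example (illustrative numbers only): with rec covering
         * (e_cpos 100, 50 clusters) and split_rec covering (120, 10),
         * neither edge lines up.  The first pass builds tmprec = (130, 20)
         * via ocfs2_make_right_split_rec() and inserts it as a right
         * split, shrinking the leaf record to (100, 30).  The second pass
         * restores the original split_rec, re-finds the leaf for cpos 120
         * and inserts (120, 10), which now ends exactly on the shrunken
         * record's right edge.
         */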
4269         ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
4270                                      &insert);
4271         if (ret) {
4272                 mlog_errno(ret);
4273                 goto out;
4274         }
4275
4276         if (do_leftright == 1) {
4277                 u32 cpos;
4278                 struct ocfs2_extent_list *el;
4279
4280                 do_leftright++;
4281                 split_rec = *orig_split_rec;
4282
4283                 ocfs2_reinit_path(path, 1);
4284
4285                 cpos = le32_to_cpu(split_rec.e_cpos);
4286                 ret = ocfs2_find_path(inode, path, cpos);
4287                 if (ret) {
4288                         mlog_errno(ret);
4289                         goto out;
4290                 }
4291
4292                 el = path_leaf_el(path);
4293                 split_index = ocfs2_search_extent_list(el, cpos);
4294                 goto leftright;
4295         }
4296 out:
4297
4298         return ret;
4299 }
4300
4301 /*
4302  * Mark part or all of the extent record at split_index in the leaf
4303  * pointed to by path as written. This removes the unwritten
4304  * extent flag.
4305  *
4306  * Care is taken to handle contiguousness so as to not grow the tree.
4307  *
4308  * meta_ac is not strictly necessary - we only truly need it if growth
4309  * of the tree is required. All other cases will degrade into a less
4310  * optimal tree layout.
4311  *
4312  * last_eb_bh should be the rightmost leaf block for any inode with a
4313  * btree. Since a split may grow the tree or a merge might shrink it,
4314  * the caller cannot trust the contents of that buffer after this call.
4315  * This code is optimized for readability - several passes might be
4316  * made over certain portions of the tree. All of those blocks will
4317  * have been brought into cache (and pinned via the journal), so the
4318  * extra overhead is not expressed in terms of disk reads.
4319  */
4320 static int __ocfs2_mark_extent_written(struct inode *inode,
4321                                        struct buffer_head *di_bh,
4322                                        handle_t *handle,
4323                                        struct ocfs2_path *path,
4324                                        int split_index,
4325                                        struct ocfs2_extent_rec *split_rec,
4326                                        struct ocfs2_alloc_context *meta_ac,
4327                                        struct ocfs2_cached_dealloc_ctxt *dealloc)
4328 {
4329         int ret = 0;
4330         struct ocfs2_extent_list *el = path_leaf_el(path);
4331         struct buffer_head *last_eb_bh = NULL;
4332         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
4333         struct ocfs2_merge_ctxt ctxt;
4334         struct ocfs2_extent_list *rightmost_el;
4335
4336         if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
4337                 ret = -EIO;
4338                 mlog_errno(ret);
4339                 goto out;
4340         }
4341
4342         if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
4343             ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
4344              (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
4345                 ret = -EIO;
4346                 mlog_errno(ret);
4347                 goto out;
4348         }
4349
4350         ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
4351                                                             split_index,
4352                                                             split_rec);
4353
4354         /*
4355          * The core merge / split code wants to know how much room is
4356          * left in this inode's allocation tree, so we pass the
4357          * rightmost extent list.
4358          */
4359         if (path->p_tree_depth) {
4360                 struct ocfs2_extent_block *eb;
4361                 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4362
4363                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4364                                        le64_to_cpu(di->i_last_eb_blk),
4365                                        &last_eb_bh, OCFS2_BH_CACHED, inode);
4366                 if (ret) {
4367                         mlog_errno(ret);
4368                         goto out;
4369                 }
4370
4371                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4372                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4373                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4374                         ret = -EROFS;
4375                         goto out;
4376                 }
4377
4378                 rightmost_el = &eb->h_list;
4379         } else
4380                 rightmost_el = path_root_el(path);
4381
4382         if (rec->e_cpos == split_rec->e_cpos &&
4383             rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4384                 ctxt.c_split_covers_rec = 1;
4385         else
4386                 ctxt.c_split_covers_rec = 0;
4387
4388         ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4389
4390         mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4391              split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4392              ctxt.c_split_covers_rec);
4393
4394         if (ctxt.c_contig_type == CONTIG_NONE) {
4395                 if (ctxt.c_split_covers_rec)
4396                         el->l_recs[split_index] = *split_rec;
4397                 else
4398                         ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
4399                                                      &last_eb_bh, split_index,
4400                                                      split_rec, meta_ac);
4401                 if (ret)
4402                         mlog_errno(ret);
4403         } else {
4404                 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4405                                                 split_index, split_rec,
4406                                                 dealloc, &ctxt);
4407                 if (ret)
4408                         mlog_errno(ret);
4409         }
4410
4411 out:
4412         brelse(last_eb_bh);
4413         return ret;
4414 }
4415
4416 /*
4417  * Mark the already-existing extent at cpos as written for len clusters.
4418  *
4419  * If the existing extent is larger than the request, initiate a
4420  * split. An attempt will be made at merging with adjacent extents.
4421  *
4422  * The caller is responsible for passing down meta_ac if we'll need it.
4423  */
4424 int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
4425                               handle_t *handle, u32 cpos, u32 len, u32 phys,
4426                               struct ocfs2_alloc_context *meta_ac,
4427                               struct ocfs2_cached_dealloc_ctxt *dealloc)
4428 {
4429         int ret, index;
4430         u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4431         struct ocfs2_extent_rec split_rec;
4432         struct ocfs2_path *left_path = NULL;
4433         struct ocfs2_extent_list *el;
4434
4435         mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4436              inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4437
4438         if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4439                 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4440                             "that are being written to, but the feature bit "
4441                             "is not set in the super block.",
4442                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
4443                 ret = -EROFS;
4444                 goto out;
4445         }
4446
4447         /*
4448          * XXX: This should be fixed up so that we just re-insert the
4449          * next extent records.
4450          */
4451         ocfs2_extent_map_trunc(inode, 0);
4452
4453         left_path = ocfs2_new_inode_path(di_bh);
4454         if (!left_path) {
4455                 ret = -ENOMEM;
4456                 mlog_errno(ret);
4457                 goto out;
4458         }
4459
4460         ret = ocfs2_find_path(inode, left_path, cpos);
4461         if (ret) {
4462                 mlog_errno(ret);
4463                 goto out;
4464         }
4465         el = path_leaf_el(left_path);
4466
4467         index = ocfs2_search_extent_list(el, cpos);
4468         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4469                 ocfs2_error(inode->i_sb,
4470                             "Inode %llu has an extent at cpos %u which can no "
4471                             "longer be found.\n",
4472                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4473                 ret = -EROFS;
4474                 goto out;
4475         }
4476
4477         memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4478         split_rec.e_cpos = cpu_to_le32(cpos);
4479         split_rec.e_leaf_clusters = cpu_to_le16(len);
4480         split_rec.e_blkno = cpu_to_le64(start_blkno);
4481         split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4482         split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4483
4484         ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
4485                                           index, &split_rec, meta_ac, dealloc);
4486         if (ret)
4487                 mlog_errno(ret);
4488
4489 out:
4490         ocfs2_free_path(left_path);
4491         return ret;
4492 }
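
/*
 * Illustrative caller sketch (not part of this file): flipping 'len'
 * clusters at 'cpos' once the data has safely reached disk.  The name
 * example_convert_unwritten is made up, the credit count is only a
 * rough guess, and the inode cluster lock plus the meta_ac reservation
 * are assumed to be handled by the caller.
 */
static int example_convert_unwritten(struct inode *inode,
                                     struct buffer_head *di_bh,
                                     u32 cpos, u32 len, u32 phys,
                                     struct ocfs2_alloc_context *meta_ac)
{
        int ret;
        handle_t *handle;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_cached_dealloc_ctxt dealloc;

        ocfs2_init_dealloc_ctxt(&dealloc);

        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS + 1);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        ret = ocfs2_mark_extent_written(inode, di_bh, handle, cpos, len,
                                        phys, meta_ac, &dealloc);
        if (ret)
                mlog_errno(ret);

        ocfs2_commit_trans(osb, handle);

        /* Extent blocks queued by a merge are only freed now, with the
         * transaction finished. */
        ocfs2_run_deallocs(osb, &dealloc);

        return ret;
}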
4493
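/*
 * Split the extent record at 'index' so that a new record begins at
 * cluster 'new_range': the original record is truncated there and its
 * right-hand portion is re-inserted as a separate record, growing the
 * tree first if the rightmost leaf is full.
 */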
4494 static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4495                             handle_t *handle, struct ocfs2_path *path,
4496                             int index, u32 new_range,
4497                             struct ocfs2_alloc_context *meta_ac)
4498 {
4499         int ret, depth, credits = handle->h_buffer_credits;
4500         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4501         struct buffer_head *last_eb_bh = NULL;
4502         struct ocfs2_extent_block *eb;
4503         struct ocfs2_extent_list *rightmost_el, *el;
4504         struct ocfs2_extent_rec split_rec;
4505         struct ocfs2_extent_rec *rec;
4506         struct ocfs2_insert_type insert;
4507
4508         /*
4509          * Setup the record to split before we grow the tree.
4510          */
4511         el = path_leaf_el(path);
4512         rec = &el->l_recs[index];
4513         ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
4514
4515         depth = path->p_tree_depth;
4516         if (depth > 0) {
4517                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4518                                        le64_to_cpu(di->i_last_eb_blk),
4519                                        &last_eb_bh, OCFS2_BH_CACHED, inode);
4520                 if (ret < 0) {
4521                         mlog_errno(ret);
4522                         goto out;
4523                 }
4524
4525                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4526                 rightmost_el = &eb->h_list;
4527         } else
4528                 rightmost_el = path_leaf_el(path);
4529
4530         credits += path->p_tree_depth +
4531                    ocfs2_extend_meta_needed(&di->id2.i_list);
4532         ret = ocfs2_extend_trans(handle, credits);
4533         if (ret) {
4534                 mlog_errno(ret);
4535                 goto out;
4536         }
4537
4538         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4539             le16_to_cpu(rightmost_el->l_count)) {
4540                 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
4541                                       meta_ac);
4542                 if (ret) {
4543                         mlog_errno(ret);
4544                         goto out;
4545                 }
4546         }
4547
4548         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4549         insert.ins_appending = APPEND_NONE;
4550         insert.ins_contig = CONTIG_NONE;
4551         insert.ins_split = SPLIT_RIGHT;
4552         insert.ins_tree_depth = depth;
4553
4554         ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
4555         if (ret)
4556                 mlog_errno(ret);
4557
4558 out:
4559         brelse(last_eb_bh);
4560         return ret;
4561 }
4562
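/*
 * Remove 'len' clusters starting at 'cpos' from the record at 'index'
 * in the leaf pointed to by 'path'.  The range must share at least one
 * edge with the record - middle-of-record removals are split up by the
 * caller first (see ocfs2_remove_extent()).
 */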
4563 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4564                               struct ocfs2_path *path, int index,
4565                               struct ocfs2_cached_dealloc_ctxt *dealloc,
4566                               u32 cpos, u32 len)
4567 {
4568         int ret;
4569         u32 left_cpos, rec_range, trunc_range;
4570         int wants_rotate = 0, is_rightmost_tree_rec = 0;
4571         struct super_block *sb = inode->i_sb;
4572         struct ocfs2_path *left_path = NULL;
4573         struct ocfs2_extent_list *el = path_leaf_el(path);
4574         struct ocfs2_extent_rec *rec;
4575         struct ocfs2_extent_block *eb;
4576
4577         if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
4578                 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
4579                 if (ret) {
4580                         mlog_errno(ret);
4581                         goto out;
4582                 }
4583
4584                 index--;
4585         }
4586
4587         if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
4588             path->p_tree_depth) {
4589                 /*
4590                  * Check whether this is the rightmost tree record. If
4591                  * we remove all of this record or part of its right
4592                  * edge then an update of the record lengths above it
4593                  * will be required.
4594                  */
4595                 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
4596                 if (eb->h_next_leaf_blk == 0)
4597                         is_rightmost_tree_rec = 1;
4598         }
4599
4600         rec = &el->l_recs[index];
4601         if (index == 0 && path->p_tree_depth &&
4602             le32_to_cpu(rec->e_cpos) == cpos) {
4603                 /*
4604                  * Changing the leftmost offset (via partial or whole
4605                  * record truncate) of an interior (or rightmost) path
4606                  * means we have to update the subtree that is formed
4607                  * by this leaf and the one to its left.
4608                  *
4609                  * There are two cases we can skip:
4610                  *   1) Path is the leftmost one in our inode tree.
4611                  *   2) The leaf is rightmost and will be empty after
4612                  *      we remove the extent record - the rotate code
4613                  *      knows how to update the newly formed edge.
4614                  */
4615
4616                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
4617                                                     &left_cpos);
4618                 if (ret) {
4619                         mlog_errno(ret);
4620                         goto out;
4621                 }
4622
4623                 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
4624                         left_path = ocfs2_new_path(path_root_bh(path),
4625                                                    path_root_el(path));
4626                         if (!left_path) {
4627                                 ret = -ENOMEM;
4628                                 mlog_errno(ret);
4629                                 goto out;
4630                         }
4631
4632                         ret = ocfs2_find_path(inode, left_path, left_cpos);
4633                         if (ret) {
4634                                 mlog_errno(ret);
4635                                 goto out;
4636                         }
4637                 }
4638         }
4639
4640         ret = ocfs2_extend_rotate_transaction(handle, 0,
4641                                               handle->h_buffer_credits,
4642                                               path);
4643         if (ret) {
4644                 mlog_errno(ret);
4645                 goto out;
4646         }
4647
4648         ret = ocfs2_journal_access_path(inode, handle, path);
4649         if (ret) {
4650                 mlog_errno(ret);
4651                 goto out;
4652         }
4653
4654         ret = ocfs2_journal_access_path(inode, handle, left_path);
4655         if (ret) {
4656                 mlog_errno(ret);
4657                 goto out;
4658         }
4659
4660         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
4661         trunc_range = cpos + len;
4662
4663         if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
4664                 int next_free;
4665
4666                 memset(rec, 0, sizeof(*rec));
4667                 ocfs2_cleanup_merge(el, index);
4668                 wants_rotate = 1;
4669
4670                 next_free = le16_to_cpu(el->l_next_free_rec);
4671                 if (is_rightmost_tree_rec && next_free > 1) {
4672                         /*
4673                          * We skip the edge update if this path will
4674                          * be deleted by the rotate code.
4675                          */
4676                         rec = &el->l_recs[next_free - 1];
4677                         ocfs2_adjust_rightmost_records(inode, handle, path,
4678                                                        rec);
4679                 }
4680         } else if (le32_to_cpu(rec->e_cpos) == cpos) {
4681                 /* Remove leftmost portion of the record. */
4682                 le32_add_cpu(&rec->e_cpos, len);
4683                 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
4684                 le16_add_cpu(&rec->e_leaf_clusters, -len);
4685         } else if (rec_range == trunc_range) {
4686                 /* Remove rightmost portion of the record */
4687                 le16_add_cpu(&rec->e_leaf_clusters, -len);
4688                 if (is_rightmost_tree_rec)
4689                         ocfs2_adjust_rightmost_records(inode, handle, path, rec);
4690         } else {
4691                 /* Caller should have trapped this. */
4692                 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
4693                      "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
4694                      le32_to_cpu(rec->e_cpos),
4695                      le16_to_cpu(rec->e_leaf_clusters), cpos, len);
4696                 BUG();
4697         }
4698
4699         if (left_path) {
4700                 int subtree_index;
4701
4702                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
4703                 ocfs2_complete_edge_insert(inode, handle, left_path, path,
4704                                            subtree_index);
4705         }
4706
4707         ocfs2_journal_dirty(handle, path_leaf_bh(path));
4708
4709         ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
4710         if (ret) {
4711                 mlog_errno(ret);
4712                 goto out;
4713         }
4714
4715 out:
4716         ocfs2_free_path(left_path);
4717         return ret;
4718 }
4719
4720 int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
4721                         u32 cpos, u32 len, handle_t *handle,
4722                         struct ocfs2_alloc_context *meta_ac,
4723                         struct ocfs2_cached_dealloc_ctxt *dealloc)
4724 {
4725         int ret, index;
4726         u32 rec_range, trunc_range;
4727         struct ocfs2_extent_rec *rec;
4728         struct ocfs2_extent_list *el;
4729         struct ocfs2_path *path;
4730
4731         ocfs2_extent_map_trunc(inode, 0);
4732
4733         path = ocfs2_new_inode_path(di_bh);
4734         if (!path) {
4735                 ret = -ENOMEM;
4736                 mlog_errno(ret);
4737                 goto out;
4738         }
4739
4740         ret = ocfs2_find_path(inode, path, cpos);
4741         if (ret) {
4742                 mlog_errno(ret);
4743                 goto out;
4744         }
4745
4746         el = path_leaf_el(path);
4747         index = ocfs2_search_extent_list(el, cpos);
4748         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4749                 ocfs2_error(inode->i_sb,
4750                             "Inode %llu has an extent at cpos %u which can no "
4751                             "longer be found.\n",
4752                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4753                 ret = -EROFS;
4754                 goto out;
4755         }
4756
4757         /*
4758          * We have 3 cases of extent removal:
4759          *   1) Range covers the entire extent rec
4760          *   2) Range begins or ends on one edge of the extent rec
4761          *   3) Range is in the middle of the extent rec (no shared edges)
4762          *
4763          * For case 1 we remove the extent rec and left rotate to
4764          * fill the hole.
4765          *
4766          * For case 2 we just shrink the existing extent rec, with a
4767          * tree update if the shrinking edge is also the edge of an
4768          * extent block.
4769          *
4770          * For case 3 we do a right split to turn the extent rec into
4771          * something case 2 can handle.
4772          */
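        /*
         * Illustrative numbers: against an existing record covering
         * clusters [100, 150), removing (cpos 100, len 50) is case 1,
         * removing (100, 20) or (130, 20) is case 2, and removing
         * (120, 10) is case 3 - ocfs2_split_tree() first splits the
         * record into [100, 130) and [130, 150) so that the following
         * ocfs2_truncate_rec() call sees a shared right edge.
         */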
4773         rec = &el->l_recs[index];
4774         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
4775         trunc_range = cpos + len;
4776
4777         BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
4778
4779         mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
4780              "(cpos %u, len %u)\n",
4781              (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
4782              le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
4783
4784         if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
4785                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4786                                          cpos, len);
4787                 if (ret) {
4788                         mlog_errno(ret);
4789                         goto out;
4790                 }
4791         } else {
4792                 ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
4793                                        trunc_range, meta_ac);
4794                 if (ret) {
4795                         mlog_errno(ret);
4796                         goto out;
4797                 }
4798
4799                 /*
4800                  * The split could have manipulated the tree enough to
4801                  * move the record location, so we have to look for it again.
4802                  */
4803                 ocfs2_reinit_path(path, 1);
4804
4805                 ret = ocfs2_find_path(inode, path, cpos);
4806                 if (ret) {
4807                         mlog_errno(ret);
4808                         goto out;
4809                 }
4810
4811                 el = path_leaf_el(path);
4812                 index = ocfs2_search_extent_list(el, cpos);
4813                 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4814                         ocfs2_error(inode->i_sb,
4815                                     "Inode %llu: split at cpos %u lost record.",
4816                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
4817                                     cpos);
4818                         ret = -EROFS;
4819                         goto out;
4820                 }
4821
4822                 /*
4823                  * Double check our values here. If anything is fishy,
4824                  * it's easier to catch it at the top level.
4825                  */
4826                 rec = &el->l_recs[index];
4827                 rec_range = le32_to_cpu(rec->e_cpos) +
4828                         ocfs2_rec_clusters(el, rec);
4829                 if (rec_range != trunc_range) {
4830                         ocfs2_error(inode->i_sb,
4831                                     "Inode %llu: error after split at cpos %u, "
4832                                     "trunc len %u, existing record is (%u,%u)",
4833                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
4834                                     cpos, len, le32_to_cpu(rec->e_cpos),
4835                                     ocfs2_rec_clusters(el, rec));
4836                         ret = -EROFS;
4837                         goto out;
4838                 }
4839
4840                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4841                                          cpos, len);
4842                 if (ret) {
4843                         mlog_errno(ret);
4844                         goto out;
4845                 }
4846         }
4847
4848 out:
4849         ocfs2_free_path(path);
4850         return ret;
4851 }
4852
4853 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
4854 {
4855         struct buffer_head *tl_bh = osb->osb_tl_bh;
4856         struct ocfs2_dinode *di;
4857         struct ocfs2_truncate_log *tl;
4858
4859         di = (struct ocfs2_dinode *) tl_bh->b_data;
4860         tl = &di->id2.i_dealloc;
4861
4862         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
4863                         "slot %d, invalid truncate log parameters: used = "
4864                         "%u, count = %u\n", osb->slot_num,
4865                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
4866         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
4867 }
4868
4869 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
4870                                            unsigned int new_start)
4871 {
4872         unsigned int tail_index;
4873         unsigned int current_tail;
4874
4875         /* No records, nothing to coalesce */
4876         if (!le16_to_cpu(tl->tl_used))
4877                 return 0;
4878
4879         tail_index = le16_to_cpu(tl->tl_used) - 1;
4880         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
4881         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
4882
4883         return current_tail == new_start;
4884 }
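
/*
 * Example (illustrative numbers): if the newest log record is
 * (t_start 200, t_clusters 8), a subsequent free of 8 clusters
 * starting at cluster 208 coalesces into (200, 16) instead of
 * consuming a new slot.
 */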
4885
4886 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
4887                               handle_t *handle,
4888                               u64 start_blk,
4889                               unsigned int num_clusters)
4890 {
4891         int status, index;
4892         unsigned int start_cluster, tl_count;
4893         struct inode *tl_inode = osb->osb_tl_inode;
4894         struct buffer_head *tl_bh = osb->osb_tl_bh;
4895         struct ocfs2_dinode *di;
4896         struct ocfs2_truncate_log *tl;
4897
4898         mlog_entry("start_blk = %llu, num_clusters = %u\n",
4899                    (unsigned long long)start_blk, num_clusters);
4900
4901         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
4902
4903         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
4904
4905         di = (struct ocfs2_dinode *) tl_bh->b_data;
4906         tl = &di->id2.i_dealloc;
4907         if (!OCFS2_IS_VALID_DINODE(di)) {
4908                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
4909                 status = -EIO;
4910                 goto bail;
4911         }
4912
4913         tl_count = le16_to_cpu(tl->tl_count);
4914         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
4915                         tl_count == 0,
4916                         "Truncate record count on #%llu invalid: "
4917                         "wanted %u, actual %u\n",
4918                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
4919                         ocfs2_truncate_recs_per_inode(osb->sb),
4920                         le16_to_cpu(tl->tl_count));
4921
4922         /* Caller should have known to flush before calling us. */
4923         index = le16_to_cpu(tl->tl_used);
4924         if (index >= tl_count) {
4925                 status = -ENOSPC;
4926                 mlog_errno(status);
4927                 goto bail;
4928         }
4929
4930         status = ocfs2_journal_access(handle, tl_inode, tl_bh,
4931                                       OCFS2_JOURNAL_ACCESS_WRITE);
4932         if (status < 0) {
4933                 mlog_errno(status);
4934                 goto bail;
4935         }
4936
4937         mlog(0, "Log truncate of %u clusters starting at cluster %u to "
4938              "%llu (index = %d)\n", num_clusters, start_cluster,
4939              (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
4940
4941         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
4942                 /*
4943                  * Move index back to the record we are coalescing with.
4944                  * ocfs2_truncate_log_can_coalesce() guarantees a nonzero tl_used.
4945                  */
4946                 index--;
4947
4948                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
4949                 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
4950                      index, le32_to_cpu(tl->tl_recs[index].t_start),
4951                      num_clusters);
4952         } else {
4953                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
4954                 tl->tl_used = cpu_to_le16(index + 1);
4955         }
4956         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
4957
4958         status = ocfs2_journal_dirty(handle, tl_bh);
4959         if (status < 0) {
4960                 mlog_errno(status);
4961                 goto bail;
4962         }
4963
4964 bail:
4965         mlog_exit(status);
4966         return status;
4967 }
4968
4969 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
4970                                          handle_t *handle,
4971                                          struct inode *data_alloc_inode,
4972                                          struct buffer_head *data_alloc_bh)
4973 {
4974         int status = 0;
4975         int i;
4976         unsigned int num_clusters;
4977         u64 start_blk;
4978         struct ocfs2_truncate_rec rec;
4979         struct ocfs2_dinode *di;
4980         struct ocfs2_truncate_log *tl;
4981         struct inode *tl_inode = osb->osb_tl_inode;
4982         struct buffer_head *tl_bh = osb->osb_tl_bh;
4983
4984         mlog_entry_void();
4985
4986         di = (struct ocfs2_dinode *) tl_bh->b_data;
4987         tl = &di->id2.i_dealloc;
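        /*
         * Replay from the tail down so that tl_used can simply be
         * trimmed to 'i' as each record is handled.
         */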
4988         i = le16_to_cpu(tl->tl_used) - 1;
4989         while (i >= 0) {
4990                 /* Caller has given us at least enough credits to
4991                  * update the truncate log dinode */
4992                 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
4993                                               OCFS2_JOURNAL_ACCESS_WRITE);
4994                 if (status < 0) {
4995                         mlog_errno(status);
4996                         goto bail;
4997                 }
4998
4999                 tl->tl_used = cpu_to_le16(i);
5000
5001                 status = ocfs2_journal_dirty(handle, tl_bh);
5002                 if (status < 0) {
5003                         mlog_errno(status);
5004                         goto bail;
5005                 }
5006
5007                 /* TODO: Perhaps we can calculate the bulk of the
5008                  * credits up front rather than extending like
5009                  * this. */
5010                 status = ocfs2_extend_trans(handle,
5011                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5012                 if (status < 0) {
5013                         mlog_errno(status);
5014                         goto bail;
5015                 }
5016
5017                 rec = tl->tl_recs[i];
5018                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5019                                                     le32_to_cpu(rec.t_start));
5020                 num_clusters = le32_to_cpu(rec.t_clusters);
5021
5022                 /* if start_blk is not set, we ignore the record as
5023                  * invalid. */
5024                 if (start_blk) {
5025                         mlog(0, "free record %d, start = %u, clusters = %u\n",
5026                              i, le32_to_cpu(rec.t_start), num_clusters);
5027
5028                         status = ocfs2_free_clusters(handle, data_alloc_inode,
5029                                                      data_alloc_bh, start_blk,
5030                                                      num_clusters);
5031                         if (status < 0) {
5032                                 mlog_errno(status);
5033                                 goto bail;
5034                         }
5035                 }
5036                 i--;
5037         }
5038
5039 bail:
5040         mlog_exit(status);
5041         return status;
5042 }
5043
5044 /* Expects you to already be holding tl_inode->i_mutex */
5045 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5046 {
5047         int status;
5048         unsigned int num_to_flush;
5049         handle_t *handle;
5050         struct inode *tl_inode = osb->osb_tl_inode;
5051         struct inode *data_alloc_inode = NULL;
5052         struct buffer_head *tl_bh = osb->osb_tl_bh;
5053         struct buffer_head *data_alloc_bh = NULL;
5054         struct ocfs2_dinode *di;
5055         struct ocfs2_truncate_log *tl;
5056
5057         mlog_entry_void();
5058
5059         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5060
5061         di = (struct ocfs2_dinode *) tl_bh->b_data;
5062         tl = &di->id2.i_dealloc;
5063         if (!OCFS2_IS_VALID_DINODE(di)) {
5064                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5065                 status = -EIO;
5066                 goto out;
5067         }
5068
5069         num_to_flush = le16_to_cpu(tl->tl_used);
5070         mlog(0, "Flush %u records from truncate log #%llu\n",
5071              num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
5072         if (!num_to_flush) {
5073                 status = 0;
5074                 goto out;
5075         }
5076
5077         data_alloc_inode = ocfs2_get_system_file_inode(osb,
5078                                                        GLOBAL_BITMAP_SYSTEM_INODE,
5079                                                        OCFS2_INVALID_SLOT);
5080         if (!data_alloc_inode) {
5081                 status = -EINVAL;
5082                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
5083                 goto out;
5084         }
5085
5086         mutex_lock(&data_alloc_inode->i_mutex);
5087
5088         status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
5089         if (status < 0) {
5090                 mlog_errno(status);
5091                 goto out_mutex;
5092         }
5093
5094         handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5095         if (IS_ERR(handle)) {
5096                 status = PTR_ERR(handle);
5097                 mlog_errno(status);
5098                 goto out_unlock;
5099         }
5100
5101         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
5102                                                data_alloc_bh);
5103         if (status < 0)
5104                 mlog_errno(status);
5105
5106         ocfs2_commit_trans(osb, handle);
5107
5108 out_unlock:
5109         brelse(data_alloc_bh);
5110         ocfs2_inode_unlock(data_alloc_inode, 1);
5111
5112 out_mutex:
5113         mutex_unlock(&data_alloc_inode->i_mutex);
5114         iput(data_alloc_inode);
5115
5116 out:
5117         mlog_exit(status);
5118         return status;
5119 }
5120
5121 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5122 {
5123         int status;
5124         struct inode *tl_inode = osb->osb_tl_inode;
5125
5126         mutex_lock(&tl_inode->i_mutex);
5127         status = __ocfs2_flush_truncate_log(osb);
5128         mutex_unlock(&tl_inode->i_mutex);
5129
5130         return status;
5131 }
5132
5133 static void ocfs2_truncate_log_worker(struct work_struct *work)
5134 {
5135         int status;
5136         struct ocfs2_super *osb =
5137                 container_of(work, struct ocfs2_super,
5138                              osb_truncate_log_wq.work);
5139
5140         mlog_entry_void();
5141
5142         status = ocfs2_flush_truncate_log(osb);
5143         if (status < 0)
5144                 mlog_errno(status);
5145         else
5146                 ocfs2_init_inode_steal_slot(osb);
5147
5148         mlog_exit(status);
5149 }
5150
5151 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
5152 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5153                                        int cancel)
5154 {
5155         if (osb->osb_tl_inode) {
5156                 /* We want to push off log flushes while truncates are
5157                  * still running. */
5158                 if (cancel)
5159                         cancel_delayed_work(&osb->osb_truncate_log_wq);
5160
5161                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5162                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5163         }
5164 }
5165
5166 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5167                                        int slot_num,
5168                                        struct inode **tl_inode,
5169                                        struct buffer_head **tl_bh)
5170 {
5171         int status;
5172         struct inode *inode = NULL;
5173         struct buffer_head *bh = NULL;
5174
5175         inode = ocfs2_get_system_file_inode(osb,
5176                                            TRUNCATE_LOG_SYSTEM_INODE,
5177                                            slot_num);
5178         if (!inode) {
5179                 status = -EINVAL;
5180                 mlog(ML_ERROR, "Could not load truncate log inode!\n");
5181                 goto bail;
5182         }
5183
5184         status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
5185                                   OCFS2_BH_CACHED, inode);
5186         if (status < 0) {
5187                 iput(inode);
5188                 mlog_errno(status);
5189                 goto bail;
5190         }
5191
5192         *tl_inode = inode;
5193         *tl_bh    = bh;
5194 bail:
5195         mlog_exit(status);
5196         return status;
5197 }
5198
5199 /* called during the 1st stage of node recovery. we stamp a clean
5200  * truncate log and pass back a copy for processing later. if the
5201  * truncate log does not require processing, *tl_copy is set to
5202  * NULL. */
5203 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5204                                       int slot_num,
5205                                       struct ocfs2_dinode **tl_copy)
5206 {
5207         int status;
5208         struct inode *tl_inode = NULL;
5209         struct buffer_head *tl_bh = NULL;
5210         struct ocfs2_dinode *di;
5211         struct ocfs2_truncate_log *tl;
5212
5213         *tl_copy = NULL;
5214
5215         mlog(0, "recover truncate log from slot %d\n", slot_num);
5216
5217         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
5218         if (status < 0) {
5219                 mlog_errno(status);
5220                 goto bail;
5221         }
5222
5223         di = (struct ocfs2_dinode *) tl_bh->b_data;
5224         tl = &di->id2.i_dealloc;
5225         if (!OCFS2_IS_VALID_DINODE(di)) {
5226                 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
5227                 status = -EIO;
5228                 goto bail;
5229         }
5230
5231         if (le16_to_cpu(tl->tl_used)) {
5232                 mlog(0, "We'll have %u logs to recover\n",
5233                      le16_to_cpu(tl->tl_used));
5234
5235                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
5236                 if (!(*tl_copy)) {
5237                         status = -ENOMEM;
5238                         mlog_errno(status);
5239                         goto bail;
5240                 }
5241
5242                 /* Assuming the write-out below goes well, this copy
5243                  * will be passed back to recovery for processing. */
5244                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
5245
5246                 /* All we need to do to clear the truncate log is set
5247                  * tl_used. */
5248                 tl->tl_used = 0;
5249
5250                 status = ocfs2_write_block(osb, tl_bh, tl_inode);
5251                 if (status < 0) {
5252                         mlog_errno(status);
5253                         goto bail;
5254                 }
5255         }
5256
5257 bail:
5258         if (tl_inode)
5259                 iput(tl_inode);
5260         if (tl_bh)
5261                 brelse(tl_bh);
5262
5263         if (status < 0 && (*tl_copy)) {
5264                 kfree(*tl_copy);
5265                 *tl_copy = NULL;
5266         }
5267
5268         mlog_exit(status);
5269         return status;
5270 }
5271
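/*
 * Replay a truncate log copied out by
 * ocfs2_begin_truncate_log_recovery(): each record is appended to this
 * node's own truncate log (flushing it first when full) so the
 * clusters are freed through the normal path.
 */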
5272 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
5273                                          struct ocfs2_dinode *tl_copy)
5274 {
5275         int status = 0;
5276         int i;
5277         unsigned int clusters, num_recs, start_cluster;
5278         u64 start_blk;
5279         handle_t *handle;
5280         struct inode *tl_inode = osb->osb_tl_inode;
5281         struct ocfs2_truncate_log *tl;
5282
5283         mlog_entry_void();
5284
5285         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
5286                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
5287                 return -EINVAL;
5288         }
5289
5290         tl = &tl_copy->id2.i_dealloc;
5291         num_recs = le16_to_cpu(tl->tl_used);
5292         mlog(0, "cleanup %u records from %llu\n", num_recs,
5293              (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
5294
5295         mutex_lock(&tl_inode->i_mutex);
5296         for(i = 0; i < num_recs; i++) {
5297                 if (ocfs2_truncate_log_needs_flush(osb)) {
5298                         status = __ocfs2_flush_truncate_log(osb);
5299                         if (status < 0) {
5300                                 mlog_errno(status);
5301                                 goto bail_up;
5302                         }
5303                 }
5304
5305                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5306                 if (IS_ERR(handle)) {
5307                         status = PTR_ERR(handle);
5308                         mlog_errno(status);
5309                         goto bail_up;
5310                 }
5311
5312                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
5313                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
5314                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
5315
5316                 status = ocfs2_truncate_log_append(osb, handle,
5317                                                    start_blk, clusters);
5318                 ocfs2_commit_trans(osb, handle);
5319                 if (status < 0) {
5320                         mlog_errno(status);
5321                         goto bail_up;
5322                 }
5323         }
5324
5325 bail_up:
5326         mutex_unlock(&tl_inode->i_mutex);
5327
5328         mlog_exit(status);
5329         return status;
5330 }
5331
5332 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
5333 {
5334         int status;
5335         struct inode *tl_inode = osb->osb_tl_inode;
5336
5337         mlog_entry_void();
5338
5339         if (tl_inode) {
5340                 cancel_delayed_work(&osb->osb_truncate_log_wq);
5341                 flush_workqueue(ocfs2_wq);
5342
5343                 status = ocfs2_flush_truncate_log(osb);
5344                 if (status < 0)
5345                         mlog_errno(status);
5346
5347                 brelse(osb->osb_tl_bh);
5348                 iput(osb->osb_tl_inode);
5349         }
5350
5351         mlog_exit_void();
5352 }
5353
5354 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
5355 {
5356         int status;
5357         struct inode *tl_inode = NULL;
5358         struct buffer_head *tl_bh = NULL;
5359
5360         mlog_entry_void();
5361
5362         status = ocfs2_get_truncate_log_info(osb,
5363                                              osb->slot_num,
5364                                              &tl_inode,
5365                                              &tl_bh);
5366         if (status < 0)
5367                 mlog_errno(status);
5368
5369         /* ocfs2_truncate_log_shutdown keys on the existence of
5370          * osb->osb_tl_inode so we don't set any of the osb variables
5371          * until we're sure all is well. */
5372         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
5373                           ocfs2_truncate_log_worker);
5374         osb->osb_tl_bh    = tl_bh;
5375         osb->osb_tl_inode = tl_inode;
5376
5377         mlog_exit(status);
5378         return status;
5379 }
5380
5381 /*
5382  * Delayed de-allocation of suballocator blocks.
5383  *
5384  * Some sets of block de-allocations might involve multiple suballocator inodes.
5385  *
5386  * The locking for this can get extremely complicated, especially when
5387  * the suballocator inodes to delete from aren't known until deep
5388  * within an unrelated codepath.
5389  *
5390  * ocfs2_extent_block structures are a good example of this - an inode
5391  * btree could have been grown by any number of nodes each allocating
5392  * out of their own suballoc inode.
5393  *
5394  * These structures allow the delay of block de-allocation until a
5395  * later time, when locking of multiple cluster inodes won't cause
5396  * deadlock.
5397  */
5398
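/*
 * Typical flow (illustrative, not lifted from any one caller):
 *
 *      ocfs2_init_dealloc_ctxt(&dealloc);
 *      ... tree operations queue extent blocks on &dealloc ...
 *      ocfs2_commit_trans(osb, handle);
 *      ocfs2_inode_unlock(inode, 1);
 *      ocfs2_run_deallocs(osb, &dealloc);
 *
 * i.e. the suballocator bits are only cleared once the caller's own
 * transaction and cluster locks have been dropped, which is what
 * avoids the lock ordering problem described above.
 */
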
5399 /*
5400  * Describes a single block free from a suballocator
5401  */
5402 struct ocfs2_cached_block_free {
5403         struct ocfs2_cached_block_free          *free_next;
5404         u64                                     free_blk;
5405         unsigned int                            free_bit;
5406 };
5407
5408 struct ocfs2_per_slot_free_list {
5409         struct ocfs2_per_slot_free_list         *f_next_suballocator;
5410         int                                     f_inode_type;
5411         int                                     f_slot;
5412         struct ocfs2_cached_block_free          *f_first;
5413 };
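
/*
 * The dealloc context is a simple two-level singly linked list: one
 * ocfs2_per_slot_free_list per (suballocator type, slot) pair, each
 * carrying the chain of individual block frees queued against it.
 */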
5414
5415 static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5416                                    int sysfile_type,
5417                                    int slot,
5418                                    struct ocfs2_cached_block_free *head)
5419 {
5420         int ret;
5421         u64 bg_blkno;
5422         handle_t *handle;
5423         struct inode *inode;
5424         struct buffer_head *di_bh = NULL;
5425         struct ocfs2_cached_block_free *tmp;
5426
5427         inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
5428         if (!inode) {
5429                 ret = -EINVAL;
5430                 mlog_errno(ret);
5431                 goto out;
5432         }
5433
5434         mutex_lock(&inode->i_mutex);
5435
5436         ret = ocfs2_inode_lock(inode, &di_bh, 1);
5437         if (ret) {
5438                 mlog_errno(ret);
5439                 goto out_mutex;
5440         }
5441
5442         handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
5443         if (IS_ERR(handle)) {
5444                 ret = PTR_ERR(handle);
5445                 mlog_errno(ret);
5446                 goto out_unlock;
5447         }
5448
5449         while (head) {
5450                 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
5451                                                       head->free_bit);
5452                 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
5453                      head->free_bit, (unsigned long long)head->free_blk);
5454
5455                 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
5456                                                head->free_bit, bg_blkno, 1);
5457                 if (ret) {
5458                         mlog_errno(ret);
5459                         goto out_journal;
5460                 }
5461
5462                 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
5463                 if (ret) {
5464                         mlog_errno(ret);
5465                         goto out_journal;
5466                 }
5467
5468                 tmp = head;
5469                 head = head->free_next;
5470                 kfree(tmp);
5471         }
5472
5473 out_journal:
5474         ocfs2_commit_trans(osb, handle);
5475
5476 out_unlock:
5477         ocfs2_inode_unlock(inode, 1);
5478         brelse(di_bh);
5479 out_mutex:
5480         mutex_unlock(&inode->i_mutex);
5481         iput(inode);
5482 out:
5483         while(head) {
5484                 /* Premature exit may have left some dangling items. */
5485                 tmp = head;
5486                 head = head->free_next;
5487                 kfree(tmp);
5488         }
5489
5490         return ret;
5491 }
5492
5493 int ocfs2_run_deallocs(struct ocfs2_super *osb,
5494                        struct ocfs2_cached_dealloc_ctxt *ctxt)
5495 {
5496         int ret = 0, ret2;
5497         struct ocfs2_per_slot_free_list *fl;
5498
5499         if (!ctxt)
5500                 return 0;
5501
5502         while (ctxt->c_first_suballocator) {
5503                 fl = ctxt->c_first_suballocator;
5504
5505                 if (fl->f_first) {
5506                         mlog(0, "Free items: (type %u, slot %d)\n",
5507                              fl->f_inode_type, fl->f_slot);
5508                         ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
5509                                                        fl->f_slot, fl->f_first);
5510                         if (ret2)
5511                                 mlog_errno(ret2);
5512                         if (!ret)
5513                                 ret = ret2;
5514                 }
5515
5516                 ctxt->c_first_suballocator = fl->f_next_suballocator;
5517                 kfree(fl);
5518         }
5519
5520         return ret;
5521 }
5522
5523 static struct ocfs2_per_slot_free_list *
5524 ocfs2_find_per_slot_free_list(int type,
5525                               int slot,
5526                               struct ocfs2_cached_dealloc_ctxt *ctxt)
5527 {
5528         struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
5529
5530         while (fl) {
5531                 if (fl->f_inode_type == type && fl->f_slot == slot)
5532                         return fl;
5533
5534                 fl = fl->f_next_suballocator;
5535         }
5536
5537         fl = kmalloc(sizeof(*fl), GFP_NOFS);
5538         if (fl) {
5539                 fl->f_inode_type = type;
5540                 fl->f_slot = slot;
5541                 fl->f_first = NULL;
5542                 fl->f_next_suballocator = ctxt->c_first_suballocator;
5543
5544                 ctxt->c_first_suballocator = fl;
5545         }
5546         return fl;
5547 }
5548
5549 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
5550                                      int type, int slot, u64 blkno,
5551                                      unsigned int bit)
5552 {
5553         int ret;
5554         struct ocfs2_per_slot_free_list *fl;
5555         struct ocfs2_cached_block_free *item;
5556
5557         fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
5558         if (fl == NULL) {
5559                 ret = -ENOMEM;
5560                 mlog_errno(ret);
5561                 goto out;
5562         }
5563
5564         item = kmalloc(sizeof(*item), GFP_NOFS);
5565         if (item == NULL) {
5566                 ret = -ENOMEM;
5567                 mlog_errno(ret);
5568                 goto out;
5569         }
5570
5571         mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
5572              type, slot, bit, (unsigned long long)blkno);
5573
5574         item->free_blk = blkno;
5575         item->free_bit = bit;
5576         item->free_next = fl->f_first;
5577
5578         fl->f_first = item;
5579
5580         ret = 0;
5581 out:
5582         return ret;
5583 }
5584
5585 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
5586                                          struct ocfs2_extent_block *eb)
5587 {
5588         return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
5589                                          le16_to_cpu(eb->h_suballoc_slot),
5590                                          le64_to_cpu(eb->h_blkno),
5591                                          le16_to_cpu(eb->h_suballoc_bit));
5592 }
5593
5594 /* This function will figure out whether the currently last extent
5595  * block will be deleted, and if it will, what the new last extent
5596  * block will be so we can update its h_next_leaf_blk field, as well
5597  * as the dinode's i_last_eb_blk. */
5598 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
5599                                        unsigned int clusters_to_del,
5600                                        struct ocfs2_path *path,
5601                                        struct buffer_head **new_last_eb)
5602 {
5603         int next_free, ret = 0;
5604         u32 cpos;
5605         struct ocfs2_extent_rec *rec;
5606         struct ocfs2_extent_block *eb;
5607         struct ocfs2_extent_list *el;
5608         struct buffer_head *bh = NULL;
5609
5610         *new_last_eb = NULL;
5611
5612         /* we have no tree, so of course, no last_eb. */
5613         if (!path->p_tree_depth)
5614                 goto out;
5615
5616         /* trunc to zero special case - this makes tree_depth = 0
5617          * regardless of what it is.  */
5618         if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
5619                 goto out;
5620
5621         el = path_leaf_el(path);
5622         BUG_ON(!el->l_next_free_rec);
5623
5624         /*
5625          * Make sure that this extent list will actually be empty
5626          * after we clear away the data. We can shortcut out if
5627          * there's more than one non-empty extent in the
5628          * list. Otherwise, a check of the remaining extent is
5629          * necessary.
5630          */
5631         next_free = le16_to_cpu(el->l_next_free_rec);
5632         rec = NULL;
5633         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
5634                 if (next_free > 2)
5635                         goto out;
5636
5637                 /* We may have a valid extent in index 1, check it. */
5638                 if (next_free == 2)
5639                         rec = &el->l_recs[1];
5640
5641                 /*
5642                  * Fall through - no more nonempty extents, so we want
5643                  * to delete this leaf.
5644                  */
5645         } else {
5646                 if (next_free > 1)
5647                         goto out;
5648
5649                 rec = &el->l_recs[0];
5650         }
5651
5652         if (rec) {
5653                 /*
5654                  * Check if we'll only be trimming off the end of this
5655                  * cluster.
5656                  */
5657                 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
5658                         goto out;
5659         }
5660
5661         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
5662         if (ret) {
5663                 mlog_errno(ret);
5664                 goto out;
5665         }
5666
5667         ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
5668         if (ret) {
5669                 mlog_errno(ret);
5670                 goto out;
5671         }
5672
5673         eb = (struct ocfs2_extent_block *) bh->b_data;
5674         el = &eb->h_list;
5675         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
5676                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
5677                 ret = -EROFS;
5678                 goto out;
5679         }
5680
5681         *new_last_eb = bh;
5682         get_bh(*new_last_eb);
5683         mlog(0, "returning block %llu, (cpos: %u)\n",
5684              (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
5685 out:
5686         brelse(bh);
5687
5688         return ret;
5689 }
5690
5691 /*
5692  * Trim some clusters off the rightmost edge of a tree. Only called
5693  * during truncate.
5694  *
5695  * The caller needs to:
5696  *   - start journaling of each path component.
5697  *   - compute and fully set up any new last ext block
5698  */
5699 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
5700                            handle_t *handle, struct ocfs2_truncate_context *tc,
5701                            u32 clusters_to_del, u64 *delete_start)
5702 {
5703         int ret, i, index = path->p_tree_depth;
5704         u32 new_edge = 0;
5705         u64 deleted_eb = 0;
5706         struct buffer_head *bh;
5707         struct ocfs2_extent_list *el;
5708         struct ocfs2_extent_rec *rec;
5709
5710         *delete_start = 0;
5711
5712         while (index >= 0) {
5713                 bh = path->p_node[index].bh;
5714                 el = path->p_node[index].el;
5715
5716                 mlog(0, "traveling tree (index = %d, block = %llu)\n",
5717                      index,  (unsigned long long)bh->b_blocknr);
5718
5719                 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
5720
5721                 if (index !=
5722                     (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
5723                         ocfs2_error(inode->i_sb,
5724                                     "Inode %lu has invalid ext. block %llu",
5725                                     inode->i_ino,
5726                                     (unsigned long long)bh->b_blocknr);
5727                         ret = -EROFS;
5728                         goto out;
5729                 }
5730
5731 find_tail_record:
5732                 i = le16_to_cpu(el->l_next_free_rec) - 1;
5733                 rec = &el->l_recs[i];
5734
5735                 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
5736                      "next = %u\n", i, le32_to_cpu(rec->e_cpos),
5737                      ocfs2_rec_clusters(el, rec),
5738                      (unsigned long long)le64_to_cpu(rec->e_blkno),
5739                      le16_to_cpu(el->l_next_free_rec));
5740
5741                 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
5742
5743                 if (le16_to_cpu(el->l_tree_depth) == 0) {
5744                         /*
5745                          * If the only record left in the leaf block is a
5746                          * single empty extent, we can just remove the
5747                          * block.
5748                          */
5749                         if (i == 0 && ocfs2_is_empty_extent(rec)) {
5750                                 memset(rec, 0,
5751                                        sizeof(struct ocfs2_extent_rec));
5752                                 el->l_next_free_rec = cpu_to_le16(0);
5753
5754                                 goto delete;
5755                         }
5756
5757                         /*
5758                          * Remove any empty extents by shifting things
5759                          * left. That should make life much easier on
5760                          * the code below. This condition is rare
5761                          * enough that we shouldn't see a performance
5762                          * hit.
5763                          */
5764                         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
5765                                 le16_add_cpu(&el->l_next_free_rec, -1);
5766
5767                                 for(i = 0;
5768                                     i < le16_to_cpu(el->l_next_free_rec); i++)
5769                                         el->l_recs[i] = el->l_recs[i + 1];
5770
5771                                 memset(&el->l_recs[i], 0,
5772                                        sizeof(struct ocfs2_extent_rec));
5773
5774                                 /*
5775                                  * We've modified our extent list. The
5776                                  * simplest way to handle this change
5777                                  * is to begin the search from the
5778                                  * start again.
5779                                  */
5780                                 goto find_tail_record;
5781                         }
5782
5783                         le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
5784
5785                         /*
5786                          * We'll use "new_edge" on our way back up the
5787                          * tree to know what our rightmost cpos is.
5788                          */
5789                         new_edge = le16_to_cpu(rec->e_leaf_clusters);
5790                         new_edge += le32_to_cpu(rec->e_cpos);
5791
5792                         /*
5793                          * The caller will use this to delete data blocks.
5794                          */
5795                         *delete_start = le64_to_cpu(rec->e_blkno)
5796                                 + ocfs2_clusters_to_blocks(inode->i_sb,
5797                                         le16_to_cpu(rec->e_leaf_clusters));
5798
5799                         /*
5800                          * If it's now empty, remove this record.
5801                          */
5802                         if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
5803                                 memset(rec, 0,
5804                                        sizeof(struct ocfs2_extent_rec));
5805                                 le16_add_cpu(&el->l_next_free_rec, -1);
5806                         }
5807                 } else {
5808                         if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
5809                                 memset(rec, 0,
5810                                        sizeof(struct ocfs2_extent_rec));
5811                                 le16_add_cpu(&el->l_next_free_rec, -1);
5812
5813                                 goto delete;
5814                         }
5815
5816                         /* Can this actually happen? */
5817                         if (le16_to_cpu(el->l_next_free_rec) == 0)
5818                                 goto delete;
5819
5820                         /*
5821                          * We never actually deleted any clusters
5822                          * because our leaf was empty. There's no
5823                          * reason to adjust the rightmost edge then.
5824                          */
5825                         if (new_edge == 0)
5826                                 goto delete;
5827
5828                         rec->e_int_clusters = cpu_to_le32(new_edge);
5829                         le32_add_cpu(&rec->e_int_clusters,
5830                                      -le32_to_cpu(rec->e_cpos));
5831
5832                          /*
5833                           * A deleted child record should have been
5834                           * caught above.
5835                           */
5836                          BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
5837                 }
5838
5839 delete:
5840                 ret = ocfs2_journal_dirty(handle, bh);
5841                 if (ret) {
5842                         mlog_errno(ret);
5843                         goto out;
5844                 }
5845
5846                 mlog(0, "extent list container %llu, after: record %d: "
5847                      "(%u, %u, %llu), next = %u.\n",
5848                      (unsigned long long)bh->b_blocknr, i,
5849                      le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
5850                      (unsigned long long)le64_to_cpu(rec->e_blkno),
5851                      le16_to_cpu(el->l_next_free_rec));
5852
5853                 /*
5854                  * We must be careful to only attempt delete of an
5855                  * extent block (and not the root inode block).
5856                  */
5857                 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
5858                         struct ocfs2_extent_block *eb =
5859                                 (struct ocfs2_extent_block *)bh->b_data;
5860
5861                         /*
5862                          * Save this for use when processing the
5863                          * parent block.
5864                          */
5865                         deleted_eb = le64_to_cpu(eb->h_blkno);
5866
5867                         mlog(0, "deleting this extent block.\n");
5868
5869                         ocfs2_remove_from_cache(inode, bh);
5870
5871                         BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
5872                         BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
5873                         BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
5874
5875                         ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
5876                         /* An error here is not fatal. */
5877                         if (ret < 0)
5878                                 mlog_errno(ret);
5879                 } else {
5880                         deleted_eb = 0;
5881                 }
5882
5883                 index--;
5884         }
5885
5886         ret = 0;
5887 out:
5888         return ret;
5889 }
5890
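/*
 * Remove @clusters_to_del clusters from the rightmost edge of the
 * tree within one transaction: journal the path and the new last
 * extent block, drop the in-memory and on-disk cluster counts, trim
 * the tree with ocfs2_trim_tree(), and record any freed data blocks
 * in the truncate log.
 */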
5891 static int ocfs2_do_truncate(struct ocfs2_super *osb,
5892                              unsigned int clusters_to_del,
5893                              struct inode *inode,
5894                              struct buffer_head *fe_bh,
5895                              handle_t *handle,
5896                              struct ocfs2_truncate_context *tc,
5897                              struct ocfs2_path *path)
5898 {
5899         int status;
5900         struct ocfs2_dinode *fe;
5901         struct ocfs2_extent_block *last_eb = NULL;
5902         struct ocfs2_extent_list *el;
5903         struct buffer_head *last_eb_bh = NULL;
5904         u64 delete_blk = 0;
5905
5906         fe = (struct ocfs2_dinode *) fe_bh->b_data;
5907
5908         status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
5909                                              path, &last_eb_bh);
5910         if (status < 0) {
5911                 mlog_errno(status);
5912                 goto bail;
5913         }
5914
5915         /*
5916          * Each component will be touched, so we might as well journal
5917          * here to avoid having to handle errors later.
5918          */
5919         status = ocfs2_journal_access_path(inode, handle, path);
5920         if (status < 0) {
5921                 mlog_errno(status);
5922                 goto bail;
5923         }
5924
5925         if (last_eb_bh) {
5926                 status = ocfs2_journal_access(handle, inode, last_eb_bh,
5927                                               OCFS2_JOURNAL_ACCESS_WRITE);
5928                 if (status < 0) {
5929                         mlog_errno(status);
5930                         goto bail;
5931                 }
5932
5933                 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5934         }
5935
5936         el = &(fe->id2.i_list);
5937
5938         /*
5939          * Lower levels depend on this never happening, but it's best
5940          * to check it up here before changing the tree.
5941          */
5942         if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
5943                 ocfs2_error(inode->i_sb,
5944                             "Inode %lu has an empty extent record, depth %u\n",
5945                             inode->i_ino, le16_to_cpu(el->l_tree_depth));
5946                 status = -EROFS;
5947                 goto bail;
5948         }
5949
5950         spin_lock(&OCFS2_I(inode)->ip_lock);
5951         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
5952                                       clusters_to_del;
5953         spin_unlock(&OCFS2_I(inode)->ip_lock);
5954         le32_add_cpu(&fe->i_clusters, -clusters_to_del);
5955         inode->i_blocks = ocfs2_inode_sector_count(inode);
5956
5957         status = ocfs2_trim_tree(inode, path, handle, tc,
5958                                  clusters_to_del, &delete_blk);
5959         if (status) {
5960                 mlog_errno(status);
5961                 goto bail;
5962         }
5963
5964         if (le32_to_cpu(fe->i_clusters) == 0) {
5965                 /* trunc to zero is a special case. */
5966                 el->l_tree_depth = 0;
5967                 fe->i_last_eb_blk = 0;
5968         } else if (last_eb)
5969                 fe->i_last_eb_blk = last_eb->h_blkno;
5970
5971         status = ocfs2_journal_dirty(handle, fe_bh);
5972         if (status < 0) {
5973                 mlog_errno(status);
5974                 goto bail;
5975         }
5976
5977         if (last_eb) {
5978                 /* If there will be a new last extent block, then by
5979                  * definition, there cannot be any leaves to the right of
5980                  * it. */
5981                 last_eb->h_next_leaf_blk = 0;
5982                 status = ocfs2_journal_dirty(handle, last_eb_bh);
5983                 if (status < 0) {
5984                         mlog_errno(status);
5985                         goto bail;
5986                 }
5987         }
5988
5989         if (delete_blk) {
5990                 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
5991                                                    clusters_to_del);
5992                 if (status < 0) {
5993                         mlog_errno(status);
5994                         goto bail;
5995                 }
5996         }
5997         status = 0;
5998 bail:
5999
6000         mlog_exit(status);
6001         return status;
6002 }
6003
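/*
 * Buffer callbacks for the walk_page_buffers() calls in
 * ocfs2_map_and_dirty_page(): mark each buffer uptodate and dirty.
 * The ordered variant also hands the buffer to the journal as
 * ordered data.
 */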
6004 static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
6005 {
6006         set_buffer_uptodate(bh);
6007         mark_buffer_dirty(bh);
6008         return 0;
6009 }
6010
6011 static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
6012 {
6013         set_buffer_uptodate(bh);
6014         mark_buffer_dirty(bh);
6015         return ocfs2_journal_dirty_data(handle, bh);
6016 }
6017
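/*
 * Map the buffers of @page starting at physical block *@phys,
 * optionally zero the bytes between @from and @to, then mark the
 * affected buffers uptodate and dirty, going through the journal
 * when the inode uses ordered data mode.
 */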
6018 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6019                                      unsigned int from, unsigned int to,
6020                                      struct page *page, int zero, u64 *phys)
6021 {
6022         int ret, partial = 0;
6023
6024         ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6025         if (ret)
6026                 mlog_errno(ret);
6027
6028         if (zero)
6029                 zero_user_segment(page, from, to);
6030
6031         /*
6032          * The buffers we zeroed need to be marked uptodate
6033          * here if they aren't already - ocfs2_map_page_blocks()
6034          * might have skipped some.
6035          */
6036         if (ocfs2_should_order_data(inode)) {
6037                 ret = walk_page_buffers(handle,
6038                                         page_buffers(page),
6039                                         from, to, &partial,
6040                                         ocfs2_ordered_zero_func);
6041                 if (ret < 0)
6042                         mlog_errno(ret);
6043         } else {
6044                 ret = walk_page_buffers(handle, page_buffers(page),
6045                                         from, to, &partial,
6046                                         ocfs2_writeback_zero_func);
6047                 if (ret < 0)
6048                         mlog_errno(ret);
6049         }
6050
6051         if (!partial)
6052                 SetPageUptodate(page);
6053
6054         flush_dcache_page(page);
6055 }
6056
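/*
 * Zero and dirty the byte range [@start, @end) across the already
 * locked @pages, which back the cluster at physical block @phys.
 * Only called on sparse-file-capable volumes; the pages are unlocked
 * and released before returning.
 */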
6057 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6058                                      loff_t end, struct page **pages,
6059                                      int numpages, u64 phys, handle_t *handle)
6060 {
6061         int i;
6062         struct page *page;
6063         unsigned int from, to = PAGE_CACHE_SIZE;
6064         struct super_block *sb = inode->i_sb;
6065
6066         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6067
6068         if (numpages == 0)
6069                 goto out;
6070
6071         to = PAGE_CACHE_SIZE;
6072         for(i = 0; i < numpages; i++) {
6073                 page = pages[i];
6074
6075                 from = start & (PAGE_CACHE_SIZE - 1);
6076                 if ((end >> PAGE_CACHE_SHIFT) == page->index)
6077                         to = end & (PAGE_CACHE_SIZE - 1);
6078
6079                 BUG_ON(from > PAGE_CACHE_SIZE);
6080                 BUG_ON(to > PAGE_CACHE_SIZE);
6081
6082                 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6083                                          &phys);
6084
6085                 start = (page->index + 1) << PAGE_CACHE_SHIFT;
6086         }
6087 out:
6088         if (pages)
6089                 ocfs2_unlock_and_free_pages(pages, numpages);
6090 }
6091
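/*
 * Lock and return the page cache pages covering [@start, @end). The
 * range must not cross a cluster boundary. On error, any pages
 * already grabbed are unlocked and released and *@num is set to
 * zero.
 */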
6092 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6093                                 struct page **pages, int *num)
6094 {
6095         int numpages, ret = 0;
6096         struct super_block *sb = inode->i_sb;
6097         struct address_space *mapping = inode->i_mapping;
6098         unsigned long index;
6099         loff_t last_page_bytes;
6100
6101         BUG_ON(start > end);
6102
6103         BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6104                (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6105
6106         numpages = 0;
6107         last_page_bytes = PAGE_ALIGN(end);
6108         index = start >> PAGE_CACHE_SHIFT;
6109         do {
6110                 pages[numpages] = grab_cache_page(mapping, index);
6111                 if (!pages[numpages]) {
6112                         ret = -ENOMEM;
6113                         mlog_errno(ret);
6114                         goto out;
6115                 }
6116
6117                 numpages++;
6118                 index++;
6119         } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6120
6121 out:
6122         if (ret != 0) {
6123                 if (pages)
6124                         ocfs2_unlock_and_free_pages(pages, numpages);
6125                 numpages = 0;
6126         }
6127
6128         *num = numpages;
6129
6130         return ret;
6131 }
6132
6133 /*
6134  * Zero the area past i_size but still within an allocated
6135  * cluster. This avoids exposing nonzero data on subsequent file
6136  * extends.
6137  *
6138  * We need to call this before i_size is updated on the inode because
6139  * otherwise block_write_full_page() will skip writeout of pages past
6140  * i_size. The range to zero is passed in explicitly for this reason.
6141  */
6142 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6143                                   u64 range_start, u64 range_end)
6144 {
6145         int ret = 0, numpages;
6146         struct page **pages = NULL;
6147         u64 phys;
6148         unsigned int ext_flags;
6149         struct super_block *sb = inode->i_sb;
6150
6151         /*
6152          * File systems which don't support sparse files zero on every
6153          * extend.
6154          */
6155         if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6156                 return 0;
6157
6158         pages = kcalloc(ocfs2_pages_per_cluster(sb),
6159                         sizeof(struct page *), GFP_NOFS);
6160         if (pages == NULL) {
6161                 ret = -ENOMEM;
6162                 mlog_errno(ret);
6163                 goto out;
6164         }
6165
6166         if (range_start == range_end)
6167                 goto out;
6168
6169         ret = ocfs2_extent_map_get_blocks(inode,
6170                                           range_start >> sb->s_blocksize_bits,
6171                                           &phys, NULL, &ext_flags);
6172         if (ret) {
6173                 mlog_errno(ret);
6174                 goto out;
6175         }
6176
6177         /*
6178          * Tail is a hole, or is marked unwritten. In either case, we
6179          * can count on read and write to return/push zeros.
6180          */
6181         if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6182                 goto out;
6183
6184         ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6185                                    &numpages);
6186         if (ret) {
6187                 mlog_errno(ret);
6188                 goto out;
6189         }
6190
6191         ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6192                                  numpages, phys, handle);
6193
6194         /*
6195          * Initiate writeout of the pages we zeroed here. We don't
6196          * wait on them - the truncate_inode_pages() call later will
6197          * do that for us.
6198          */
6199         ret = do_sync_mapping_range(inode->i_mapping, range_start,
6200                                     range_end - 1, SYNC_FILE_RANGE_WRITE);
6201         if (ret)
6202                 mlog_errno(ret);
6203
6204 out:
6205         if (pages)
6206                 kfree(pages);
6207
6208         return ret;
6209 }
6210
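/*
 * Clear everything from the start of the id2 union to the end of the
 * inode block so that a new inline data area or extent list starts
 * from a clean slate.
 */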
6211 static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di)
6212 {
6213         unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6214
6215         memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2));
6216 }
6217
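/*
 * Replace whatever was stored in the id2 union with an empty extent
 * list sized to fill the inode block.
 */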
6218 void ocfs2_dinode_new_extent_list(struct inode *inode,
6219                                   struct ocfs2_dinode *di)
6220 {
6221         ocfs2_zero_dinode_id2(inode, di);
6222         di->id2.i_list.l_tree_depth = 0;
6223         di->id2.i_list.l_next_free_rec = 0;
6224         di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb));
6225 }
6226
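/*
 * Switch @inode over to inline data storage: set the dynamic feature
 * bit in memory and on disk, clear the id2 union and set id_count to
 * the maximum amount of inline data the inode block can hold.
 */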
6227 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6228 {
6229         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6230         struct ocfs2_inline_data *idata = &di->id2.i_data;
6231
6232         spin_lock(&oi->ip_lock);
6233         oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
6234         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6235         spin_unlock(&oi->ip_lock);
6236
6237         /*
6238          * We clear the entire i_data structure here so that all
6239          * fields can be properly initialized.
6240          */
6241         ocfs2_zero_dinode_id2(inode, di);
6242
6243         idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb));
6244 }
6245
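/*
 * Convert an inline-data inode to an extent-list based one. When the
 * file has data, a cluster is reserved and claimed, the inline bytes
 * are copied into the page cache over the new allocation and the
 * pages are dirtied through the journal; the inline data feature bit
 * is then cleared, the dinode gets an empty extent list and the new
 * extent is inserted into it.
 */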
6246 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6247                                          struct buffer_head *di_bh)
6248 {
6249         int ret, i, has_data, num_pages = 0;
6250         handle_t *handle;
6251         u64 uninitialized_var(block);
6252         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6253         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6254         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6255         struct ocfs2_alloc_context *data_ac = NULL;
6256         struct page **pages = NULL;
6257         loff_t end = osb->s_clustersize;
6258
6259         has_data = i_size_read(inode) ? 1 : 0;
6260
6261         if (has_data) {
6262                 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
6263                                 sizeof(struct page *), GFP_NOFS);
6264                 if (pages == NULL) {
6265                         ret = -ENOMEM;
6266                         mlog_errno(ret);
6267                         goto out;
6268                 }
6269
6270                 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
6271                 if (ret) {
6272                         mlog_errno(ret);
6273                         goto out;
6274                 }
6275         }
6276
6277         handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
6278         if (IS_ERR(handle)) {
6279                 ret = PTR_ERR(handle);
6280                 mlog_errno(ret);
6281                 goto out_unlock;
6282         }
6283
6284         ret = ocfs2_journal_access(handle, inode, di_bh,
6285                                    OCFS2_JOURNAL_ACCESS_WRITE);
6286         if (ret) {
6287                 mlog_errno(ret);
6288                 goto out_commit;
6289         }
6290
6291         if (has_data) {
6292                 u32 bit_off, num;
6293                 unsigned int page_end;
6294                 u64 phys;
6295
6296                 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
6297                                            &num);
6298                 if (ret) {
6299                         mlog_errno(ret);
6300                         goto out_commit;
6301                 }
6302
6303                 /*
6304                  * Save two copies, one for insert, and one that can
6305                  * be changed by ocfs2_map_and_dirty_page() below.
6306                  */
6307                 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
6308
6309                 /*
6310                  * Non sparse file systems zero on extend, so no need
6311                  * to do that now.
6312                  */
6313                 if (!ocfs2_sparse_alloc(osb) &&
6314                     PAGE_CACHE_SIZE < osb->s_clustersize)
6315                         end = PAGE_CACHE_SIZE;
6316
6317                 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
6318                 if (ret) {
6319                         mlog_errno(ret);
6320                         goto out_commit;
6321                 }
6322
6323                 /*
6324                  * This should populate the 1st page for us and mark
6325                  * it up to date.
6326                  */
6327                 ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
6328                 if (ret) {
6329                         mlog_errno(ret);
6330                         goto out_commit;
6331                 }
6332
6333                 page_end = PAGE_CACHE_SIZE;
6334                 if (PAGE_CACHE_SIZE > osb->s_clustersize)
6335                         page_end = osb->s_clustersize;
6336
6337                 for (i = 0; i < num_pages; i++)
6338                         ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
6339                                                  pages[i], i > 0, &phys);
6340         }
6341
6342         spin_lock(&oi->ip_lock);
6343         oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
6344         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6345         spin_unlock(&oi->ip_lock);
6346
6347         ocfs2_dinode_new_extent_list(inode, di);
6348
6349         ocfs2_journal_dirty(handle, di_bh);
6350
6351         if (has_data) {
6352                 /*
6353                  * An error at this point should be extremely rare. If
6354                  * this proves to be false, we could always re-build
6355                  * the in-inode data from our pages.
6356                  */
6357                 ret = ocfs2_insert_extent(osb, handle, inode, di_bh,
6358                                           0, block, 1, 0, NULL);
6359                 if (ret) {
6360                         mlog_errno(ret);
6361                         goto out_commit;
6362                 }
6363
6364                 inode->i_blocks = ocfs2_inode_sector_count(inode);
6365         }
6366
6367 out_commit:
6368         ocfs2_commit_trans(osb, handle);
6369
6370 out_unlock:
6371         if (data_ac)
6372                 ocfs2_free_alloc_context(data_ac);
6373
6374 out:
6375         if (pages) {
6376                 ocfs2_unlock_and_free_pages(pages, num_pages);
6377                 kfree(pages);
6378         }
6379
6380         return ret;
6381 }
6382
6383 /*
6384  * It is expected that, by the time you call this function,
6385  * inode->i_size and fe->i_size have been adjusted.
6386  *
6387  * WARNING: This will kfree the truncate context
6388  */
6389 int ocfs2_commit_truncate(struct ocfs2_super *osb,
6390                           struct inode *inode,
6391                           struct buffer_head *fe_bh,
6392                           struct ocfs2_truncate_context *tc)
6393 {
6394         int status, i, credits, tl_sem = 0;
6395         u32 clusters_to_del, new_highest_cpos, range;
6396         struct ocfs2_extent_list *el;
6397         handle_t *handle = NULL;
6398         struct inode *tl_inode = osb->osb_tl_inode;
6399         struct ocfs2_path *path = NULL;
6400
6401         mlog_entry_void();
6402
6403         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
6404                                                      i_size_read(inode));
6405
6406         path = ocfs2_new_inode_path(fe_bh);
6407         if (!path) {
6408                 status = -ENOMEM;
6409                 mlog_errno(status);
6410                 goto bail;
6411         }
6412
6413         ocfs2_extent_map_trunc(inode, new_highest_cpos);
6414
6415 start:
6416         /*
6417          * Check that we still have allocation to delete.
6418          */
6419         if (OCFS2_I(inode)->ip_clusters == 0) {
6420                 status = 0;
6421                 goto bail;
6422         }
6423
6424         /*
6425          * Truncate always works against the rightmost tree branch.
6426          */
6427         status = ocfs2_find_path(inode, path, UINT_MAX);
6428         if (status) {
6429                 mlog_errno(status);
6430                 goto bail;
6431         }
6432
6433         mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
6434              OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
6435
6436         /*
6437          * By now, el will point to the extent list on the bottom most
6438          * portion of this tree. Only the tail record is considered in
6439          * each pass.
6440          *
6441          * We handle the following cases, in order:
6442          * - empty extent: delete the remaining branch
6443          * - remove the entire record
6444          * - remove a partial record
6445          * - no record needs to be removed (truncate has completed)
6446          */
6447         el = path_leaf_el(path);
6448         if (le16_to_cpu(el->l_next_free_rec) == 0) {
6449                 ocfs2_error(inode->i_sb,
6450                             "Inode %llu has empty extent block at %llu\n",
6451                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
6452                             (unsigned long long)path_leaf_bh(path)->b_blocknr);
6453                 status = -EROFS;
6454                 goto bail;
6455         }
6456
6457         i = le16_to_cpu(el->l_next_free_rec) - 1;
6458         range = le32_to_cpu(el->l_recs[i].e_cpos) +
6459                 ocfs2_rec_clusters(el, &el->l_recs[i]);
6460         if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
6461                 clusters_to_del = 0;
6462         } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
6463                 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
6464         } else if (range > new_highest_cpos) {
6465                 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
6466                                    le32_to_cpu(el->l_recs[i].e_cpos)) -
6467                                   new_highest_cpos;
6468         } else {
6469                 status = 0;
6470                 goto bail;
6471         }
6472
6473         mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
6474              clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
6475
6476         mutex_lock(&tl_inode->i_mutex);
6477         tl_sem = 1;
6478         /* ocfs2_truncate_log_needs_flush guarantees us at least one
6479          * record is free for use. If there isn't any, we flush to get
6480          * an empty truncate log.  */
6481         if (ocfs2_truncate_log_needs_flush(osb)) {
6482                 status = __ocfs2_flush_truncate_log(osb);
6483                 if (status < 0) {
6484                         mlog_errno(status);
6485                         goto bail;
6486                 }
6487         }
6488
6489         credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
6490                                                 (struct ocfs2_dinode *)fe_bh->b_data,
6491                                                 el);
6492         handle = ocfs2_start_trans(osb, credits);
6493         if (IS_ERR(handle)) {
6494                 status = PTR_ERR(handle);
6495                 handle = NULL;
6496                 mlog_errno(status);
6497                 goto bail;
6498         }
6499
6500         status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
6501                                    tc, path);
6502         if (status < 0) {
6503                 mlog_errno(status);
6504                 goto bail;
6505         }
6506
6507         mutex_unlock(&tl_inode->i_mutex);
6508         tl_sem = 0;
6509
6510         ocfs2_commit_trans(osb, handle);
6511         handle = NULL;
6512
6513         ocfs2_reinit_path(path, 1);
6514
6515         /*
6516          * The check above will catch the case where we've truncated
6517          * away all allocation.
6518          */
6519         goto start;
6520
6521 bail:
6522
6523         ocfs2_schedule_truncate_log_flush(osb, 1);
6524
6525         if (tl_sem)
6526                 mutex_unlock(&tl_inode->i_mutex);
6527
6528         if (handle)
6529                 ocfs2_commit_trans(osb, handle);
6530
6531         ocfs2_run_deallocs(osb, &tc->tc_dealloc);
6532
6533         ocfs2_free_path(path);
6534
6535         /* This will drop the ext_alloc cluster lock for us */
6536         ocfs2_free_truncate_context(tc);
6537
6538         mlog_exit(status);
6539         return status;
6540 }
6541
6542 /*
6543  * Expects the inode to already be locked.
6544  */
6545 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
6546                            struct inode *inode,
6547                            struct buffer_head *fe_bh,
6548                            struct ocfs2_truncate_context **tc)
6549 {
6550         int status;
6551         unsigned int new_i_clusters;
6552         struct ocfs2_dinode *fe;
6553         struct ocfs2_extent_block *eb;
6554         struct buffer_head *last_eb_bh = NULL;
6555
6556         mlog_entry_void();
6557
6558         *tc = NULL;
6559
6560         new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
6561                                                   i_size_read(inode));
6562         fe = (struct ocfs2_dinode *) fe_bh->b_data;
6563
6564         mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
6565              "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
6566              (unsigned long long)le64_to_cpu(fe->i_size));
6567
6568         *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
6569         if (!(*tc)) {
6570                 status = -ENOMEM;
6571                 mlog_errno(status);
6572                 goto bail;
6573         }
6574         ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
6575
6576         if (fe->id2.i_list.l_tree_depth) {
6577                 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
6578                                           &last_eb_bh, OCFS2_BH_CACHED, inode);
6579                 if (status < 0) {
6580                         mlog_errno(status);
6581                         goto bail;
6582                 }
6583                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6584                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
6585                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
6586
6587                         brelse(last_eb_bh);
6588                         status = -EIO;
6589                         goto bail;
6590                 }
6591         }
6592
6593         (*tc)->tc_last_eb_bh = last_eb_bh;
6594
6595         status = 0;
6596 bail:
6597         if (status < 0) {
6598                 if (*tc)
6599                         ocfs2_free_truncate_context(*tc);
6600                 *tc = NULL;
6601         }
6602         mlog_exit_void();
6603         return status;
6604 }
6605
6606 /*
6607  * 'start' is inclusive, 'end' is not.
6608  */
6609 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
6610                           unsigned int start, unsigned int end, int trunc)
6611 {
6612         int ret;
6613         unsigned int numbytes;
6614         handle_t *handle;
6615         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6616         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6617         struct ocfs2_inline_data *idata = &di->id2.i_data;
6618
6619         if (end > i_size_read(inode))
6620                 end = i_size_read(inode);
6621
6622         BUG_ON(start >= end);
6623
6624         if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
6625             !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
6626             !ocfs2_supports_inline_data(osb)) {
6627                 ocfs2_error(inode->i_sb,
6628                             "Inline data flags for inode %llu don't agree! "
6629                             "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
6630                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
6631                             le16_to_cpu(di->i_dyn_features),
6632                             OCFS2_I(inode)->ip_dyn_features,
6633                             osb->s_feature_incompat);
6634                 ret = -EROFS;
6635                 goto out;
6636         }
6637
6638         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
6639         if (IS_ERR(handle)) {
6640                 ret = PTR_ERR(handle);
6641                 mlog_errno(ret);
6642                 goto out;
6643         }
6644
6645         ret = ocfs2_journal_access(handle, inode, di_bh,
6646                                    OCFS2_JOURNAL_ACCESS_WRITE);
6647         if (ret) {
6648                 mlog_errno(ret);
6649                 goto out_commit;
6650         }
6651
6652         numbytes = end - start;
6653         memset(idata->id_data + start, 0, numbytes);
6654
6655         /*
6656          * No need to worry about the data page here - it's been
6657          * truncated already and inline data doesn't need it for
6658          * pushing zeros to disk, so we'll let readpage pick it up
6659          * later.
6660          */
6661         if (trunc) {
6662                 i_size_write(inode, start);
6663                 di->i_size = cpu_to_le64(start);
6664         }
6665
6666         inode->i_blocks = ocfs2_inode_sector_count(inode);
6667         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
6668
6669         di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
6670         di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
6671
6672         ocfs2_journal_dirty(handle, di_bh);
6673
6674 out_commit:
6675         ocfs2_commit_trans(osb, handle);
6676
6677 out:
6678         return ret;
6679 }
6680
6681 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
6682 {
6683         /*
6684          * The caller is responsible for completing deallocation
6685          * before freeing the context.
6686          */
6687         if (tc->tc_dealloc.c_first_suballocator != NULL)
6688                 mlog(ML_NOTICE,
6689                      "Truncate completion has non-empty dealloc context\n");
6690
6691         if (tc->tc_last_eb_bh)
6692                 brelse(tc->tc_last_eb_bh);
6693
6694         kfree(tc);
6695 }