]> Pileus Git - ~andy/linux/blob - fs/xfs/xfs_da_btree.c
sit: rename rtnl functions for consistency
[~andy/linux] / fs / xfs / xfs_da_btree.c
1 /*
2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_types.h"
21 #include "xfs_bit.h"
22 #include "xfs_log.h"
23 #include "xfs_trans.h"
24 #include "xfs_sb.h"
25 #include "xfs_ag.h"
26 #include "xfs_mount.h"
27 #include "xfs_da_btree.h"
28 #include "xfs_bmap_btree.h"
29 #include "xfs_dir2.h"
30 #include "xfs_dir2_format.h"
31 #include "xfs_dir2_priv.h"
32 #include "xfs_dinode.h"
33 #include "xfs_inode.h"
34 #include "xfs_inode_item.h"
35 #include "xfs_alloc.h"
36 #include "xfs_bmap.h"
37 #include "xfs_attr.h"
38 #include "xfs_attr_leaf.h"
39 #include "xfs_error.h"
40 #include "xfs_trace.h"
41
42 /*
43  * xfs_da_btree.c
44  *
45  * Routines to implement directories as Btrees of hashed names.
46  */
47
48 /*========================================================================
49  * Function prototypes for the kernel.
50  *========================================================================*/
51
52 /*
53  * Routines used for growing the Btree.
54  */
55 STATIC int xfs_da_root_split(xfs_da_state_t *state,
56                                             xfs_da_state_blk_t *existing_root,
57                                             xfs_da_state_blk_t *new_child);
58 STATIC int xfs_da_node_split(xfs_da_state_t *state,
59                                             xfs_da_state_blk_t *existing_blk,
60                                             xfs_da_state_blk_t *split_blk,
61                                             xfs_da_state_blk_t *blk_to_add,
62                                             int treelevel,
63                                             int *result);
64 STATIC void xfs_da_node_rebalance(xfs_da_state_t *state,
65                                          xfs_da_state_blk_t *node_blk_1,
66                                          xfs_da_state_blk_t *node_blk_2);
67 STATIC void xfs_da_node_add(xfs_da_state_t *state,
68                                    xfs_da_state_blk_t *old_node_blk,
69                                    xfs_da_state_blk_t *new_node_blk);
70
71 /*
72  * Routines used for shrinking the Btree.
73  */
74 STATIC int xfs_da_root_join(xfs_da_state_t *state,
75                                            xfs_da_state_blk_t *root_blk);
76 STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval);
77 STATIC void xfs_da_node_remove(xfs_da_state_t *state,
78                                               xfs_da_state_blk_t *drop_blk);
79 STATIC void xfs_da_node_unbalance(xfs_da_state_t *state,
80                                          xfs_da_state_blk_t *src_node_blk,
81                                          xfs_da_state_blk_t *dst_node_blk);
82
83 /*
84  * Utility routines.
85  */
86 STATIC uint     xfs_da_node_lasthash(struct xfs_buf *bp, int *count);
87 STATIC int      xfs_da_node_order(struct xfs_buf *node1_bp,
88                                   struct xfs_buf *node2_bp);
89 STATIC int      xfs_da_blk_unlink(xfs_da_state_t *state,
90                                   xfs_da_state_blk_t *drop_blk,
91                                   xfs_da_state_blk_t *save_blk);
92 STATIC void     xfs_da_state_kill_altpath(xfs_da_state_t *state);
93
94 /*========================================================================
95  * Routines used for growing the Btree.
96  *========================================================================*/
97
98 /*
99  * Create the initial contents of an intermediate node.
100  */
101 int
102 xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
103                                  struct xfs_buf **bpp, int whichfork)
104 {
105         xfs_da_intnode_t *node;
106         struct xfs_buf *bp;
107         int error;
108         xfs_trans_t *tp;
109
110         trace_xfs_da_node_create(args);
111
112         tp = args->trans;
113         error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork);
114         if (error)
115                 return(error);
116         ASSERT(bp != NULL);
117         node = bp->b_addr;
118         node->hdr.info.forw = 0;
119         node->hdr.info.back = 0;
120         node->hdr.info.magic = cpu_to_be16(XFS_DA_NODE_MAGIC);
121         node->hdr.info.pad = 0;
122         node->hdr.count = 0;
123         node->hdr.level = cpu_to_be16(level);
124
125         xfs_trans_log_buf(tp, bp,
126                 XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
127
128         *bpp = bp;
129         return(0);
130 }
131
132 /*
133  * Split a leaf node, rebalance, then possibly split
134  * intermediate nodes, rebalance, etc.
135  */
136 int                                                     /* error */
137 xfs_da_split(xfs_da_state_t *state)
138 {
139         xfs_da_state_blk_t *oldblk, *newblk, *addblk;
140         xfs_da_intnode_t *node;
141         struct xfs_buf *bp;
142         int max, action, error, i;
143
144         trace_xfs_da_split(state->args);
145
146         /*
147          * Walk back up the tree splitting/inserting/adjusting as necessary.
148          * If we need to insert and there isn't room, split the node, then
149          * decide which fragment to insert the new block from below into.
150          * Note that we may split the root this way, but we need more fixup.
151          */
152         max = state->path.active - 1;
153         ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
154         ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
155                state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
156
157         addblk = &state->path.blk[max];         /* initial dummy value */
158         for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
159                 oldblk = &state->path.blk[i];
160                 newblk = &state->altpath.blk[i];
161
162                 /*
163                  * If a leaf node then
164                  *     Allocate a new leaf node, then rebalance across them.
165                  * else if an intermediate node then
166                  *     We split on the last layer, must we split the node?
167                  */
168                 switch (oldblk->magic) {
169                 case XFS_ATTR_LEAF_MAGIC:
170                         error = xfs_attr_leaf_split(state, oldblk, newblk);
171                         if ((error != 0) && (error != ENOSPC)) {
172                                 return(error);  /* GROT: attr is inconsistent */
173                         }
174                         if (!error) {
175                                 addblk = newblk;
176                                 break;
177                         }
178                         /*
179                          * Entry wouldn't fit, split the leaf again.
180                          */
181                         state->extravalid = 1;
182                         if (state->inleaf) {
183                                 state->extraafter = 0;  /* before newblk */
184                                 trace_xfs_attr_leaf_split_before(state->args);
185                                 error = xfs_attr_leaf_split(state, oldblk,
186                                                             &state->extrablk);
187                         } else {
188                                 state->extraafter = 1;  /* after newblk */
189                                 trace_xfs_attr_leaf_split_after(state->args);
190                                 error = xfs_attr_leaf_split(state, newblk,
191                                                             &state->extrablk);
192                         }
193                         if (error)
194                                 return(error);  /* GROT: attr inconsistent */
195                         addblk = newblk;
196                         break;
197                 case XFS_DIR2_LEAFN_MAGIC:
198                         error = xfs_dir2_leafn_split(state, oldblk, newblk);
199                         if (error)
200                                 return error;
201                         addblk = newblk;
202                         break;
203                 case XFS_DA_NODE_MAGIC:
204                         error = xfs_da_node_split(state, oldblk, newblk, addblk,
205                                                          max - i, &action);
206                         addblk->bp = NULL;
207                         if (error)
208                                 return(error);  /* GROT: dir is inconsistent */
209                         /*
210                          * Record the newly split block for the next time thru?
211                          */
212                         if (action)
213                                 addblk = newblk;
214                         else
215                                 addblk = NULL;
216                         break;
217                 }
218
219                 /*
220                  * Update the btree to show the new hashval for this child.
221                  */
222                 xfs_da_fixhashpath(state, &state->path);
223         }
224         if (!addblk)
225                 return(0);
226
227         /*
228          * Split the root node.
229          */
230         ASSERT(state->path.active == 0);
231         oldblk = &state->path.blk[0];
232         error = xfs_da_root_split(state, oldblk, addblk);
233         if (error) {
234                 addblk->bp = NULL;
235                 return(error);  /* GROT: dir is inconsistent */
236         }
237
238         /*
239          * Update pointers to the node which used to be block 0 and
240          * just got bumped because of the addition of a new root node.
241          * There might be three blocks involved if a double split occurred,
242          * and the original block 0 could be at any position in the list.
243          */
244
245         node = oldblk->bp->b_addr;
246         if (node->hdr.info.forw) {
247                 if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
248                         bp = addblk->bp;
249                 } else {
250                         ASSERT(state->extravalid);
251                         bp = state->extrablk.bp;
252                 }
253                 node = bp->b_addr;
254                 node->hdr.info.back = cpu_to_be32(oldblk->blkno);
255                 xfs_trans_log_buf(state->args->trans, bp,
256                     XFS_DA_LOGRANGE(node, &node->hdr.info,
257                     sizeof(node->hdr.info)));
258         }
259         node = oldblk->bp->b_addr;
260         if (node->hdr.info.back) {
261                 if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
262                         bp = addblk->bp;
263                 } else {
264                         ASSERT(state->extravalid);
265                         bp = state->extrablk.bp;
266                 }
267                 node = bp->b_addr;
268                 node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
269                 xfs_trans_log_buf(state->args->trans, bp,
270                     XFS_DA_LOGRANGE(node, &node->hdr.info,
271                     sizeof(node->hdr.info)));
272         }
273         addblk->bp = NULL;
274         return(0);
275 }
276
277 /*
278  * Split the root.  We have to create a new root and point to the two
279  * parts (the split old root) that we just created.  Copy block zero to
280  * the EOF, extending the inode in process.
281  */
282 STATIC int                                              /* error */
283 xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
284                                  xfs_da_state_blk_t *blk2)
285 {
286         xfs_da_intnode_t *node, *oldroot;
287         xfs_da_args_t *args;
288         xfs_dablk_t blkno;
289         struct xfs_buf *bp;
290         int error, size;
291         xfs_inode_t *dp;
292         xfs_trans_t *tp;
293         xfs_mount_t *mp;
294         xfs_dir2_leaf_t *leaf;
295
296         trace_xfs_da_root_split(state->args);
297
298         /*
299          * Copy the existing (incorrect) block from the root node position
300          * to a free space somewhere.
301          */
302         args = state->args;
303         ASSERT(args != NULL);
304         error = xfs_da_grow_inode(args, &blkno);
305         if (error)
306                 return(error);
307         dp = args->dp;
308         tp = args->trans;
309         mp = state->mp;
310         error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
311         if (error)
312                 return(error);
313         ASSERT(bp != NULL);
314         node = bp->b_addr;
315         oldroot = blk1->bp->b_addr;
316         if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) {
317                 size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] -
318                              (char *)oldroot);
319         } else {
320                 ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
321                 leaf = (xfs_dir2_leaf_t *)oldroot;
322                 size = (int)((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] -
323                              (char *)leaf);
324         }
325         memcpy(node, oldroot, size);
326         xfs_trans_log_buf(tp, bp, 0, size - 1);
327         blk1->bp = bp;
328         blk1->blkno = blkno;
329
330         /*
331          * Set up the new root node.
332          */
333         error = xfs_da_node_create(args,
334                 (args->whichfork == XFS_DATA_FORK) ? mp->m_dirleafblk : 0,
335                 be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork);
336         if (error)
337                 return(error);
338         node = bp->b_addr;
339         node->btree[0].hashval = cpu_to_be32(blk1->hashval);
340         node->btree[0].before = cpu_to_be32(blk1->blkno);
341         node->btree[1].hashval = cpu_to_be32(blk2->hashval);
342         node->btree[1].before = cpu_to_be32(blk2->blkno);
343         node->hdr.count = cpu_to_be16(2);
344
345 #ifdef DEBUG
346         if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) {
347                 ASSERT(blk1->blkno >= mp->m_dirleafblk &&
348                        blk1->blkno < mp->m_dirfreeblk);
349                 ASSERT(blk2->blkno >= mp->m_dirleafblk &&
350                        blk2->blkno < mp->m_dirfreeblk);
351         }
352 #endif
353
354         /* Header is already logged by xfs_da_node_create */
355         xfs_trans_log_buf(tp, bp,
356                 XFS_DA_LOGRANGE(node, node->btree,
357                         sizeof(xfs_da_node_entry_t) * 2));
358
359         return(0);
360 }
361
362 /*
363  * Split the node, rebalance, then add the new entry.
364  */
365 STATIC int                                              /* error */
366 xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
367                                  xfs_da_state_blk_t *newblk,
368                                  xfs_da_state_blk_t *addblk,
369                                  int treelevel, int *result)
370 {
371         xfs_da_intnode_t *node;
372         xfs_dablk_t blkno;
373         int newcount, error;
374         int useextra;
375
376         trace_xfs_da_node_split(state->args);
377
378         node = oldblk->bp->b_addr;
379         ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
380
381         /*
382          * With V2 dirs the extra block is data or freespace.
383          */
384         useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK;
385         newcount = 1 + useextra;
386         /*
387          * Do we have to split the node?
388          */
389         if ((be16_to_cpu(node->hdr.count) + newcount) > state->node_ents) {
390                 /*
391                  * Allocate a new node, add to the doubly linked chain of
392                  * nodes, then move some of our excess entries into it.
393                  */
394                 error = xfs_da_grow_inode(state->args, &blkno);
395                 if (error)
396                         return(error);  /* GROT: dir is inconsistent */
397
398                 error = xfs_da_node_create(state->args, blkno, treelevel,
399                                            &newblk->bp, state->args->whichfork);
400                 if (error)
401                         return(error);  /* GROT: dir is inconsistent */
402                 newblk->blkno = blkno;
403                 newblk->magic = XFS_DA_NODE_MAGIC;
404                 xfs_da_node_rebalance(state, oldblk, newblk);
405                 error = xfs_da_blk_link(state, oldblk, newblk);
406                 if (error)
407                         return(error);
408                 *result = 1;
409         } else {
410                 *result = 0;
411         }
412
413         /*
414          * Insert the new entry(s) into the correct block
415          * (updating last hashval in the process).
416          *
417          * xfs_da_node_add() inserts BEFORE the given index,
418          * and as a result of using node_lookup_int() we always
419          * point to a valid entry (not after one), but a split
420          * operation always results in a new block whose hashvals
421          * FOLLOW the current block.
422          *
423          * If we had double-split op below us, then add the extra block too.
424          */
425         node = oldblk->bp->b_addr;
426         if (oldblk->index <= be16_to_cpu(node->hdr.count)) {
427                 oldblk->index++;
428                 xfs_da_node_add(state, oldblk, addblk);
429                 if (useextra) {
430                         if (state->extraafter)
431                                 oldblk->index++;
432                         xfs_da_node_add(state, oldblk, &state->extrablk);
433                         state->extravalid = 0;
434                 }
435         } else {
436                 newblk->index++;
437                 xfs_da_node_add(state, newblk, addblk);
438                 if (useextra) {
439                         if (state->extraafter)
440                                 newblk->index++;
441                         xfs_da_node_add(state, newblk, &state->extrablk);
442                         state->extravalid = 0;
443                 }
444         }
445
446         return(0);
447 }
448
449 /*
450  * Balance the btree elements between two intermediate nodes,
451  * usually one full and one empty.
452  *
453  * NOTE: if blk2 is empty, then it will get the upper half of blk1.
454  */
455 STATIC void
456 xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
457                                      xfs_da_state_blk_t *blk2)
458 {
459         xfs_da_intnode_t *node1, *node2, *tmpnode;
460         xfs_da_node_entry_t *btree_s, *btree_d;
461         int count, tmp;
462         xfs_trans_t *tp;
463
464         trace_xfs_da_node_rebalance(state->args);
465
466         node1 = blk1->bp->b_addr;
467         node2 = blk2->bp->b_addr;
468         /*
469          * Figure out how many entries need to move, and in which direction.
470          * Swap the nodes around if that makes it simpler.
471          */
472         if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) &&
473             ((be32_to_cpu(node2->btree[0].hashval) < be32_to_cpu(node1->btree[0].hashval)) ||
474              (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) <
475               be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) {
476                 tmpnode = node1;
477                 node1 = node2;
478                 node2 = tmpnode;
479         }
480         ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
481         ASSERT(node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
482         count = (be16_to_cpu(node1->hdr.count) - be16_to_cpu(node2->hdr.count)) / 2;
483         if (count == 0)
484                 return;
485         tp = state->args->trans;
486         /*
487          * Two cases: high-to-low and low-to-high.
488          */
489         if (count > 0) {
490                 /*
491                  * Move elements in node2 up to make a hole.
492                  */
493                 if ((tmp = be16_to_cpu(node2->hdr.count)) > 0) {
494                         tmp *= (uint)sizeof(xfs_da_node_entry_t);
495                         btree_s = &node2->btree[0];
496                         btree_d = &node2->btree[count];
497                         memmove(btree_d, btree_s, tmp);
498                 }
499
500                 /*
501                  * Move the req'd B-tree elements from high in node1 to
502                  * low in node2.
503                  */
504                 be16_add_cpu(&node2->hdr.count, count);
505                 tmp = count * (uint)sizeof(xfs_da_node_entry_t);
506                 btree_s = &node1->btree[be16_to_cpu(node1->hdr.count) - count];
507                 btree_d = &node2->btree[0];
508                 memcpy(btree_d, btree_s, tmp);
509                 be16_add_cpu(&node1->hdr.count, -count);
510         } else {
511                 /*
512                  * Move the req'd B-tree elements from low in node2 to
513                  * high in node1.
514                  */
515                 count = -count;
516                 tmp = count * (uint)sizeof(xfs_da_node_entry_t);
517                 btree_s = &node2->btree[0];
518                 btree_d = &node1->btree[be16_to_cpu(node1->hdr.count)];
519                 memcpy(btree_d, btree_s, tmp);
520                 be16_add_cpu(&node1->hdr.count, count);
521                 xfs_trans_log_buf(tp, blk1->bp,
522                         XFS_DA_LOGRANGE(node1, btree_d, tmp));
523
524                 /*
525                  * Move elements in node2 down to fill the hole.
526                  */
527                 tmp  = be16_to_cpu(node2->hdr.count) - count;
528                 tmp *= (uint)sizeof(xfs_da_node_entry_t);
529                 btree_s = &node2->btree[count];
530                 btree_d = &node2->btree[0];
531                 memmove(btree_d, btree_s, tmp);
532                 be16_add_cpu(&node2->hdr.count, -count);
533         }
534
535         /*
536          * Log header of node 1 and all current bits of node 2.
537          */
538         xfs_trans_log_buf(tp, blk1->bp,
539                 XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr)));
540         xfs_trans_log_buf(tp, blk2->bp,
541                 XFS_DA_LOGRANGE(node2, &node2->hdr,
542                         sizeof(node2->hdr) +
543                         sizeof(node2->btree[0]) * be16_to_cpu(node2->hdr.count)));
544
545         /*
546          * Record the last hashval from each block for upward propagation.
547          * (note: don't use the swapped node pointers)
548          */
549         node1 = blk1->bp->b_addr;
550         node2 = blk2->bp->b_addr;
551         blk1->hashval = be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval);
552         blk2->hashval = be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval);
553
554         /*
555          * Adjust the expected index for insertion.
556          */
557         if (blk1->index >= be16_to_cpu(node1->hdr.count)) {
558                 blk2->index = blk1->index - be16_to_cpu(node1->hdr.count);
559                 blk1->index = be16_to_cpu(node1->hdr.count) + 1;        /* make it invalid */
560         }
561 }
562
563 /*
564  * Add a new entry to an intermediate node.
565  */
566 STATIC void
567 xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
568                                xfs_da_state_blk_t *newblk)
569 {
570         xfs_da_intnode_t *node;
571         xfs_da_node_entry_t *btree;
572         int tmp;
573
574         trace_xfs_da_node_add(state->args);
575
576         node = oldblk->bp->b_addr;
577         ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
578         ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
579         ASSERT(newblk->blkno != 0);
580         if (state->args->whichfork == XFS_DATA_FORK)
581                 ASSERT(newblk->blkno >= state->mp->m_dirleafblk &&
582                        newblk->blkno < state->mp->m_dirfreeblk);
583
584         /*
585          * We may need to make some room before we insert the new node.
586          */
587         tmp = 0;
588         btree = &node->btree[ oldblk->index ];
589         if (oldblk->index < be16_to_cpu(node->hdr.count)) {
590                 tmp = (be16_to_cpu(node->hdr.count) - oldblk->index) * (uint)sizeof(*btree);
591                 memmove(btree + 1, btree, tmp);
592         }
593         btree->hashval = cpu_to_be32(newblk->hashval);
594         btree->before = cpu_to_be32(newblk->blkno);
595         xfs_trans_log_buf(state->args->trans, oldblk->bp,
596                 XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree)));
597         be16_add_cpu(&node->hdr.count, 1);
598         xfs_trans_log_buf(state->args->trans, oldblk->bp,
599                 XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
600
601         /*
602          * Copy the last hash value from the oldblk to propagate upwards.
603          */
604         oldblk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1 ].hashval);
605 }
606
607 /*========================================================================
608  * Routines used for shrinking the Btree.
609  *========================================================================*/
610
611 /*
612  * Deallocate an empty leaf node, remove it from its parent,
613  * possibly deallocating that block, etc...
614  */
615 int
616 xfs_da_join(xfs_da_state_t *state)
617 {
618         xfs_da_state_blk_t *drop_blk, *save_blk;
619         int action, error;
620
621         trace_xfs_da_join(state->args);
622
623         action = 0;
624         drop_blk = &state->path.blk[ state->path.active-1 ];
625         save_blk = &state->altpath.blk[ state->path.active-1 ];
626         ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
627         ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
628                drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
629
630         /*
631          * Walk back up the tree joining/deallocating as necessary.
632          * When we stop dropping blocks, break out.
633          */
634         for (  ; state->path.active >= 2; drop_blk--, save_blk--,
635                  state->path.active--) {
636                 /*
637                  * See if we can combine the block with a neighbor.
638                  *   (action == 0) => no options, just leave
639                  *   (action == 1) => coalesce, then unlink
640                  *   (action == 2) => block empty, unlink it
641                  */
642                 switch (drop_blk->magic) {
643                 case XFS_ATTR_LEAF_MAGIC:
644                         error = xfs_attr_leaf_toosmall(state, &action);
645                         if (error)
646                                 return(error);
647                         if (action == 0)
648                                 return(0);
649                         xfs_attr_leaf_unbalance(state, drop_blk, save_blk);
650                         break;
651                 case XFS_DIR2_LEAFN_MAGIC:
652                         error = xfs_dir2_leafn_toosmall(state, &action);
653                         if (error)
654                                 return error;
655                         if (action == 0)
656                                 return 0;
657                         xfs_dir2_leafn_unbalance(state, drop_blk, save_blk);
658                         break;
659                 case XFS_DA_NODE_MAGIC:
660                         /*
661                          * Remove the offending node, fixup hashvals,
662                          * check for a toosmall neighbor.
663                          */
664                         xfs_da_node_remove(state, drop_blk);
665                         xfs_da_fixhashpath(state, &state->path);
666                         error = xfs_da_node_toosmall(state, &action);
667                         if (error)
668                                 return(error);
669                         if (action == 0)
670                                 return 0;
671                         xfs_da_node_unbalance(state, drop_blk, save_blk);
672                         break;
673                 }
674                 xfs_da_fixhashpath(state, &state->altpath);
675                 error = xfs_da_blk_unlink(state, drop_blk, save_blk);
676                 xfs_da_state_kill_altpath(state);
677                 if (error)
678                         return(error);
679                 error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
680                                                          drop_blk->bp);
681                 drop_blk->bp = NULL;
682                 if (error)
683                         return(error);
684         }
685         /*
686          * We joined all the way to the top.  If it turns out that
687          * we only have one entry in the root, make the child block
688          * the new root.
689          */
690         xfs_da_node_remove(state, drop_blk);
691         xfs_da_fixhashpath(state, &state->path);
692         error = xfs_da_root_join(state, &state->path.blk[0]);
693         return(error);
694 }
695
696 #ifdef  DEBUG
697 static void
698 xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
699 {
700         __be16  magic = blkinfo->magic;
701
702         if (level == 1) {
703                 ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
704                        magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
705         } else
706                 ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
707         ASSERT(!blkinfo->forw);
708         ASSERT(!blkinfo->back);
709 }
710 #else   /* !DEBUG */
711 #define xfs_da_blkinfo_onlychild_validate(blkinfo, level)
712 #endif  /* !DEBUG */
713
714 /*
715  * We have only one entry in the root.  Copy the only remaining child of
716  * the old root to block 0 as the new root node.
717  */
718 STATIC int
719 xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
720 {
721         xfs_da_intnode_t *oldroot;
722         xfs_da_args_t *args;
723         xfs_dablk_t child;
724         struct xfs_buf *bp;
725         int error;
726
727         trace_xfs_da_root_join(state->args);
728
729         args = state->args;
730         ASSERT(args != NULL);
731         ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
732         oldroot = root_blk->bp->b_addr;
733         ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
734         ASSERT(!oldroot->hdr.info.forw);
735         ASSERT(!oldroot->hdr.info.back);
736
737         /*
738          * If the root has more than one child, then don't do anything.
739          */
740         if (be16_to_cpu(oldroot->hdr.count) > 1)
741                 return(0);
742
743         /*
744          * Read in the (only) child block, then copy those bytes into
745          * the root block's buffer and free the original child block.
746          */
747         child = be32_to_cpu(oldroot->btree[0].before);
748         ASSERT(child != 0);
749         error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp,
750                                              args->whichfork);
751         if (error)
752                 return(error);
753         ASSERT(bp != NULL);
754         xfs_da_blkinfo_onlychild_validate(bp->b_addr,
755                                         be16_to_cpu(oldroot->hdr.level));
756
757         memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize);
758         xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
759         error = xfs_da_shrink_inode(args, child, bp);
760         return(error);
761 }
762
763 /*
764  * Check a node block and its neighbors to see if the block should be
765  * collapsed into one or the other neighbor.  Always keep the block
766  * with the smaller block number.
767  * If the current block is over 50% full, don't try to join it, return 0.
768  * If the block is empty, fill in the state structure and return 2.
769  * If it can be collapsed, fill in the state structure and return 1.
770  * If nothing can be done, return 0.
771  */
772 STATIC int
773 xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
774 {
775         xfs_da_intnode_t *node;
776         xfs_da_state_blk_t *blk;
777         xfs_da_blkinfo_t *info;
778         int count, forward, error, retval, i;
779         xfs_dablk_t blkno;
780         struct xfs_buf *bp;
781
782         /*
783          * Check for the degenerate case of the block being over 50% full.
784          * If so, it's not worth even looking to see if we might be able
785          * to coalesce with a sibling.
786          */
787         blk = &state->path.blk[ state->path.active-1 ];
788         info = blk->bp->b_addr;
789         ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
790         node = (xfs_da_intnode_t *)info;
791         count = be16_to_cpu(node->hdr.count);
792         if (count > (state->node_ents >> 1)) {
793                 *action = 0;    /* blk over 50%, don't try to join */
794                 return(0);      /* blk over 50%, don't try to join */
795         }
796
797         /*
798          * Check for the degenerate case of the block being empty.
799          * If the block is empty, we'll simply delete it, no need to
800          * coalesce it with a sibling block.  We choose (arbitrarily)
801          * to merge with the forward block unless it is NULL.
802          */
803         if (count == 0) {
804                 /*
805                  * Make altpath point to the block we want to keep and
806                  * path point to the block we want to drop (this one).
807                  */
808                 forward = (info->forw != 0);
809                 memcpy(&state->altpath, &state->path, sizeof(state->path));
810                 error = xfs_da_path_shift(state, &state->altpath, forward,
811                                                  0, &retval);
812                 if (error)
813                         return(error);
814                 if (retval) {
815                         *action = 0;
816                 } else {
817                         *action = 2;
818                 }
819                 return(0);
820         }
821
822         /*
823          * Examine each sibling block to see if we can coalesce with
824          * at least 25% free space to spare.  We need to figure out
825          * whether to merge with the forward or the backward block.
826          * We prefer coalescing with the lower numbered sibling so as
827          * to shrink a directory over time.
828          */
829         /* start with smaller blk num */
830         forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back));
831         for (i = 0; i < 2; forward = !forward, i++) {
832                 if (forward)
833                         blkno = be32_to_cpu(info->forw);
834                 else
835                         blkno = be32_to_cpu(info->back);
836                 if (blkno == 0)
837                         continue;
838                 error = xfs_da_read_buf(state->args->trans, state->args->dp,
839                                         blkno, -1, &bp, state->args->whichfork);
840                 if (error)
841                         return(error);
842                 ASSERT(bp != NULL);
843
844                 node = (xfs_da_intnode_t *)info;
845                 count  = state->node_ents;
846                 count -= state->node_ents >> 2;
847                 count -= be16_to_cpu(node->hdr.count);
848                 node = bp->b_addr;
849                 ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
850                 count -= be16_to_cpu(node->hdr.count);
851                 xfs_trans_brelse(state->args->trans, bp);
852                 if (count >= 0)
853                         break;  /* fits with at least 25% to spare */
854         }
855         if (i >= 2) {
856                 *action = 0;
857                 return(0);
858         }
859
860         /*
861          * Make altpath point to the block we want to keep (the lower
862          * numbered block) and path point to the block we want to drop.
863          */
864         memcpy(&state->altpath, &state->path, sizeof(state->path));
865         if (blkno < blk->blkno) {
866                 error = xfs_da_path_shift(state, &state->altpath, forward,
867                                                  0, &retval);
868                 if (error) {
869                         return(error);
870                 }
871                 if (retval) {
872                         *action = 0;
873                         return(0);
874                 }
875         } else {
876                 error = xfs_da_path_shift(state, &state->path, forward,
877                                                  0, &retval);
878                 if (error) {
879                         return(error);
880                 }
881                 if (retval) {
882                         *action = 0;
883                         return(0);
884                 }
885         }
886         *action = 1;
887         return(0);
888 }
889
890 /*
891  * Walk back up the tree adjusting hash values as necessary,
892  * when we stop making changes, return.
893  */
894 void
895 xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
896 {
897         xfs_da_state_blk_t *blk;
898         xfs_da_intnode_t *node;
899         xfs_da_node_entry_t *btree;
900         xfs_dahash_t lasthash=0;
901         int level, count;
902
903         level = path->active-1;
904         blk = &path->blk[ level ];
905         switch (blk->magic) {
906         case XFS_ATTR_LEAF_MAGIC:
907                 lasthash = xfs_attr_leaf_lasthash(blk->bp, &count);
908                 if (count == 0)
909                         return;
910                 break;
911         case XFS_DIR2_LEAFN_MAGIC:
912                 lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count);
913                 if (count == 0)
914                         return;
915                 break;
916         case XFS_DA_NODE_MAGIC:
917                 lasthash = xfs_da_node_lasthash(blk->bp, &count);
918                 if (count == 0)
919                         return;
920                 break;
921         }
922         for (blk--, level--; level >= 0; blk--, level--) {
923                 node = blk->bp->b_addr;
924                 ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
925                 btree = &node->btree[ blk->index ];
926                 if (be32_to_cpu(btree->hashval) == lasthash)
927                         break;
928                 blk->hashval = lasthash;
929                 btree->hashval = cpu_to_be32(lasthash);
930                 xfs_trans_log_buf(state->args->trans, blk->bp,
931                                   XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
932
933                 lasthash = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
934         }
935 }
936
937 /*
938  * Remove an entry from an intermediate node.
939  */
940 STATIC void
941 xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk)
942 {
943         xfs_da_intnode_t *node;
944         xfs_da_node_entry_t *btree;
945         int tmp;
946
947         trace_xfs_da_node_remove(state->args);
948
949         node = drop_blk->bp->b_addr;
950         ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count));
951         ASSERT(drop_blk->index >= 0);
952
953         /*
954          * Copy over the offending entry, or just zero it out.
955          */
956         btree = &node->btree[drop_blk->index];
957         if (drop_blk->index < (be16_to_cpu(node->hdr.count)-1)) {
958                 tmp  = be16_to_cpu(node->hdr.count) - drop_blk->index - 1;
959                 tmp *= (uint)sizeof(xfs_da_node_entry_t);
960                 memmove(btree, btree + 1, tmp);
961                 xfs_trans_log_buf(state->args->trans, drop_blk->bp,
962                     XFS_DA_LOGRANGE(node, btree, tmp));
963                 btree = &node->btree[be16_to_cpu(node->hdr.count)-1];
964         }
965         memset((char *)btree, 0, sizeof(xfs_da_node_entry_t));
966         xfs_trans_log_buf(state->args->trans, drop_blk->bp,
967             XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
968         be16_add_cpu(&node->hdr.count, -1);
969         xfs_trans_log_buf(state->args->trans, drop_blk->bp,
970             XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
971
972         /*
973          * Copy the last hash value from the block to propagate upwards.
974          */
975         btree--;
976         drop_blk->hashval = be32_to_cpu(btree->hashval);
977 }
978
979 /*
980  * Unbalance the btree elements between two intermediate nodes,
981  * move all Btree elements from one node into another.
982  */
983 STATIC void
984 xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
985                                      xfs_da_state_blk_t *save_blk)
986 {
987         xfs_da_intnode_t *drop_node, *save_node;
988         xfs_da_node_entry_t *btree;
989         int tmp;
990         xfs_trans_t *tp;
991
992         trace_xfs_da_node_unbalance(state->args);
993
994         drop_node = drop_blk->bp->b_addr;
995         save_node = save_blk->bp->b_addr;
996         ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
997         ASSERT(save_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
998         tp = state->args->trans;
999
1000         /*
1001          * If the dying block has lower hashvals, then move all the
1002          * elements in the remaining block up to make a hole.
1003          */
1004         if ((be32_to_cpu(drop_node->btree[0].hashval) < be32_to_cpu(save_node->btree[ 0 ].hashval)) ||
1005             (be32_to_cpu(drop_node->btree[be16_to_cpu(drop_node->hdr.count)-1].hashval) <
1006              be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval)))
1007         {
1008                 btree = &save_node->btree[be16_to_cpu(drop_node->hdr.count)];
1009                 tmp = be16_to_cpu(save_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t);
1010                 memmove(btree, &save_node->btree[0], tmp);
1011                 btree = &save_node->btree[0];
1012                 xfs_trans_log_buf(tp, save_blk->bp,
1013                         XFS_DA_LOGRANGE(save_node, btree,
1014                                 (be16_to_cpu(save_node->hdr.count) + be16_to_cpu(drop_node->hdr.count)) *
1015                                 sizeof(xfs_da_node_entry_t)));
1016         } else {
1017                 btree = &save_node->btree[be16_to_cpu(save_node->hdr.count)];
1018                 xfs_trans_log_buf(tp, save_blk->bp,
1019                         XFS_DA_LOGRANGE(save_node, btree,
1020                                 be16_to_cpu(drop_node->hdr.count) *
1021                                 sizeof(xfs_da_node_entry_t)));
1022         }
1023
1024         /*
1025          * Move all the B-tree elements from drop_blk to save_blk.
1026          */
1027         tmp = be16_to_cpu(drop_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t);
1028         memcpy(btree, &drop_node->btree[0], tmp);
1029         be16_add_cpu(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count));
1030
1031         xfs_trans_log_buf(tp, save_blk->bp,
1032                 XFS_DA_LOGRANGE(save_node, &save_node->hdr,
1033                         sizeof(save_node->hdr)));
1034
1035         /*
1036          * Save the last hashval in the remaining block for upward propagation.
1037          */
1038         save_blk->hashval = be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval);
1039 }
1040
1041 /*========================================================================
1042  * Routines used for finding things in the Btree.
1043  *========================================================================*/
1044
1045 /*
1046  * Walk down the Btree looking for a particular filename, filling
1047  * in the state structure as we go.
1048  *
1049  * We will set the state structure to point to each of the elements
1050  * in each of the nodes where either the hashval is or should be.
1051  *
1052  * We support duplicate hashval's so for each entry in the current
1053  * node that could contain the desired hashval, descend.  This is a
1054  * pruned depth-first tree search.
1055  */
1056 int                                                     /* error */
1057 xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
1058 {
1059         xfs_da_state_blk_t *blk;
1060         xfs_da_blkinfo_t *curr;
1061         xfs_da_intnode_t *node;
1062         xfs_da_node_entry_t *btree;
1063         xfs_dablk_t blkno;
1064         int probe, span, max, error, retval;
1065         xfs_dahash_t hashval, btreehashval;
1066         xfs_da_args_t *args;
1067
1068         args = state->args;
1069
1070         /*
1071          * Descend thru the B-tree searching each level for the right
1072          * node to use, until the right hashval is found.
1073          */
1074         blkno = (args->whichfork == XFS_DATA_FORK)? state->mp->m_dirleafblk : 0;
1075         for (blk = &state->path.blk[0], state->path.active = 1;
1076                          state->path.active <= XFS_DA_NODE_MAXDEPTH;
1077                          blk++, state->path.active++) {
1078                 /*
1079                  * Read the next node down in the tree.
1080                  */
1081                 blk->blkno = blkno;
1082                 error = xfs_da_read_buf(args->trans, args->dp, blkno,
1083                                         -1, &blk->bp, args->whichfork);
1084                 if (error) {
1085                         blk->blkno = 0;
1086                         state->path.active--;
1087                         return(error);
1088                 }
1089                 curr = blk->bp->b_addr;
1090                 blk->magic = be16_to_cpu(curr->magic);
1091                 ASSERT(blk->magic == XFS_DA_NODE_MAGIC ||
1092                        blk->magic == XFS_DIR2_LEAFN_MAGIC ||
1093                        blk->magic == XFS_ATTR_LEAF_MAGIC);
1094
1095                 /*
1096                  * Search an intermediate node for a match.
1097                  */
1098                 if (blk->magic == XFS_DA_NODE_MAGIC) {
1099                         node = blk->bp->b_addr;
1100                         max = be16_to_cpu(node->hdr.count);
1101                         blk->hashval = be32_to_cpu(node->btree[max-1].hashval);
1102
1103                         /*
1104                          * Binary search.  (note: small blocks will skip loop)
1105                          */
1106                         probe = span = max / 2;
1107                         hashval = args->hashval;
1108                         for (btree = &node->btree[probe]; span > 4;
1109                                    btree = &node->btree[probe]) {
1110                                 span /= 2;
1111                                 btreehashval = be32_to_cpu(btree->hashval);
1112                                 if (btreehashval < hashval)
1113                                         probe += span;
1114                                 else if (btreehashval > hashval)
1115                                         probe -= span;
1116                                 else
1117                                         break;
1118                         }
1119                         ASSERT((probe >= 0) && (probe < max));
1120                         ASSERT((span <= 4) || (be32_to_cpu(btree->hashval) == hashval));
1121
1122                         /*
1123                          * Since we may have duplicate hashval's, find the first
1124                          * matching hashval in the node.
1125                          */
1126                         while ((probe > 0) && (be32_to_cpu(btree->hashval) >= hashval)) {
1127                                 btree--;
1128                                 probe--;
1129                         }
1130                         while ((probe < max) && (be32_to_cpu(btree->hashval) < hashval)) {
1131                                 btree++;
1132                                 probe++;
1133                         }
1134
1135                         /*
1136                          * Pick the right block to descend on.
1137                          */
1138                         if (probe == max) {
1139                                 blk->index = max-1;
1140                                 blkno = be32_to_cpu(node->btree[max-1].before);
1141                         } else {
1142                                 blk->index = probe;
1143                                 blkno = be32_to_cpu(btree->before);
1144                         }
1145                 } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1146                         blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
1147                         break;
1148                 } else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
1149                         blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
1150                         break;
1151                 }
1152         }
1153
1154         /*
1155          * A leaf block that ends in the hashval that we are interested in
1156          * (final hashval == search hashval) means that the next block may
1157          * contain more entries with the same hashval, shift upward to the
1158          * next leaf and keep searching.
1159          */
1160         for (;;) {
1161                 if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
1162                         retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
1163                                                         &blk->index, state);
1164                 } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1165                         retval = xfs_attr_leaf_lookup_int(blk->bp, args);
1166                         blk->index = args->index;
1167                         args->blkno = blk->blkno;
1168                 } else {
1169                         ASSERT(0);
1170                         return XFS_ERROR(EFSCORRUPTED);
1171                 }
1172                 if (((retval == ENOENT) || (retval == ENOATTR)) &&
1173                     (blk->hashval == args->hashval)) {
1174                         error = xfs_da_path_shift(state, &state->path, 1, 1,
1175                                                          &retval);
1176                         if (error)
1177                                 return(error);
1178                         if (retval == 0) {
1179                                 continue;
1180                         } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1181                                 /* path_shift() gives ENOENT */
1182                                 retval = XFS_ERROR(ENOATTR);
1183                         }
1184                 }
1185                 break;
1186         }
1187         *result = retval;
1188         return(0);
1189 }
1190
1191 /*========================================================================
1192  * Utility routines.
1193  *========================================================================*/
1194
1195 /*
1196  * Link a new block into a doubly linked list of blocks (of whatever type).
1197  */
1198 int                                                     /* error */
1199 xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
1200                                xfs_da_state_blk_t *new_blk)
1201 {
1202         xfs_da_blkinfo_t *old_info, *new_info, *tmp_info;
1203         xfs_da_args_t *args;
1204         int before=0, error;
1205         struct xfs_buf *bp;
1206
1207         /*
1208          * Set up environment.
1209          */
1210         args = state->args;
1211         ASSERT(args != NULL);
1212         old_info = old_blk->bp->b_addr;
1213         new_info = new_blk->bp->b_addr;
1214         ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
1215                old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
1216                old_blk->magic == XFS_ATTR_LEAF_MAGIC);
1217         ASSERT(old_blk->magic == be16_to_cpu(old_info->magic));
1218         ASSERT(new_blk->magic == be16_to_cpu(new_info->magic));
1219         ASSERT(old_blk->magic == new_blk->magic);
1220
1221         switch (old_blk->magic) {
1222         case XFS_ATTR_LEAF_MAGIC:
1223                 before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
1224                 break;
1225         case XFS_DIR2_LEAFN_MAGIC:
1226                 before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp);
1227                 break;
1228         case XFS_DA_NODE_MAGIC:
1229                 before = xfs_da_node_order(old_blk->bp, new_blk->bp);
1230                 break;
1231         }
1232
1233         /*
1234          * Link blocks in appropriate order.
1235          */
1236         if (before) {
1237                 /*
1238                  * Link new block in before existing block.
1239                  */
1240                 trace_xfs_da_link_before(args);
1241                 new_info->forw = cpu_to_be32(old_blk->blkno);
1242                 new_info->back = old_info->back;
1243                 if (old_info->back) {
1244                         error = xfs_da_read_buf(args->trans, args->dp,
1245                                                 be32_to_cpu(old_info->back),
1246                                                 -1, &bp, args->whichfork);
1247                         if (error)
1248                                 return(error);
1249                         ASSERT(bp != NULL);
1250                         tmp_info = bp->b_addr;
1251                         ASSERT(be16_to_cpu(tmp_info->magic) == be16_to_cpu(old_info->magic));
1252                         ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno);
1253                         tmp_info->forw = cpu_to_be32(new_blk->blkno);
1254                         xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
1255                 }
1256                 old_info->back = cpu_to_be32(new_blk->blkno);
1257         } else {
1258                 /*
1259                  * Link new block in after existing block.
1260                  */
1261                 trace_xfs_da_link_after(args);
1262                 new_info->forw = old_info->forw;
1263                 new_info->back = cpu_to_be32(old_blk->blkno);
1264                 if (old_info->forw) {
1265                         error = xfs_da_read_buf(args->trans, args->dp,
1266                                                 be32_to_cpu(old_info->forw),
1267                                                 -1, &bp, args->whichfork);
1268                         if (error)
1269                                 return(error);
1270                         ASSERT(bp != NULL);
1271                         tmp_info = bp->b_addr;
1272                         ASSERT(tmp_info->magic == old_info->magic);
1273                         ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno);
1274                         tmp_info->back = cpu_to_be32(new_blk->blkno);
1275                         xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
1276                 }
1277                 old_info->forw = cpu_to_be32(new_blk->blkno);
1278         }
1279
1280         xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
1281         xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
1282         return(0);
1283 }
1284
1285 /*
1286  * Compare two intermediate nodes for "order".
1287  */
1288 STATIC int
1289 xfs_da_node_order(
1290         struct xfs_buf  *node1_bp,
1291         struct xfs_buf  *node2_bp)
1292 {
1293         xfs_da_intnode_t *node1, *node2;
1294
1295         node1 = node1_bp->b_addr;
1296         node2 = node2_bp->b_addr;
1297         ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) &&
1298                node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
1299         if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) &&
1300             ((be32_to_cpu(node2->btree[0].hashval) <
1301               be32_to_cpu(node1->btree[0].hashval)) ||
1302              (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) <
1303               be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) {
1304                 return(1);
1305         }
1306         return(0);
1307 }
1308
1309 /*
1310  * Pick up the last hashvalue from an intermediate node.
1311  */
1312 STATIC uint
1313 xfs_da_node_lasthash(
1314         struct xfs_buf  *bp,
1315         int             *count)
1316 {
1317         xfs_da_intnode_t *node;
1318
1319         node = bp->b_addr;
1320         ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
1321         if (count)
1322                 *count = be16_to_cpu(node->hdr.count);
1323         if (!node->hdr.count)
1324                 return(0);
1325         return be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
1326 }
1327
1328 /*
1329  * Unlink a block from a doubly linked list of blocks.
1330  */
1331 STATIC int                                              /* error */
1332 xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1333                                  xfs_da_state_blk_t *save_blk)
1334 {
1335         xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info;
1336         xfs_da_args_t *args;
1337         struct xfs_buf *bp;
1338         int error;
1339
1340         /*
1341          * Set up environment.
1342          */
1343         args = state->args;
1344         ASSERT(args != NULL);
1345         save_info = save_blk->bp->b_addr;
1346         drop_info = drop_blk->bp->b_addr;
1347         ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
1348                save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
1349                save_blk->magic == XFS_ATTR_LEAF_MAGIC);
1350         ASSERT(save_blk->magic == be16_to_cpu(save_info->magic));
1351         ASSERT(drop_blk->magic == be16_to_cpu(drop_info->magic));
1352         ASSERT(save_blk->magic == drop_blk->magic);
1353         ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) ||
1354                (be32_to_cpu(save_info->back) == drop_blk->blkno));
1355         ASSERT((be32_to_cpu(drop_info->forw) == save_blk->blkno) ||
1356                (be32_to_cpu(drop_info->back) == save_blk->blkno));
1357
1358         /*
1359          * Unlink the leaf block from the doubly linked chain of leaves.
1360          */
1361         if (be32_to_cpu(save_info->back) == drop_blk->blkno) {
1362                 trace_xfs_da_unlink_back(args);
1363                 save_info->back = drop_info->back;
1364                 if (drop_info->back) {
1365                         error = xfs_da_read_buf(args->trans, args->dp,
1366                                                 be32_to_cpu(drop_info->back),
1367                                                 -1, &bp, args->whichfork);
1368                         if (error)
1369                                 return(error);
1370                         ASSERT(bp != NULL);
1371                         tmp_info = bp->b_addr;
1372                         ASSERT(tmp_info->magic == save_info->magic);
1373                         ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno);
1374                         tmp_info->forw = cpu_to_be32(save_blk->blkno);
1375                         xfs_trans_log_buf(args->trans, bp, 0,
1376                                                     sizeof(*tmp_info) - 1);
1377                 }
1378         } else {
1379                 trace_xfs_da_unlink_forward(args);
1380                 save_info->forw = drop_info->forw;
1381                 if (drop_info->forw) {
1382                         error = xfs_da_read_buf(args->trans, args->dp,
1383                                                 be32_to_cpu(drop_info->forw),
1384                                                 -1, &bp, args->whichfork);
1385                         if (error)
1386                                 return(error);
1387                         ASSERT(bp != NULL);
1388                         tmp_info = bp->b_addr;
1389                         ASSERT(tmp_info->magic == save_info->magic);
1390                         ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno);
1391                         tmp_info->back = cpu_to_be32(save_blk->blkno);
1392                         xfs_trans_log_buf(args->trans, bp, 0,
1393                                                     sizeof(*tmp_info) - 1);
1394                 }
1395         }
1396
1397         xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
1398         return(0);
1399 }
1400
1401 /*
1402  * Move a path "forward" or "!forward" one block at the current level.
1403  *
1404  * This routine will adjust a "path" to point to the next block
1405  * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the
1406  * Btree, including updating pointers to the intermediate nodes between
1407  * the new bottom and the root.
1408  */
1409 int                                                     /* error */
1410 xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
1411                                  int forward, int release, int *result)
1412 {
1413         xfs_da_state_blk_t *blk;
1414         xfs_da_blkinfo_t *info;
1415         xfs_da_intnode_t *node;
1416         xfs_da_args_t *args;
1417         xfs_dablk_t blkno=0;
1418         int level, error;
1419
1420         /*
1421          * Roll up the Btree looking for the first block where our
1422          * current index is not at the edge of the block.  Note that
1423          * we skip the bottom layer because we want the sibling block.
1424          */
1425         args = state->args;
1426         ASSERT(args != NULL);
1427         ASSERT(path != NULL);
1428         ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1429         level = (path->active-1) - 1;   /* skip bottom layer in path */
1430         for (blk = &path->blk[level]; level >= 0; blk--, level--) {
1431                 ASSERT(blk->bp != NULL);
1432                 node = blk->bp->b_addr;
1433                 ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
1434                 if (forward && (blk->index < be16_to_cpu(node->hdr.count)-1)) {
1435                         blk->index++;
1436                         blkno = be32_to_cpu(node->btree[blk->index].before);
1437                         break;
1438                 } else if (!forward && (blk->index > 0)) {
1439                         blk->index--;
1440                         blkno = be32_to_cpu(node->btree[blk->index].before);
1441                         break;
1442                 }
1443         }
1444         if (level < 0) {
1445                 *result = XFS_ERROR(ENOENT);    /* we're out of our tree */
1446                 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
1447                 return(0);
1448         }
1449
1450         /*
1451          * Roll down the edge of the subtree until we reach the
1452          * same depth we were at originally.
1453          */
1454         for (blk++, level++; level < path->active; blk++, level++) {
1455                 /*
1456                  * Release the old block.
1457                  * (if it's dirty, trans won't actually let go)
1458                  */
1459                 if (release)
1460                         xfs_trans_brelse(args->trans, blk->bp);
1461
1462                 /*
1463                  * Read the next child block.
1464                  */
1465                 blk->blkno = blkno;
1466                 error = xfs_da_read_buf(args->trans, args->dp, blkno, -1,
1467                                                      &blk->bp, args->whichfork);
1468                 if (error)
1469                         return(error);
1470                 ASSERT(blk->bp != NULL);
1471                 info = blk->bp->b_addr;
1472                 ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
1473                        info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
1474                        info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1475                 blk->magic = be16_to_cpu(info->magic);
1476                 if (blk->magic == XFS_DA_NODE_MAGIC) {
1477                         node = (xfs_da_intnode_t *)info;
1478                         blk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
1479                         if (forward)
1480                                 blk->index = 0;
1481                         else
1482                                 blk->index = be16_to_cpu(node->hdr.count)-1;
1483                         blkno = be32_to_cpu(node->btree[blk->index].before);
1484                 } else {
1485                         ASSERT(level == path->active-1);
1486                         blk->index = 0;
1487                         switch(blk->magic) {
1488                         case XFS_ATTR_LEAF_MAGIC:
1489                                 blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
1490                                                                       NULL);
1491                                 break;
1492                         case XFS_DIR2_LEAFN_MAGIC:
1493                                 blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
1494                                                                        NULL);
1495                                 break;
1496                         default:
1497                                 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC ||
1498                                        blk->magic == XFS_DIR2_LEAFN_MAGIC);
1499                                 break;
1500                         }
1501                 }
1502         }
1503         *result = 0;
1504         return(0);
1505 }
1506
1507
1508 /*========================================================================
1509  * Utility routines.
1510  *========================================================================*/
1511
1512 /*
1513  * Implement a simple hash on a character string.
1514  * Rotate the hash value by 7 bits, then XOR each character in.
1515  * This is implemented with some source-level loop unrolling.
1516  */
1517 xfs_dahash_t
1518 xfs_da_hashname(const __uint8_t *name, int namelen)
1519 {
1520         xfs_dahash_t hash;
1521
1522         /*
1523          * Do four characters at a time as long as we can.
1524          */
1525         for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
1526                 hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
1527                        (name[3] << 0) ^ rol32(hash, 7 * 4);
1528
1529         /*
1530          * Now do the rest of the characters.
1531          */
1532         switch (namelen) {
1533         case 3:
1534                 return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
1535                        rol32(hash, 7 * 3);
1536         case 2:
1537                 return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
1538         case 1:
1539                 return (name[0] << 0) ^ rol32(hash, 7 * 1);
1540         default: /* case 0: */
1541                 return hash;
1542         }
1543 }
1544
1545 enum xfs_dacmp
1546 xfs_da_compname(
1547         struct xfs_da_args *args,
1548         const unsigned char *name,
1549         int             len)
1550 {
1551         return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
1552                                         XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
1553 }
1554
1555 static xfs_dahash_t
1556 xfs_default_hashname(
1557         struct xfs_name *name)
1558 {
1559         return xfs_da_hashname(name->name, name->len);
1560 }
1561
1562 const struct xfs_nameops xfs_default_nameops = {
1563         .hashname       = xfs_default_hashname,
1564         .compname       = xfs_da_compname
1565 };
1566
1567 int
1568 xfs_da_grow_inode_int(
1569         struct xfs_da_args      *args,
1570         xfs_fileoff_t           *bno,
1571         int                     count)
1572 {
1573         struct xfs_trans        *tp = args->trans;
1574         struct xfs_inode        *dp = args->dp;
1575         int                     w = args->whichfork;
1576         xfs_drfsbno_t           nblks = dp->i_d.di_nblocks;
1577         struct xfs_bmbt_irec    map, *mapp;
1578         int                     nmap, error, got, i, mapi;
1579
1580         /*
1581          * Find a spot in the file space to put the new block.
1582          */
1583         error = xfs_bmap_first_unused(tp, dp, count, bno, w);
1584         if (error)
1585                 return error;
1586
1587         /*
1588          * Try mapping it in one filesystem block.
1589          */
1590         nmap = 1;
1591         ASSERT(args->firstblock != NULL);
1592         error = xfs_bmapi_write(tp, dp, *bno, count,
1593                         xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
1594                         args->firstblock, args->total, &map, &nmap,
1595                         args->flist);
1596         if (error)
1597                 return error;
1598
1599         ASSERT(nmap <= 1);
1600         if (nmap == 1) {
1601                 mapp = &map;
1602                 mapi = 1;
1603         } else if (nmap == 0 && count > 1) {
1604                 xfs_fileoff_t           b;
1605                 int                     c;
1606
1607                 /*
1608                  * If we didn't get it and the block might work if fragmented,
1609                  * try without the CONTIG flag.  Loop until we get it all.
1610                  */
1611                 mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
1612                 for (b = *bno, mapi = 0; b < *bno + count; ) {
1613                         nmap = MIN(XFS_BMAP_MAX_NMAP, count);
1614                         c = (int)(*bno + count - b);
1615                         error = xfs_bmapi_write(tp, dp, b, c,
1616                                         xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
1617                                         args->firstblock, args->total,
1618                                         &mapp[mapi], &nmap, args->flist);
1619                         if (error)
1620                                 goto out_free_map;
1621                         if (nmap < 1)
1622                                 break;
1623                         mapi += nmap;
1624                         b = mapp[mapi - 1].br_startoff +
1625                             mapp[mapi - 1].br_blockcount;
1626                 }
1627         } else {
1628                 mapi = 0;
1629                 mapp = NULL;
1630         }
1631
1632         /*
1633          * Count the blocks we got, make sure it matches the total.
1634          */
1635         for (i = 0, got = 0; i < mapi; i++)
1636                 got += mapp[i].br_blockcount;
1637         if (got != count || mapp[0].br_startoff != *bno ||
1638             mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
1639             *bno + count) {
1640                 error = XFS_ERROR(ENOSPC);
1641                 goto out_free_map;
1642         }
1643
1644         /* account for newly allocated blocks in reserved blocks total */
1645         args->total -= dp->i_d.di_nblocks - nblks;
1646
1647 out_free_map:
1648         if (mapp != &map)
1649                 kmem_free(mapp);
1650         return error;
1651 }
1652
1653 /*
1654  * Add a block to the btree ahead of the file.
1655  * Return the new block number to the caller.
1656  */
1657 int
1658 xfs_da_grow_inode(
1659         struct xfs_da_args      *args,
1660         xfs_dablk_t             *new_blkno)
1661 {
1662         xfs_fileoff_t           bno;
1663         int                     count;
1664         int                     error;
1665
1666         trace_xfs_da_grow_inode(args);
1667
1668         if (args->whichfork == XFS_DATA_FORK) {
1669                 bno = args->dp->i_mount->m_dirleafblk;
1670                 count = args->dp->i_mount->m_dirblkfsbs;
1671         } else {
1672                 bno = 0;
1673                 count = 1;
1674         }
1675
1676         error = xfs_da_grow_inode_int(args, &bno, count);
1677         if (!error)
1678                 *new_blkno = (xfs_dablk_t)bno;
1679         return error;
1680 }
1681
1682 /*
1683  * Ick.  We need to always be able to remove a btree block, even
1684  * if there's no space reservation because the filesystem is full.
1685  * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
1686  * It swaps the target block with the last block in the file.  The
1687  * last block in the file can always be removed since it can't cause
1688  * a bmap btree split to do that.
1689  */
1690 STATIC int
1691 xfs_da_swap_lastblock(
1692         xfs_da_args_t   *args,
1693         xfs_dablk_t     *dead_blknop,
1694         struct xfs_buf  **dead_bufp)
1695 {
1696         xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno;
1697         struct xfs_buf *dead_buf, *last_buf, *sib_buf, *par_buf;
1698         xfs_fileoff_t lastoff;
1699         xfs_inode_t *ip;
1700         xfs_trans_t *tp;
1701         xfs_mount_t *mp;
1702         int error, w, entno, level, dead_level;
1703         xfs_da_blkinfo_t *dead_info, *sib_info;
1704         xfs_da_intnode_t *par_node, *dead_node;
1705         xfs_dir2_leaf_t *dead_leaf2;
1706         xfs_dahash_t dead_hash;
1707
1708         trace_xfs_da_swap_lastblock(args);
1709
1710         dead_buf = *dead_bufp;
1711         dead_blkno = *dead_blknop;
1712         tp = args->trans;
1713         ip = args->dp;
1714         w = args->whichfork;
1715         ASSERT(w == XFS_DATA_FORK);
1716         mp = ip->i_mount;
1717         lastoff = mp->m_dirfreeblk;
1718         error = xfs_bmap_last_before(tp, ip, &lastoff, w);
1719         if (error)
1720                 return error;
1721         if (unlikely(lastoff == 0)) {
1722                 XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
1723                                  mp);
1724                 return XFS_ERROR(EFSCORRUPTED);
1725         }
1726         /*
1727          * Read the last block in the btree space.
1728          */
1729         last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
1730         if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w)))
1731                 return error;
1732         /*
1733          * Copy the last block into the dead buffer and log it.
1734          */
1735         memcpy(dead_buf->b_addr, last_buf->b_addr, mp->m_dirblksize);
1736         xfs_trans_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1);
1737         dead_info = dead_buf->b_addr;
1738         /*
1739          * Get values from the moved block.
1740          */
1741         if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) {
1742                 dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
1743                 dead_level = 0;
1744                 dead_hash = be32_to_cpu(dead_leaf2->ents[be16_to_cpu(dead_leaf2->hdr.count) - 1].hashval);
1745         } else {
1746                 ASSERT(dead_info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
1747                 dead_node = (xfs_da_intnode_t *)dead_info;
1748                 dead_level = be16_to_cpu(dead_node->hdr.level);
1749                 dead_hash = be32_to_cpu(dead_node->btree[be16_to_cpu(dead_node->hdr.count) - 1].hashval);
1750         }
1751         sib_buf = par_buf = NULL;
1752         /*
1753          * If the moved block has a left sibling, fix up the pointers.
1754          */
1755         if ((sib_blkno = be32_to_cpu(dead_info->back))) {
1756                 if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
1757                         goto done;
1758                 sib_info = sib_buf->b_addr;
1759                 if (unlikely(
1760                     be32_to_cpu(sib_info->forw) != last_blkno ||
1761                     sib_info->magic != dead_info->magic)) {
1762                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
1763                                          XFS_ERRLEVEL_LOW, mp);
1764                         error = XFS_ERROR(EFSCORRUPTED);
1765                         goto done;
1766                 }
1767                 sib_info->forw = cpu_to_be32(dead_blkno);
1768                 xfs_trans_log_buf(tp, sib_buf,
1769                         XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
1770                                         sizeof(sib_info->forw)));
1771                 sib_buf = NULL;
1772         }
1773         /*
1774          * If the moved block has a right sibling, fix up the pointers.
1775          */
1776         if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
1777                 if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
1778                         goto done;
1779                 sib_info = sib_buf->b_addr;
1780                 if (unlikely(
1781                        be32_to_cpu(sib_info->back) != last_blkno ||
1782                        sib_info->magic != dead_info->magic)) {
1783                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
1784                                          XFS_ERRLEVEL_LOW, mp);
1785                         error = XFS_ERROR(EFSCORRUPTED);
1786                         goto done;
1787                 }
1788                 sib_info->back = cpu_to_be32(dead_blkno);
1789                 xfs_trans_log_buf(tp, sib_buf,
1790                         XFS_DA_LOGRANGE(sib_info, &sib_info->back,
1791                                         sizeof(sib_info->back)));
1792                 sib_buf = NULL;
1793         }
1794         par_blkno = mp->m_dirleafblk;
1795         level = -1;
1796         /*
1797          * Walk down the tree looking for the parent of the moved block.
1798          */
1799         for (;;) {
1800                 if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
1801                         goto done;
1802                 par_node = par_buf->b_addr;
1803                 if (unlikely(par_node->hdr.info.magic !=
1804                     cpu_to_be16(XFS_DA_NODE_MAGIC) ||
1805                     (level >= 0 && level != be16_to_cpu(par_node->hdr.level) + 1))) {
1806                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
1807                                          XFS_ERRLEVEL_LOW, mp);
1808                         error = XFS_ERROR(EFSCORRUPTED);
1809                         goto done;
1810                 }
1811                 level = be16_to_cpu(par_node->hdr.level);
1812                 for (entno = 0;
1813                      entno < be16_to_cpu(par_node->hdr.count) &&
1814                      be32_to_cpu(par_node->btree[entno].hashval) < dead_hash;
1815                      entno++)
1816                         continue;
1817                 if (unlikely(entno == be16_to_cpu(par_node->hdr.count))) {
1818                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
1819                                          XFS_ERRLEVEL_LOW, mp);
1820                         error = XFS_ERROR(EFSCORRUPTED);
1821                         goto done;
1822                 }
1823                 par_blkno = be32_to_cpu(par_node->btree[entno].before);
1824                 if (level == dead_level + 1)
1825                         break;
1826                 xfs_trans_brelse(tp, par_buf);
1827                 par_buf = NULL;
1828         }
1829         /*
1830          * We're in the right parent block.
1831          * Look for the right entry.
1832          */
1833         for (;;) {
1834                 for (;
1835                      entno < be16_to_cpu(par_node->hdr.count) &&
1836                      be32_to_cpu(par_node->btree[entno].before) != last_blkno;
1837                      entno++)
1838                         continue;
1839                 if (entno < be16_to_cpu(par_node->hdr.count))
1840                         break;
1841                 par_blkno = be32_to_cpu(par_node->hdr.info.forw);
1842                 xfs_trans_brelse(tp, par_buf);
1843                 par_buf = NULL;
1844                 if (unlikely(par_blkno == 0)) {
1845                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
1846                                          XFS_ERRLEVEL_LOW, mp);
1847                         error = XFS_ERROR(EFSCORRUPTED);
1848                         goto done;
1849                 }
1850                 if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
1851                         goto done;
1852                 par_node = par_buf->b_addr;
1853                 if (unlikely(
1854                     be16_to_cpu(par_node->hdr.level) != level ||
1855                     par_node->hdr.info.magic != cpu_to_be16(XFS_DA_NODE_MAGIC))) {
1856                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
1857                                          XFS_ERRLEVEL_LOW, mp);
1858                         error = XFS_ERROR(EFSCORRUPTED);
1859                         goto done;
1860                 }
1861                 entno = 0;
1862         }
1863         /*
1864          * Update the parent entry pointing to the moved block.
1865          */
1866         par_node->btree[entno].before = cpu_to_be32(dead_blkno);
1867         xfs_trans_log_buf(tp, par_buf,
1868                 XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before,
1869                                 sizeof(par_node->btree[entno].before)));
1870         *dead_blknop = last_blkno;
1871         *dead_bufp = last_buf;
1872         return 0;
1873 done:
1874         if (par_buf)
1875                 xfs_trans_brelse(tp, par_buf);
1876         if (sib_buf)
1877                 xfs_trans_brelse(tp, sib_buf);
1878         xfs_trans_brelse(tp, last_buf);
1879         return error;
1880 }
1881
1882 /*
1883  * Remove a btree block from a directory or attribute.
1884  */
1885 int
1886 xfs_da_shrink_inode(
1887         xfs_da_args_t   *args,
1888         xfs_dablk_t     dead_blkno,
1889         struct xfs_buf  *dead_buf)
1890 {
1891         xfs_inode_t *dp;
1892         int done, error, w, count;
1893         xfs_trans_t *tp;
1894         xfs_mount_t *mp;
1895
1896         trace_xfs_da_shrink_inode(args);
1897
1898         dp = args->dp;
1899         w = args->whichfork;
1900         tp = args->trans;
1901         mp = dp->i_mount;
1902         if (w == XFS_DATA_FORK)
1903                 count = mp->m_dirblkfsbs;
1904         else
1905                 count = 1;
1906         for (;;) {
1907                 /*
1908                  * Remove extents.  If we get ENOSPC for a dir we have to move
1909                  * the last block to the place we want to kill.
1910                  */
1911                 if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
1912                                 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
1913                                 0, args->firstblock, args->flist,
1914                                 &done)) == ENOSPC) {
1915                         if (w != XFS_DATA_FORK)
1916                                 break;
1917                         if ((error = xfs_da_swap_lastblock(args, &dead_blkno,
1918                                         &dead_buf)))
1919                                 break;
1920                 } else {
1921                         break;
1922                 }
1923         }
1924         xfs_trans_binval(tp, dead_buf);
1925         return error;
1926 }
1927
1928 /*
1929  * See if the mapping(s) for this btree block are valid, i.e.
1930  * don't contain holes, are logically contiguous, and cover the whole range.
1931  */
1932 STATIC int
1933 xfs_da_map_covers_blocks(
1934         int             nmap,
1935         xfs_bmbt_irec_t *mapp,
1936         xfs_dablk_t     bno,
1937         int             count)
1938 {
1939         int             i;
1940         xfs_fileoff_t   off;
1941
1942         for (i = 0, off = bno; i < nmap; i++) {
1943                 if (mapp[i].br_startblock == HOLESTARTBLOCK ||
1944                     mapp[i].br_startblock == DELAYSTARTBLOCK) {
1945                         return 0;
1946                 }
1947                 if (off != mapp[i].br_startoff) {
1948                         return 0;
1949                 }
1950                 off += mapp[i].br_blockcount;
1951         }
1952         return off == bno + count;
1953 }
1954
1955 /*
1956  * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map.
1957  *
1958  * For the single map case, it is assumed that the caller has provided a pointer
1959  * to a valid xfs_buf_map.  For the multiple map case, this function will
1960  * allocate the xfs_buf_map to hold all the maps and replace the caller's single
1961  * map pointer with the allocated map.
1962  */
1963 static int
1964 xfs_buf_map_from_irec(
1965         struct xfs_mount        *mp,
1966         struct xfs_buf_map      **mapp,
1967         unsigned int            *nmaps,
1968         struct xfs_bmbt_irec    *irecs,
1969         unsigned int            nirecs)
1970 {
1971         struct xfs_buf_map      *map;
1972         int                     i;
1973
1974         ASSERT(*nmaps == 1);
1975         ASSERT(nirecs >= 1);
1976
1977         if (nirecs > 1) {
1978                 map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_SLEEP);
1979                 if (!map)
1980                         return ENOMEM;
1981                 *mapp = map;
1982         }
1983
1984         *nmaps = nirecs;
1985         map = *mapp;
1986         for (i = 0; i < *nmaps; i++) {
1987                 ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK &&
1988                        irecs[i].br_startblock != HOLESTARTBLOCK);
1989                 map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock);
1990                 map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount);
1991         }
1992         return 0;
1993 }
1994
1995 /*
1996  * Map the block we are given ready for reading. There are three possible return
1997  * values:
1998  *      -1 - will be returned if we land in a hole and mappedbno == -2 so the
1999  *           caller knows not to execute a subsequent read.
2000  *       0 - if we mapped the block successfully
2001  *      >0 - positive error number if there was an error.
2002  */
2003 static int
2004 xfs_dabuf_map(
2005         struct xfs_trans        *trans,
2006         struct xfs_inode        *dp,
2007         xfs_dablk_t             bno,
2008         xfs_daddr_t             mappedbno,
2009         int                     whichfork,
2010         struct xfs_buf_map      **map,
2011         int                     *nmaps)
2012 {
2013         struct xfs_mount        *mp = dp->i_mount;
2014         int                     nfsb;
2015         int                     error = 0;
2016         struct xfs_bmbt_irec    irec;
2017         struct xfs_bmbt_irec    *irecs = &irec;
2018         int                     nirecs;
2019
2020         ASSERT(map && *map);
2021         ASSERT(*nmaps == 1);
2022
2023         nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1;
2024
2025         /*
2026          * Caller doesn't have a mapping.  -2 means don't complain
2027          * if we land in a hole.
2028          */
2029         if (mappedbno == -1 || mappedbno == -2) {
2030                 /*
2031                  * Optimize the one-block case.
2032                  */
2033                 if (nfsb != 1)
2034                         irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_SLEEP);
2035
2036                 nirecs = nfsb;
2037                 error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
2038                                        &nirecs, xfs_bmapi_aflag(whichfork));
2039                 if (error)
2040                         goto out;
2041         } else {
2042                 irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
2043                 irecs->br_startoff = (xfs_fileoff_t)bno;
2044                 irecs->br_blockcount = nfsb;
2045                 irecs->br_state = 0;
2046                 nirecs = 1;
2047         }
2048
2049         if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) {
2050                 error = mappedbno == -2 ? -1 : XFS_ERROR(EFSCORRUPTED);
2051                 if (unlikely(error == EFSCORRUPTED)) {
2052                         if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
2053                                 int i;
2054                                 xfs_alert(mp, "%s: bno %lld dir: inode %lld",
2055                                         __func__, (long long)bno,
2056                                         (long long)dp->i_ino);
2057                                 for (i = 0; i < *nmaps; i++) {
2058                                         xfs_alert(mp,
2059 "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
2060                                                 i,
2061                                                 (long long)irecs[i].br_startoff,
2062                                                 (long long)irecs[i].br_startblock,
2063                                                 (long long)irecs[i].br_blockcount,
2064                                                 irecs[i].br_state);
2065                                 }
2066                         }
2067                         XFS_ERROR_REPORT("xfs_da_do_buf(1)",
2068                                          XFS_ERRLEVEL_LOW, mp);
2069                 }
2070                 goto out;
2071         }
2072         error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs);
2073 out:
2074         if (irecs != &irec)
2075                 kmem_free(irecs);
2076         return error;
2077 }
2078
2079 /*
2080  * Get a buffer for the dir/attr block.
2081  */
2082 int
2083 xfs_da_get_buf(
2084         struct xfs_trans        *trans,
2085         struct xfs_inode        *dp,
2086         xfs_dablk_t             bno,
2087         xfs_daddr_t             mappedbno,
2088         struct xfs_buf          **bpp,
2089         int                     whichfork)
2090 {
2091         struct xfs_buf          *bp;
2092         struct xfs_buf_map      map;
2093         struct xfs_buf_map      *mapp;
2094         int                     nmap;
2095         int                     error;
2096
2097         *bpp = NULL;
2098         mapp = &map;
2099         nmap = 1;
2100         error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
2101                                 &mapp, &nmap);
2102         if (error) {
2103                 /* mapping a hole is not an error, but we don't continue */
2104                 if (error == -1)
2105                         error = 0;
2106                 goto out_free;
2107         }
2108
2109         bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp,
2110                                     mapp, nmap, 0);
2111         error = bp ? bp->b_error : XFS_ERROR(EIO);
2112         if (error) {
2113                 xfs_trans_brelse(trans, bp);
2114                 goto out_free;
2115         }
2116
2117         *bpp = bp;
2118
2119 out_free:
2120         if (mapp != &map)
2121                 kmem_free(mapp);
2122
2123         return error;
2124 }
2125
2126 /*
2127  * Get a buffer for the dir/attr block, fill in the contents.
2128  */
2129 int
2130 xfs_da_read_buf(
2131         struct xfs_trans        *trans,
2132         struct xfs_inode        *dp,
2133         xfs_dablk_t             bno,
2134         xfs_daddr_t             mappedbno,
2135         struct xfs_buf          **bpp,
2136         int                     whichfork)
2137 {
2138         struct xfs_buf          *bp;
2139         struct xfs_buf_map      map;
2140         struct xfs_buf_map      *mapp;
2141         int                     nmap;
2142         int                     error;
2143
2144         *bpp = NULL;
2145         mapp = &map;
2146         nmap = 1;
2147         error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
2148                                 &mapp, &nmap);
2149         if (error) {
2150                 /* mapping a hole is not an error, but we don't continue */
2151                 if (error == -1)
2152                         error = 0;
2153                 goto out_free;
2154         }
2155
2156         error = xfs_trans_read_buf_map(dp->i_mount, trans,
2157                                         dp->i_mount->m_ddev_targp,
2158                                         mapp, nmap, 0, &bp);
2159         if (error)
2160                 goto out_free;
2161
2162         if (whichfork == XFS_ATTR_FORK)
2163                 xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
2164         else
2165                 xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
2166
2167         /*
2168          * This verification code will be moved to a CRC verification callback
2169          * function so just leave it here unchanged until then.
2170          */
2171         {
2172                 xfs_dir2_data_hdr_t     *hdr = bp->b_addr;
2173                 xfs_dir2_free_t         *free = bp->b_addr;
2174                 xfs_da_blkinfo_t        *info = bp->b_addr;
2175                 uint                    magic, magic1;
2176                 struct xfs_mount        *mp = dp->i_mount;
2177
2178                 magic = be16_to_cpu(info->magic);
2179                 magic1 = be32_to_cpu(hdr->magic);
2180                 if (unlikely(
2181                     XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) &&
2182                                    (magic != XFS_ATTR_LEAF_MAGIC) &&
2183                                    (magic != XFS_DIR2_LEAF1_MAGIC) &&
2184                                    (magic != XFS_DIR2_LEAFN_MAGIC) &&
2185                                    (magic1 != XFS_DIR2_BLOCK_MAGIC) &&
2186                                    (magic1 != XFS_DIR2_DATA_MAGIC) &&
2187                                    (free->hdr.magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)),
2188                                 mp, XFS_ERRTAG_DA_READ_BUF,
2189                                 XFS_RANDOM_DA_READ_BUF))) {
2190                         trace_xfs_da_btree_corrupt(bp, _RET_IP_);
2191                         XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)",
2192                                              XFS_ERRLEVEL_LOW, mp, info);
2193                         error = XFS_ERROR(EFSCORRUPTED);
2194                         xfs_trans_brelse(trans, bp);
2195                         goto out_free;
2196                 }
2197         }
2198         *bpp = bp;
2199 out_free:
2200         if (mapp != &map)
2201                 kmem_free(mapp);
2202
2203         return error;
2204 }
2205
2206 /*
2207  * Readahead the dir/attr block.
2208  */
2209 xfs_daddr_t
2210 xfs_da_reada_buf(
2211         struct xfs_trans        *trans,
2212         struct xfs_inode        *dp,
2213         xfs_dablk_t             bno,
2214         int                     whichfork)
2215 {
2216         xfs_daddr_t             mappedbno = -1;
2217         struct xfs_buf_map      map;
2218         struct xfs_buf_map      *mapp;
2219         int                     nmap;
2220         int                     error;
2221
2222         mapp = &map;
2223         nmap = 1;
2224         error = xfs_dabuf_map(trans, dp, bno, -1, whichfork,
2225                                 &mapp, &nmap);
2226         if (error) {
2227                 /* mapping a hole is not an error, but we don't continue */
2228                 if (error == -1)
2229                         error = 0;
2230                 goto out_free;
2231         }
2232
2233         mappedbno = mapp[0].bm_bn;
2234         xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap);
2235
2236 out_free:
2237         if (mapp != &map)
2238                 kmem_free(mapp);
2239
2240         if (error)
2241                 return -1;
2242         return mappedbno;
2243 }
2244
2245 kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
2246
2247 /*
2248  * Allocate a dir-state structure.
2249  * We don't put them on the stack since they're large.
2250  */
2251 xfs_da_state_t *
2252 xfs_da_state_alloc(void)
2253 {
2254         return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
2255 }
2256
2257 /*
2258  * Kill the altpath contents of a da-state structure.
2259  */
2260 STATIC void
2261 xfs_da_state_kill_altpath(xfs_da_state_t *state)
2262 {
2263         int     i;
2264
2265         for (i = 0; i < state->altpath.active; i++)
2266                 state->altpath.blk[i].bp = NULL;
2267         state->altpath.active = 0;
2268 }
2269
2270 /*
2271  * Free a da-state structure.
2272  */
2273 void
2274 xfs_da_state_free(xfs_da_state_t *state)
2275 {
2276         xfs_da_state_kill_altpath(state);
2277 #ifdef DEBUG
2278         memset((char *)state, 0, sizeof(*state));
2279 #endif /* DEBUG */
2280         kmem_zone_free(xfs_da_state_zone, state);
2281 }