]> Pileus Git - ~andy/linux/blob - fs/xfs/xfs_btree.c
Linux 3.14
[~andy/linux] / fs / xfs / xfs_btree.c
1 /*
2  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_shared.h"
21 #include "xfs_format.h"
22 #include "xfs_log_format.h"
23 #include "xfs_trans_resv.h"
24 #include "xfs_bit.h"
25 #include "xfs_sb.h"
26 #include "xfs_ag.h"
27 #include "xfs_mount.h"
28 #include "xfs_inode.h"
29 #include "xfs_trans.h"
30 #include "xfs_inode_item.h"
31 #include "xfs_buf_item.h"
32 #include "xfs_btree.h"
33 #include "xfs_error.h"
34 #include "xfs_trace.h"
35 #include "xfs_cksum.h"
36
37 /*
38  * Cursor allocation zone.
39  */
40 kmem_zone_t     *xfs_btree_cur_zone;
41
42 /*
43  * Btree magic numbers.
44  */
45 static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
46         { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC },
47         { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
48           XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC }
49 };
50 #define xfs_btree_magic(cur) \
51         xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
52
53
54 STATIC int                              /* error (0 or EFSCORRUPTED) */
55 xfs_btree_check_lblock(
56         struct xfs_btree_cur    *cur,   /* btree cursor */
57         struct xfs_btree_block  *block, /* btree long form block pointer */
58         int                     level,  /* level of the btree block */
59         struct xfs_buf          *bp)    /* buffer for block, if any */
60 {
61         int                     lblock_ok = 1; /* block passes checks */
62         struct xfs_mount        *mp;    /* file system mount point */
63
64         mp = cur->bc_mp;
65
66         if (xfs_sb_version_hascrc(&mp->m_sb)) {
67                 lblock_ok = lblock_ok &&
68                         uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
69                         block->bb_u.l.bb_blkno == cpu_to_be64(
70                                 bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
71         }
72
73         lblock_ok = lblock_ok &&
74                 be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
75                 be16_to_cpu(block->bb_level) == level &&
76                 be16_to_cpu(block->bb_numrecs) <=
77                         cur->bc_ops->get_maxrecs(cur, level) &&
78                 block->bb_u.l.bb_leftsib &&
79                 (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
80                  XFS_FSB_SANITY_CHECK(mp,
81                         be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
82                 block->bb_u.l.bb_rightsib &&
83                 (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
84                  XFS_FSB_SANITY_CHECK(mp,
85                         be64_to_cpu(block->bb_u.l.bb_rightsib)));
86
87         if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
88                         XFS_ERRTAG_BTREE_CHECK_LBLOCK,
89                         XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
90                 if (bp)
91                         trace_xfs_btree_corrupt(bp, _RET_IP_);
92                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
93                 return XFS_ERROR(EFSCORRUPTED);
94         }
95         return 0;
96 }
97
98 STATIC int                              /* error (0 or EFSCORRUPTED) */
99 xfs_btree_check_sblock(
100         struct xfs_btree_cur    *cur,   /* btree cursor */
101         struct xfs_btree_block  *block, /* btree short form block pointer */
102         int                     level,  /* level of the btree block */
103         struct xfs_buf          *bp)    /* buffer containing block */
104 {
105         struct xfs_mount        *mp;    /* file system mount point */
106         struct xfs_buf          *agbp;  /* buffer for ag. freespace struct */
107         struct xfs_agf          *agf;   /* ag. freespace structure */
108         xfs_agblock_t           agflen; /* native ag. freespace length */
109         int                     sblock_ok = 1; /* block passes checks */
110
111         mp = cur->bc_mp;
112         agbp = cur->bc_private.a.agbp;
113         agf = XFS_BUF_TO_AGF(agbp);
114         agflen = be32_to_cpu(agf->agf_length);
115
116         if (xfs_sb_version_hascrc(&mp->m_sb)) {
117                 sblock_ok = sblock_ok &&
118                         uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
119                         block->bb_u.s.bb_blkno == cpu_to_be64(
120                                 bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
121         }
122
123         sblock_ok = sblock_ok &&
124                 be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
125                 be16_to_cpu(block->bb_level) == level &&
126                 be16_to_cpu(block->bb_numrecs) <=
127                         cur->bc_ops->get_maxrecs(cur, level) &&
128                 (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
129                  be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
130                 block->bb_u.s.bb_leftsib &&
131                 (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
132                  be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
133                 block->bb_u.s.bb_rightsib;
134
135         if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
136                         XFS_ERRTAG_BTREE_CHECK_SBLOCK,
137                         XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
138                 if (bp)
139                         trace_xfs_btree_corrupt(bp, _RET_IP_);
140                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
141                 return XFS_ERROR(EFSCORRUPTED);
142         }
143         return 0;
144 }
145
146 /*
147  * Debug routine: check that block header is ok.
148  */
149 int
150 xfs_btree_check_block(
151         struct xfs_btree_cur    *cur,   /* btree cursor */
152         struct xfs_btree_block  *block, /* generic btree block pointer */
153         int                     level,  /* level of the btree block */
154         struct xfs_buf          *bp)    /* buffer containing block, if any */
155 {
156         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
157                 return xfs_btree_check_lblock(cur, block, level, bp);
158         else
159                 return xfs_btree_check_sblock(cur, block, level, bp);
160 }
161
162 /*
163  * Check that (long) pointer is ok.
164  */
165 int                                     /* error (0 or EFSCORRUPTED) */
166 xfs_btree_check_lptr(
167         struct xfs_btree_cur    *cur,   /* btree cursor */
168         xfs_dfsbno_t            bno,    /* btree block disk address */
169         int                     level)  /* btree block level */
170 {
171         XFS_WANT_CORRUPTED_RETURN(
172                 level > 0 &&
173                 bno != NULLDFSBNO &&
174                 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
175         return 0;
176 }
177
178 #ifdef DEBUG
179 /*
180  * Check that (short) pointer is ok.
181  */
182 STATIC int                              /* error (0 or EFSCORRUPTED) */
183 xfs_btree_check_sptr(
184         struct xfs_btree_cur    *cur,   /* btree cursor */
185         xfs_agblock_t           bno,    /* btree block disk address */
186         int                     level)  /* btree block level */
187 {
188         xfs_agblock_t           agblocks = cur->bc_mp->m_sb.sb_agblocks;
189
190         XFS_WANT_CORRUPTED_RETURN(
191                 level > 0 &&
192                 bno != NULLAGBLOCK &&
193                 bno != 0 &&
194                 bno < agblocks);
195         return 0;
196 }
197
198 /*
199  * Check that block ptr is ok.
200  */
201 STATIC int                              /* error (0 or EFSCORRUPTED) */
202 xfs_btree_check_ptr(
203         struct xfs_btree_cur    *cur,   /* btree cursor */
204         union xfs_btree_ptr     *ptr,   /* btree block disk address */
205         int                     index,  /* offset from ptr to check */
206         int                     level)  /* btree block level */
207 {
208         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
209                 return xfs_btree_check_lptr(cur,
210                                 be64_to_cpu((&ptr->l)[index]), level);
211         } else {
212                 return xfs_btree_check_sptr(cur,
213                                 be32_to_cpu((&ptr->s)[index]), level);
214         }
215 }
216 #endif
217
218 /*
219  * Calculate CRC on the whole btree block and stuff it into the
220  * long-form btree header.
221  *
222  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
223  * it into the buffer so recovery knows what the last modifcation was that made
224  * it to disk.
225  */
226 void
227 xfs_btree_lblock_calc_crc(
228         struct xfs_buf          *bp)
229 {
230         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
231         struct xfs_buf_log_item *bip = bp->b_fspriv;
232
233         if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
234                 return;
235         if (bip)
236                 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
237         xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
238                          XFS_BTREE_LBLOCK_CRC_OFF);
239 }
240
241 bool
242 xfs_btree_lblock_verify_crc(
243         struct xfs_buf          *bp)
244 {
245         if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
246                 return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
247                                         XFS_BTREE_LBLOCK_CRC_OFF);
248         return true;
249 }
250
251 /*
252  * Calculate CRC on the whole btree block and stuff it into the
253  * short-form btree header.
254  *
255  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
256  * it into the buffer so recovery knows what the last modifcation was that made
257  * it to disk.
258  */
259 void
260 xfs_btree_sblock_calc_crc(
261         struct xfs_buf          *bp)
262 {
263         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
264         struct xfs_buf_log_item *bip = bp->b_fspriv;
265
266         if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
267                 return;
268         if (bip)
269                 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
270         xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
271                          XFS_BTREE_SBLOCK_CRC_OFF);
272 }
273
274 bool
275 xfs_btree_sblock_verify_crc(
276         struct xfs_buf          *bp)
277 {
278         if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
279                 return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
280                                         XFS_BTREE_SBLOCK_CRC_OFF);
281         return true;
282 }
283
284 /*
285  * Delete the btree cursor.
286  */
287 void
288 xfs_btree_del_cursor(
289         xfs_btree_cur_t *cur,           /* btree cursor */
290         int             error)          /* del because of error */
291 {
292         int             i;              /* btree level */
293
294         /*
295          * Clear the buffer pointers, and release the buffers.
296          * If we're doing this in the face of an error, we
297          * need to make sure to inspect all of the entries
298          * in the bc_bufs array for buffers to be unlocked.
299          * This is because some of the btree code works from
300          * level n down to 0, and if we get an error along
301          * the way we won't have initialized all the entries
302          * down to 0.
303          */
304         for (i = 0; i < cur->bc_nlevels; i++) {
305                 if (cur->bc_bufs[i])
306                         xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
307                 else if (!error)
308                         break;
309         }
310         /*
311          * Can't free a bmap cursor without having dealt with the
312          * allocated indirect blocks' accounting.
313          */
314         ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
315                cur->bc_private.b.allocated == 0);
316         /*
317          * Free the cursor.
318          */
319         kmem_zone_free(xfs_btree_cur_zone, cur);
320 }
321
322 /*
323  * Duplicate the btree cursor.
324  * Allocate a new one, copy the record, re-get the buffers.
325  */
326 int                                     /* error */
327 xfs_btree_dup_cursor(
328         xfs_btree_cur_t *cur,           /* input cursor */
329         xfs_btree_cur_t **ncur)         /* output cursor */
330 {
331         xfs_buf_t       *bp;            /* btree block's buffer pointer */
332         int             error;          /* error return value */
333         int             i;              /* level number of btree block */
334         xfs_mount_t     *mp;            /* mount structure for filesystem */
335         xfs_btree_cur_t *new;           /* new cursor value */
336         xfs_trans_t     *tp;            /* transaction pointer, can be NULL */
337
338         tp = cur->bc_tp;
339         mp = cur->bc_mp;
340
341         /*
342          * Allocate a new cursor like the old one.
343          */
344         new = cur->bc_ops->dup_cursor(cur);
345
346         /*
347          * Copy the record currently in the cursor.
348          */
349         new->bc_rec = cur->bc_rec;
350
351         /*
352          * For each level current, re-get the buffer and copy the ptr value.
353          */
354         for (i = 0; i < new->bc_nlevels; i++) {
355                 new->bc_ptrs[i] = cur->bc_ptrs[i];
356                 new->bc_ra[i] = cur->bc_ra[i];
357                 bp = cur->bc_bufs[i];
358                 if (bp) {
359                         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
360                                                    XFS_BUF_ADDR(bp), mp->m_bsize,
361                                                    0, &bp,
362                                                    cur->bc_ops->buf_ops);
363                         if (error) {
364                                 xfs_btree_del_cursor(new, error);
365                                 *ncur = NULL;
366                                 return error;
367                         }
368                 }
369                 new->bc_bufs[i] = bp;
370         }
371         *ncur = new;
372         return 0;
373 }
374
375 /*
376  * XFS btree block layout and addressing:
377  *
378  * There are two types of blocks in the btree: leaf and non-leaf blocks.
379  *
380  * The leaf record start with a header then followed by records containing
381  * the values.  A non-leaf block also starts with the same header, and
382  * then first contains lookup keys followed by an equal number of pointers
383  * to the btree blocks at the previous level.
384  *
385  *              +--------+-------+-------+-------+-------+-------+-------+
386  * Leaf:        | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
387  *              +--------+-------+-------+-------+-------+-------+-------+
388  *
389  *              +--------+-------+-------+-------+-------+-------+-------+
390  * Non-Leaf:    | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
391  *              +--------+-------+-------+-------+-------+-------+-------+
392  *
393  * The header is called struct xfs_btree_block for reasons better left unknown
394  * and comes in different versions for short (32bit) and long (64bit) block
395  * pointers.  The record and key structures are defined by the btree instances
396  * and opaque to the btree core.  The block pointers are simple disk endian
397  * integers, available in a short (32bit) and long (64bit) variant.
398  *
399  * The helpers below calculate the offset of a given record, key or pointer
400  * into a btree block (xfs_btree_*_offset) or return a pointer to the given
401  * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
402  * inside the btree block is done using indices starting at one, not zero!
403  */
404
405 /*
406  * Return size of the btree block header for this btree instance.
407  */
408 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
409 {
410         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
411                 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
412                         return XFS_BTREE_LBLOCK_CRC_LEN;
413                 return XFS_BTREE_LBLOCK_LEN;
414         }
415         if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
416                 return XFS_BTREE_SBLOCK_CRC_LEN;
417         return XFS_BTREE_SBLOCK_LEN;
418 }
419
420 /*
421  * Return size of btree block pointers for this btree instance.
422  */
423 static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
424 {
425         return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
426                 sizeof(__be64) : sizeof(__be32);
427 }
428
429 /*
430  * Calculate offset of the n-th record in a btree block.
431  */
432 STATIC size_t
433 xfs_btree_rec_offset(
434         struct xfs_btree_cur    *cur,
435         int                     n)
436 {
437         return xfs_btree_block_len(cur) +
438                 (n - 1) * cur->bc_ops->rec_len;
439 }
440
441 /*
442  * Calculate offset of the n-th key in a btree block.
443  */
444 STATIC size_t
445 xfs_btree_key_offset(
446         struct xfs_btree_cur    *cur,
447         int                     n)
448 {
449         return xfs_btree_block_len(cur) +
450                 (n - 1) * cur->bc_ops->key_len;
451 }
452
453 /*
454  * Calculate offset of the n-th block pointer in a btree block.
455  */
456 STATIC size_t
457 xfs_btree_ptr_offset(
458         struct xfs_btree_cur    *cur,
459         int                     n,
460         int                     level)
461 {
462         return xfs_btree_block_len(cur) +
463                 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
464                 (n - 1) * xfs_btree_ptr_len(cur);
465 }
466
467 /*
468  * Return a pointer to the n-th record in the btree block.
469  */
470 STATIC union xfs_btree_rec *
471 xfs_btree_rec_addr(
472         struct xfs_btree_cur    *cur,
473         int                     n,
474         struct xfs_btree_block  *block)
475 {
476         return (union xfs_btree_rec *)
477                 ((char *)block + xfs_btree_rec_offset(cur, n));
478 }
479
480 /*
481  * Return a pointer to the n-th key in the btree block.
482  */
483 STATIC union xfs_btree_key *
484 xfs_btree_key_addr(
485         struct xfs_btree_cur    *cur,
486         int                     n,
487         struct xfs_btree_block  *block)
488 {
489         return (union xfs_btree_key *)
490                 ((char *)block + xfs_btree_key_offset(cur, n));
491 }
492
493 /*
494  * Return a pointer to the n-th block pointer in the btree block.
495  */
496 STATIC union xfs_btree_ptr *
497 xfs_btree_ptr_addr(
498         struct xfs_btree_cur    *cur,
499         int                     n,
500         struct xfs_btree_block  *block)
501 {
502         int                     level = xfs_btree_get_level(block);
503
504         ASSERT(block->bb_level != 0);
505
506         return (union xfs_btree_ptr *)
507                 ((char *)block + xfs_btree_ptr_offset(cur, n, level));
508 }
509
510 /*
511  * Get the root block which is stored in the inode.
512  *
513  * For now this btree implementation assumes the btree root is always
514  * stored in the if_broot field of an inode fork.
515  */
516 STATIC struct xfs_btree_block *
517 xfs_btree_get_iroot(
518        struct xfs_btree_cur    *cur)
519 {
520        struct xfs_ifork        *ifp;
521
522        ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
523        return (struct xfs_btree_block *)ifp->if_broot;
524 }
525
526 /*
527  * Retrieve the block pointer from the cursor at the given level.
528  * This may be an inode btree root or from a buffer.
529  */
530 STATIC struct xfs_btree_block *         /* generic btree block pointer */
531 xfs_btree_get_block(
532         struct xfs_btree_cur    *cur,   /* btree cursor */
533         int                     level,  /* level in btree */
534         struct xfs_buf          **bpp)  /* buffer containing the block */
535 {
536         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
537             (level == cur->bc_nlevels - 1)) {
538                 *bpp = NULL;
539                 return xfs_btree_get_iroot(cur);
540         }
541
542         *bpp = cur->bc_bufs[level];
543         return XFS_BUF_TO_BLOCK(*bpp);
544 }
545
546 /*
547  * Get a buffer for the block, return it with no data read.
548  * Long-form addressing.
549  */
550 xfs_buf_t *                             /* buffer for fsbno */
551 xfs_btree_get_bufl(
552         xfs_mount_t     *mp,            /* file system mount point */
553         xfs_trans_t     *tp,            /* transaction pointer */
554         xfs_fsblock_t   fsbno,          /* file system block number */
555         uint            lock)           /* lock flags for get_buf */
556 {
557         xfs_buf_t       *bp;            /* buffer pointer (return value) */
558         xfs_daddr_t             d;              /* real disk block address */
559
560         ASSERT(fsbno != NULLFSBLOCK);
561         d = XFS_FSB_TO_DADDR(mp, fsbno);
562         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
563         ASSERT(!xfs_buf_geterror(bp));
564         return bp;
565 }
566
567 /*
568  * Get a buffer for the block, return it with no data read.
569  * Short-form addressing.
570  */
571 xfs_buf_t *                             /* buffer for agno/agbno */
572 xfs_btree_get_bufs(
573         xfs_mount_t     *mp,            /* file system mount point */
574         xfs_trans_t     *tp,            /* transaction pointer */
575         xfs_agnumber_t  agno,           /* allocation group number */
576         xfs_agblock_t   agbno,          /* allocation group block number */
577         uint            lock)           /* lock flags for get_buf */
578 {
579         xfs_buf_t       *bp;            /* buffer pointer (return value) */
580         xfs_daddr_t             d;              /* real disk block address */
581
582         ASSERT(agno != NULLAGNUMBER);
583         ASSERT(agbno != NULLAGBLOCK);
584         d = XFS_AGB_TO_DADDR(mp, agno, agbno);
585         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
586         ASSERT(!xfs_buf_geterror(bp));
587         return bp;
588 }
589
590 /*
591  * Check for the cursor referring to the last block at the given level.
592  */
593 int                                     /* 1=is last block, 0=not last block */
594 xfs_btree_islastblock(
595         xfs_btree_cur_t         *cur,   /* btree cursor */
596         int                     level)  /* level to check */
597 {
598         struct xfs_btree_block  *block; /* generic btree block pointer */
599         xfs_buf_t               *bp;    /* buffer containing block */
600
601         block = xfs_btree_get_block(cur, level, &bp);
602         xfs_btree_check_block(cur, block, level, bp);
603         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
604                 return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO);
605         else
606                 return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
607 }
608
609 /*
610  * Change the cursor to point to the first record at the given level.
611  * Other levels are unaffected.
612  */
613 STATIC int                              /* success=1, failure=0 */
614 xfs_btree_firstrec(
615         xfs_btree_cur_t         *cur,   /* btree cursor */
616         int                     level)  /* level to change */
617 {
618         struct xfs_btree_block  *block; /* generic btree block pointer */
619         xfs_buf_t               *bp;    /* buffer containing block */
620
621         /*
622          * Get the block pointer for this level.
623          */
624         block = xfs_btree_get_block(cur, level, &bp);
625         xfs_btree_check_block(cur, block, level, bp);
626         /*
627          * It's empty, there is no such record.
628          */
629         if (!block->bb_numrecs)
630                 return 0;
631         /*
632          * Set the ptr value to 1, that's the first record/key.
633          */
634         cur->bc_ptrs[level] = 1;
635         return 1;
636 }
637
638 /*
639  * Change the cursor to point to the last record in the current block
640  * at the given level.  Other levels are unaffected.
641  */
642 STATIC int                              /* success=1, failure=0 */
643 xfs_btree_lastrec(
644         xfs_btree_cur_t         *cur,   /* btree cursor */
645         int                     level)  /* level to change */
646 {
647         struct xfs_btree_block  *block; /* generic btree block pointer */
648         xfs_buf_t               *bp;    /* buffer containing block */
649
650         /*
651          * Get the block pointer for this level.
652          */
653         block = xfs_btree_get_block(cur, level, &bp);
654         xfs_btree_check_block(cur, block, level, bp);
655         /*
656          * It's empty, there is no such record.
657          */
658         if (!block->bb_numrecs)
659                 return 0;
660         /*
661          * Set the ptr value to numrecs, that's the last record/key.
662          */
663         cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
664         return 1;
665 }
666
667 /*
668  * Compute first and last byte offsets for the fields given.
669  * Interprets the offsets table, which contains struct field offsets.
670  */
671 void
672 xfs_btree_offsets(
673         __int64_t       fields,         /* bitmask of fields */
674         const short     *offsets,       /* table of field offsets */
675         int             nbits,          /* number of bits to inspect */
676         int             *first,         /* output: first byte offset */
677         int             *last)          /* output: last byte offset */
678 {
679         int             i;              /* current bit number */
680         __int64_t       imask;          /* mask for current bit number */
681
682         ASSERT(fields != 0);
683         /*
684          * Find the lowest bit, so the first byte offset.
685          */
686         for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
687                 if (imask & fields) {
688                         *first = offsets[i];
689                         break;
690                 }
691         }
692         /*
693          * Find the highest bit, so the last byte offset.
694          */
695         for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
696                 if (imask & fields) {
697                         *last = offsets[i + 1] - 1;
698                         break;
699                 }
700         }
701 }
702
703 /*
704  * Get a buffer for the block, return it read in.
705  * Long-form addressing.
706  */
707 int
708 xfs_btree_read_bufl(
709         struct xfs_mount        *mp,            /* file system mount point */
710         struct xfs_trans        *tp,            /* transaction pointer */
711         xfs_fsblock_t           fsbno,          /* file system block number */
712         uint                    lock,           /* lock flags for read_buf */
713         struct xfs_buf          **bpp,          /* buffer for fsbno */
714         int                     refval,         /* ref count value for buffer */
715         const struct xfs_buf_ops *ops)
716 {
717         struct xfs_buf          *bp;            /* return value */
718         xfs_daddr_t             d;              /* real disk block address */
719         int                     error;
720
721         ASSERT(fsbno != NULLFSBLOCK);
722         d = XFS_FSB_TO_DADDR(mp, fsbno);
723         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
724                                    mp->m_bsize, lock, &bp, ops);
725         if (error)
726                 return error;
727         ASSERT(!xfs_buf_geterror(bp));
728         if (bp)
729                 xfs_buf_set_ref(bp, refval);
730         *bpp = bp;
731         return 0;
732 }
733
734 /*
735  * Read-ahead the block, don't wait for it, don't return a buffer.
736  * Long-form addressing.
737  */
738 /* ARGSUSED */
739 void
740 xfs_btree_reada_bufl(
741         struct xfs_mount        *mp,            /* file system mount point */
742         xfs_fsblock_t           fsbno,          /* file system block number */
743         xfs_extlen_t            count,          /* count of filesystem blocks */
744         const struct xfs_buf_ops *ops)
745 {
746         xfs_daddr_t             d;
747
748         ASSERT(fsbno != NULLFSBLOCK);
749         d = XFS_FSB_TO_DADDR(mp, fsbno);
750         xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
751 }
752
753 /*
754  * Read-ahead the block, don't wait for it, don't return a buffer.
755  * Short-form addressing.
756  */
757 /* ARGSUSED */
758 void
759 xfs_btree_reada_bufs(
760         struct xfs_mount        *mp,            /* file system mount point */
761         xfs_agnumber_t          agno,           /* allocation group number */
762         xfs_agblock_t           agbno,          /* allocation group block number */
763         xfs_extlen_t            count,          /* count of filesystem blocks */
764         const struct xfs_buf_ops *ops)
765 {
766         xfs_daddr_t             d;
767
768         ASSERT(agno != NULLAGNUMBER);
769         ASSERT(agbno != NULLAGBLOCK);
770         d = XFS_AGB_TO_DADDR(mp, agno, agbno);
771         xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
772 }
773
774 STATIC int
775 xfs_btree_readahead_lblock(
776         struct xfs_btree_cur    *cur,
777         int                     lr,
778         struct xfs_btree_block  *block)
779 {
780         int                     rval = 0;
781         xfs_dfsbno_t            left = be64_to_cpu(block->bb_u.l.bb_leftsib);
782         xfs_dfsbno_t            right = be64_to_cpu(block->bb_u.l.bb_rightsib);
783
784         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
785                 xfs_btree_reada_bufl(cur->bc_mp, left, 1,
786                                      cur->bc_ops->buf_ops);
787                 rval++;
788         }
789
790         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
791                 xfs_btree_reada_bufl(cur->bc_mp, right, 1,
792                                      cur->bc_ops->buf_ops);
793                 rval++;
794         }
795
796         return rval;
797 }
798
799 STATIC int
800 xfs_btree_readahead_sblock(
801         struct xfs_btree_cur    *cur,
802         int                     lr,
803         struct xfs_btree_block *block)
804 {
805         int                     rval = 0;
806         xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
807         xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
808
809
810         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
811                 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
812                                      left, 1, cur->bc_ops->buf_ops);
813                 rval++;
814         }
815
816         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
817                 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
818                                      right, 1, cur->bc_ops->buf_ops);
819                 rval++;
820         }
821
822         return rval;
823 }
824
825 /*
826  * Read-ahead btree blocks, at the given level.
827  * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
828  */
829 STATIC int
830 xfs_btree_readahead(
831         struct xfs_btree_cur    *cur,           /* btree cursor */
832         int                     lev,            /* level in btree */
833         int                     lr)             /* left/right bits */
834 {
835         struct xfs_btree_block  *block;
836
837         /*
838          * No readahead needed if we are at the root level and the
839          * btree root is stored in the inode.
840          */
841         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
842             (lev == cur->bc_nlevels - 1))
843                 return 0;
844
845         if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
846                 return 0;
847
848         cur->bc_ra[lev] |= lr;
849         block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
850
851         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
852                 return xfs_btree_readahead_lblock(cur, lr, block);
853         return xfs_btree_readahead_sblock(cur, lr, block);
854 }
855
856 STATIC xfs_daddr_t
857 xfs_btree_ptr_to_daddr(
858         struct xfs_btree_cur    *cur,
859         union xfs_btree_ptr     *ptr)
860 {
861         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
862                 ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
863
864                 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
865         } else {
866                 ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
867                 ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
868
869                 return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
870                                         be32_to_cpu(ptr->s));
871         }
872 }
873
874 /*
875  * Readahead @count btree blocks at the given @ptr location.
876  *
877  * We don't need to care about long or short form btrees here as we have a
878  * method of converting the ptr directly to a daddr available to us.
879  */
880 STATIC void
881 xfs_btree_readahead_ptr(
882         struct xfs_btree_cur    *cur,
883         union xfs_btree_ptr     *ptr,
884         xfs_extlen_t            count)
885 {
886         xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
887                           xfs_btree_ptr_to_daddr(cur, ptr),
888                           cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
889 }
890
891 /*
892  * Set the buffer for level "lev" in the cursor to bp, releasing
893  * any previous buffer.
894  */
895 STATIC void
896 xfs_btree_setbuf(
897         xfs_btree_cur_t         *cur,   /* btree cursor */
898         int                     lev,    /* level in btree */
899         xfs_buf_t               *bp)    /* new buffer to set */
900 {
901         struct xfs_btree_block  *b;     /* btree block */
902
903         if (cur->bc_bufs[lev])
904                 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
905         cur->bc_bufs[lev] = bp;
906         cur->bc_ra[lev] = 0;
907
908         b = XFS_BUF_TO_BLOCK(bp);
909         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
910                 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO))
911                         cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
912                 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO))
913                         cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
914         } else {
915                 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
916                         cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
917                 if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
918                         cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
919         }
920 }
921
922 STATIC int
923 xfs_btree_ptr_is_null(
924         struct xfs_btree_cur    *cur,
925         union xfs_btree_ptr     *ptr)
926 {
927         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
928                 return ptr->l == cpu_to_be64(NULLDFSBNO);
929         else
930                 return ptr->s == cpu_to_be32(NULLAGBLOCK);
931 }
932
933 STATIC void
934 xfs_btree_set_ptr_null(
935         struct xfs_btree_cur    *cur,
936         union xfs_btree_ptr     *ptr)
937 {
938         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
939                 ptr->l = cpu_to_be64(NULLDFSBNO);
940         else
941                 ptr->s = cpu_to_be32(NULLAGBLOCK);
942 }
943
944 /*
945  * Get/set/init sibling pointers
946  */
947 STATIC void
948 xfs_btree_get_sibling(
949         struct xfs_btree_cur    *cur,
950         struct xfs_btree_block  *block,
951         union xfs_btree_ptr     *ptr,
952         int                     lr)
953 {
954         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
955
956         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
957                 if (lr == XFS_BB_RIGHTSIB)
958                         ptr->l = block->bb_u.l.bb_rightsib;
959                 else
960                         ptr->l = block->bb_u.l.bb_leftsib;
961         } else {
962                 if (lr == XFS_BB_RIGHTSIB)
963                         ptr->s = block->bb_u.s.bb_rightsib;
964                 else
965                         ptr->s = block->bb_u.s.bb_leftsib;
966         }
967 }
968
969 STATIC void
970 xfs_btree_set_sibling(
971         struct xfs_btree_cur    *cur,
972         struct xfs_btree_block  *block,
973         union xfs_btree_ptr     *ptr,
974         int                     lr)
975 {
976         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
977
978         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
979                 if (lr == XFS_BB_RIGHTSIB)
980                         block->bb_u.l.bb_rightsib = ptr->l;
981                 else
982                         block->bb_u.l.bb_leftsib = ptr->l;
983         } else {
984                 if (lr == XFS_BB_RIGHTSIB)
985                         block->bb_u.s.bb_rightsib = ptr->s;
986                 else
987                         block->bb_u.s.bb_leftsib = ptr->s;
988         }
989 }
990
991 void
992 xfs_btree_init_block_int(
993         struct xfs_mount        *mp,
994         struct xfs_btree_block  *buf,
995         xfs_daddr_t             blkno,
996         __u32                   magic,
997         __u16                   level,
998         __u16                   numrecs,
999         __u64                   owner,
1000         unsigned int            flags)
1001 {
1002         buf->bb_magic = cpu_to_be32(magic);
1003         buf->bb_level = cpu_to_be16(level);
1004         buf->bb_numrecs = cpu_to_be16(numrecs);
1005
1006         if (flags & XFS_BTREE_LONG_PTRS) {
1007                 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
1008                 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
1009                 if (flags & XFS_BTREE_CRC_BLOCKS) {
1010                         buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1011                         buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1012                         uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
1013                         buf->bb_u.l.bb_pad = 0;
1014                         buf->bb_u.l.bb_lsn = 0;
1015                 }
1016         } else {
1017                 /* owner is a 32 bit value on short blocks */
1018                 __u32 __owner = (__u32)owner;
1019
1020                 buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1021                 buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1022                 if (flags & XFS_BTREE_CRC_BLOCKS) {
1023                         buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1024                         buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1025                         uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
1026                         buf->bb_u.s.bb_lsn = 0;
1027                 }
1028         }
1029 }
1030
1031 void
1032 xfs_btree_init_block(
1033         struct xfs_mount *mp,
1034         struct xfs_buf  *bp,
1035         __u32           magic,
1036         __u16           level,
1037         __u16           numrecs,
1038         __u64           owner,
1039         unsigned int    flags)
1040 {
1041         xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1042                                  magic, level, numrecs, owner, flags);
1043 }
1044
1045 STATIC void
1046 xfs_btree_init_block_cur(
1047         struct xfs_btree_cur    *cur,
1048         struct xfs_buf          *bp,
1049         int                     level,
1050         int                     numrecs)
1051 {
1052         __u64 owner;
1053
1054         /*
1055          * we can pull the owner from the cursor right now as the different
1056          * owners align directly with the pointer size of the btree. This may
1057          * change in future, but is safe for current users of the generic btree
1058          * code.
1059          */
1060         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1061                 owner = cur->bc_private.b.ip->i_ino;
1062         else
1063                 owner = cur->bc_private.a.agno;
1064
1065         xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1066                                  xfs_btree_magic(cur), level, numrecs,
1067                                  owner, cur->bc_flags);
1068 }
1069
1070 /*
1071  * Return true if ptr is the last record in the btree and
1072  * we need to track updates to this record.  The decision
1073  * will be further refined in the update_lastrec method.
1074  */
1075 STATIC int
1076 xfs_btree_is_lastrec(
1077         struct xfs_btree_cur    *cur,
1078         struct xfs_btree_block  *block,
1079         int                     level)
1080 {
1081         union xfs_btree_ptr     ptr;
1082
1083         if (level > 0)
1084                 return 0;
1085         if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1086                 return 0;
1087
1088         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1089         if (!xfs_btree_ptr_is_null(cur, &ptr))
1090                 return 0;
1091         return 1;
1092 }
1093
1094 STATIC void
1095 xfs_btree_buf_to_ptr(
1096         struct xfs_btree_cur    *cur,
1097         struct xfs_buf          *bp,
1098         union xfs_btree_ptr     *ptr)
1099 {
1100         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1101                 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1102                                         XFS_BUF_ADDR(bp)));
1103         else {
1104                 ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1105                                         XFS_BUF_ADDR(bp)));
1106         }
1107 }
1108
1109 STATIC void
1110 xfs_btree_set_refs(
1111         struct xfs_btree_cur    *cur,
1112         struct xfs_buf          *bp)
1113 {
1114         switch (cur->bc_btnum) {
1115         case XFS_BTNUM_BNO:
1116         case XFS_BTNUM_CNT:
1117                 xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1118                 break;
1119         case XFS_BTNUM_INO:
1120                 xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1121                 break;
1122         case XFS_BTNUM_BMAP:
1123                 xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1124                 break;
1125         default:
1126                 ASSERT(0);
1127         }
1128 }
1129
1130 STATIC int
1131 xfs_btree_get_buf_block(
1132         struct xfs_btree_cur    *cur,
1133         union xfs_btree_ptr     *ptr,
1134         int                     flags,
1135         struct xfs_btree_block  **block,
1136         struct xfs_buf          **bpp)
1137 {
1138         struct xfs_mount        *mp = cur->bc_mp;
1139         xfs_daddr_t             d;
1140
1141         /* need to sort out how callers deal with failures first */
1142         ASSERT(!(flags & XBF_TRYLOCK));
1143
1144         d = xfs_btree_ptr_to_daddr(cur, ptr);
1145         *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1146                                  mp->m_bsize, flags);
1147
1148         if (!*bpp)
1149                 return ENOMEM;
1150
1151         (*bpp)->b_ops = cur->bc_ops->buf_ops;
1152         *block = XFS_BUF_TO_BLOCK(*bpp);
1153         return 0;
1154 }
1155
1156 /*
1157  * Read in the buffer at the given ptr and return the buffer and
1158  * the block pointer within the buffer.
1159  */
1160 STATIC int
1161 xfs_btree_read_buf_block(
1162         struct xfs_btree_cur    *cur,
1163         union xfs_btree_ptr     *ptr,
1164         int                     level,
1165         int                     flags,
1166         struct xfs_btree_block  **block,
1167         struct xfs_buf          **bpp)
1168 {
1169         struct xfs_mount        *mp = cur->bc_mp;
1170         xfs_daddr_t             d;
1171         int                     error;
1172
1173         /* need to sort out how callers deal with failures first */
1174         ASSERT(!(flags & XBF_TRYLOCK));
1175
1176         d = xfs_btree_ptr_to_daddr(cur, ptr);
1177         error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1178                                    mp->m_bsize, flags, bpp,
1179                                    cur->bc_ops->buf_ops);
1180         if (error)
1181                 return error;
1182
1183         ASSERT(!xfs_buf_geterror(*bpp));
1184         xfs_btree_set_refs(cur, *bpp);
1185         *block = XFS_BUF_TO_BLOCK(*bpp);
1186         return 0;
1187 }
1188
1189 /*
1190  * Copy keys from one btree block to another.
1191  */
1192 STATIC void
1193 xfs_btree_copy_keys(
1194         struct xfs_btree_cur    *cur,
1195         union xfs_btree_key     *dst_key,
1196         union xfs_btree_key     *src_key,
1197         int                     numkeys)
1198 {
1199         ASSERT(numkeys >= 0);
1200         memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1201 }
1202
1203 /*
1204  * Copy records from one btree block to another.
1205  */
1206 STATIC void
1207 xfs_btree_copy_recs(
1208         struct xfs_btree_cur    *cur,
1209         union xfs_btree_rec     *dst_rec,
1210         union xfs_btree_rec     *src_rec,
1211         int                     numrecs)
1212 {
1213         ASSERT(numrecs >= 0);
1214         memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1215 }
1216
1217 /*
1218  * Copy block pointers from one btree block to another.
1219  */
1220 STATIC void
1221 xfs_btree_copy_ptrs(
1222         struct xfs_btree_cur    *cur,
1223         union xfs_btree_ptr     *dst_ptr,
1224         union xfs_btree_ptr     *src_ptr,
1225         int                     numptrs)
1226 {
1227         ASSERT(numptrs >= 0);
1228         memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1229 }
1230
1231 /*
1232  * Shift keys one index left/right inside a single btree block.
1233  */
1234 STATIC void
1235 xfs_btree_shift_keys(
1236         struct xfs_btree_cur    *cur,
1237         union xfs_btree_key     *key,
1238         int                     dir,
1239         int                     numkeys)
1240 {
1241         char                    *dst_key;
1242
1243         ASSERT(numkeys >= 0);
1244         ASSERT(dir == 1 || dir == -1);
1245
1246         dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1247         memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1248 }
1249
1250 /*
1251  * Shift records one index left/right inside a single btree block.
1252  */
1253 STATIC void
1254 xfs_btree_shift_recs(
1255         struct xfs_btree_cur    *cur,
1256         union xfs_btree_rec     *rec,
1257         int                     dir,
1258         int                     numrecs)
1259 {
1260         char                    *dst_rec;
1261
1262         ASSERT(numrecs >= 0);
1263         ASSERT(dir == 1 || dir == -1);
1264
1265         dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1266         memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1267 }
1268
1269 /*
1270  * Shift block pointers one index left/right inside a single btree block.
1271  */
1272 STATIC void
1273 xfs_btree_shift_ptrs(
1274         struct xfs_btree_cur    *cur,
1275         union xfs_btree_ptr     *ptr,
1276         int                     dir,
1277         int                     numptrs)
1278 {
1279         char                    *dst_ptr;
1280
1281         ASSERT(numptrs >= 0);
1282         ASSERT(dir == 1 || dir == -1);
1283
1284         dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1285         memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1286 }
1287
1288 /*
1289  * Log key values from the btree block.
1290  */
1291 STATIC void
1292 xfs_btree_log_keys(
1293         struct xfs_btree_cur    *cur,
1294         struct xfs_buf          *bp,
1295         int                     first,
1296         int                     last)
1297 {
1298         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1299         XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1300
1301         if (bp) {
1302                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1303                 xfs_trans_log_buf(cur->bc_tp, bp,
1304                                   xfs_btree_key_offset(cur, first),
1305                                   xfs_btree_key_offset(cur, last + 1) - 1);
1306         } else {
1307                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1308                                 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1309         }
1310
1311         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1312 }
1313
1314 /*
1315  * Log record values from the btree block.
1316  */
1317 void
1318 xfs_btree_log_recs(
1319         struct xfs_btree_cur    *cur,
1320         struct xfs_buf          *bp,
1321         int                     first,
1322         int                     last)
1323 {
1324         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1325         XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1326
1327         xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1328         xfs_trans_log_buf(cur->bc_tp, bp,
1329                           xfs_btree_rec_offset(cur, first),
1330                           xfs_btree_rec_offset(cur, last + 1) - 1);
1331
1332         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1333 }
1334
1335 /*
1336  * Log block pointer fields from a btree block (nonleaf).
1337  */
1338 STATIC void
1339 xfs_btree_log_ptrs(
1340         struct xfs_btree_cur    *cur,   /* btree cursor */
1341         struct xfs_buf          *bp,    /* buffer containing btree block */
1342         int                     first,  /* index of first pointer to log */
1343         int                     last)   /* index of last pointer to log */
1344 {
1345         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1346         XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1347
1348         if (bp) {
1349                 struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
1350                 int                     level = xfs_btree_get_level(block);
1351
1352                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1353                 xfs_trans_log_buf(cur->bc_tp, bp,
1354                                 xfs_btree_ptr_offset(cur, first, level),
1355                                 xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1356         } else {
1357                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1358                         xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1359         }
1360
1361         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1362 }
1363
1364 /*
1365  * Log fields from a btree block header.
1366  */
1367 void
1368 xfs_btree_log_block(
1369         struct xfs_btree_cur    *cur,   /* btree cursor */
1370         struct xfs_buf          *bp,    /* buffer containing btree block */
1371         int                     fields) /* mask of fields: XFS_BB_... */
1372 {
1373         int                     first;  /* first byte offset logged */
1374         int                     last;   /* last byte offset logged */
1375         static const short      soffsets[] = {  /* table of offsets (short) */
1376                 offsetof(struct xfs_btree_block, bb_magic),
1377                 offsetof(struct xfs_btree_block, bb_level),
1378                 offsetof(struct xfs_btree_block, bb_numrecs),
1379                 offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1380                 offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1381                 offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1382                 offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1383                 offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1384                 offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1385                 offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1386                 XFS_BTREE_SBLOCK_CRC_LEN
1387         };
1388         static const short      loffsets[] = {  /* table of offsets (long) */
1389                 offsetof(struct xfs_btree_block, bb_magic),
1390                 offsetof(struct xfs_btree_block, bb_level),
1391                 offsetof(struct xfs_btree_block, bb_numrecs),
1392                 offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1393                 offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1394                 offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1395                 offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1396                 offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1397                 offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1398                 offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1399                 offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1400                 XFS_BTREE_LBLOCK_CRC_LEN
1401         };
1402
1403         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1404         XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
1405
1406         if (bp) {
1407                 int nbits;
1408
1409                 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1410                         /*
1411                          * We don't log the CRC when updating a btree
1412                          * block but instead recreate it during log
1413                          * recovery.  As the log buffers have checksums
1414                          * of their own this is safe and avoids logging a crc
1415                          * update in a lot of places.
1416                          */
1417                         if (fields == XFS_BB_ALL_BITS)
1418                                 fields = XFS_BB_ALL_BITS_CRC;
1419                         nbits = XFS_BB_NUM_BITS_CRC;
1420                 } else {
1421                         nbits = XFS_BB_NUM_BITS;
1422                 }
1423                 xfs_btree_offsets(fields,
1424                                   (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1425                                         loffsets : soffsets,
1426                                   nbits, &first, &last);
1427                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1428                 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1429         } else {
1430                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1431                         xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1432         }
1433
1434         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1435 }
1436
1437 /*
1438  * Increment cursor by one record at the level.
1439  * For nonzero levels the leaf-ward information is untouched.
1440  */
1441 int                                             /* error */
1442 xfs_btree_increment(
1443         struct xfs_btree_cur    *cur,
1444         int                     level,
1445         int                     *stat)          /* success/failure */
1446 {
1447         struct xfs_btree_block  *block;
1448         union xfs_btree_ptr     ptr;
1449         struct xfs_buf          *bp;
1450         int                     error;          /* error return value */
1451         int                     lev;
1452
1453         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1454         XFS_BTREE_TRACE_ARGI(cur, level);
1455
1456         ASSERT(level < cur->bc_nlevels);
1457
1458         /* Read-ahead to the right at this level. */
1459         xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1460
1461         /* Get a pointer to the btree block. */
1462         block = xfs_btree_get_block(cur, level, &bp);
1463
1464 #ifdef DEBUG
1465         error = xfs_btree_check_block(cur, block, level, bp);
1466         if (error)
1467                 goto error0;
1468 #endif
1469
1470         /* We're done if we remain in the block after the increment. */
1471         if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1472                 goto out1;
1473
1474         /* Fail if we just went off the right edge of the tree. */
1475         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1476         if (xfs_btree_ptr_is_null(cur, &ptr))
1477                 goto out0;
1478
1479         XFS_BTREE_STATS_INC(cur, increment);
1480
1481         /*
1482          * March up the tree incrementing pointers.
1483          * Stop when we don't go off the right edge of a block.
1484          */
1485         for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1486                 block = xfs_btree_get_block(cur, lev, &bp);
1487
1488 #ifdef DEBUG
1489                 error = xfs_btree_check_block(cur, block, lev, bp);
1490                 if (error)
1491                         goto error0;
1492 #endif
1493
1494                 if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1495                         break;
1496
1497                 /* Read-ahead the right block for the next loop. */
1498                 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1499         }
1500
1501         /*
1502          * If we went off the root then we are either seriously
1503          * confused or have the tree root in an inode.
1504          */
1505         if (lev == cur->bc_nlevels) {
1506                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1507                         goto out0;
1508                 ASSERT(0);
1509                 error = EFSCORRUPTED;
1510                 goto error0;
1511         }
1512         ASSERT(lev < cur->bc_nlevels);
1513
1514         /*
1515          * Now walk back down the tree, fixing up the cursor's buffer
1516          * pointers and key numbers.
1517          */
1518         for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1519                 union xfs_btree_ptr     *ptrp;
1520
1521                 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1522                 error = xfs_btree_read_buf_block(cur, ptrp, --lev,
1523                                                         0, &block, &bp);
1524                 if (error)
1525                         goto error0;
1526
1527                 xfs_btree_setbuf(cur, lev, bp);
1528                 cur->bc_ptrs[lev] = 1;
1529         }
1530 out1:
1531         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1532         *stat = 1;
1533         return 0;
1534
1535 out0:
1536         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1537         *stat = 0;
1538         return 0;
1539
1540 error0:
1541         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1542         return error;
1543 }
1544
1545 /*
1546  * Decrement cursor by one record at the level.
1547  * For nonzero levels the leaf-ward information is untouched.
1548  */
1549 int                                             /* error */
1550 xfs_btree_decrement(
1551         struct xfs_btree_cur    *cur,
1552         int                     level,
1553         int                     *stat)          /* success/failure */
1554 {
1555         struct xfs_btree_block  *block;
1556         xfs_buf_t               *bp;
1557         int                     error;          /* error return value */
1558         int                     lev;
1559         union xfs_btree_ptr     ptr;
1560
1561         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1562         XFS_BTREE_TRACE_ARGI(cur, level);
1563
1564         ASSERT(level < cur->bc_nlevels);
1565
1566         /* Read-ahead to the left at this level. */
1567         xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1568
1569         /* We're done if we remain in the block after the decrement. */
1570         if (--cur->bc_ptrs[level] > 0)
1571                 goto out1;
1572
1573         /* Get a pointer to the btree block. */
1574         block = xfs_btree_get_block(cur, level, &bp);
1575
1576 #ifdef DEBUG
1577         error = xfs_btree_check_block(cur, block, level, bp);
1578         if (error)
1579                 goto error0;
1580 #endif
1581
1582         /* Fail if we just went off the left edge of the tree. */
1583         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1584         if (xfs_btree_ptr_is_null(cur, &ptr))
1585                 goto out0;
1586
1587         XFS_BTREE_STATS_INC(cur, decrement);
1588
1589         /*
1590          * March up the tree decrementing pointers.
1591          * Stop when we don't go off the left edge of a block.
1592          */
1593         for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1594                 if (--cur->bc_ptrs[lev] > 0)
1595                         break;
1596                 /* Read-ahead the left block for the next loop. */
1597                 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1598         }
1599
1600         /*
1601          * If we went off the root then we are seriously confused.
1602          * or the root of the tree is in an inode.
1603          */
1604         if (lev == cur->bc_nlevels) {
1605                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1606                         goto out0;
1607                 ASSERT(0);
1608                 error = EFSCORRUPTED;
1609                 goto error0;
1610         }
1611         ASSERT(lev < cur->bc_nlevels);
1612
1613         /*
1614          * Now walk back down the tree, fixing up the cursor's buffer
1615          * pointers and key numbers.
1616          */
1617         for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1618                 union xfs_btree_ptr     *ptrp;
1619
1620                 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1621                 error = xfs_btree_read_buf_block(cur, ptrp, --lev,
1622                                                         0, &block, &bp);
1623                 if (error)
1624                         goto error0;
1625                 xfs_btree_setbuf(cur, lev, bp);
1626                 cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1627         }
1628 out1:
1629         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1630         *stat = 1;
1631         return 0;
1632
1633 out0:
1634         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1635         *stat = 0;
1636         return 0;
1637
1638 error0:
1639         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1640         return error;
1641 }
1642
1643 STATIC int
1644 xfs_btree_lookup_get_block(
1645         struct xfs_btree_cur    *cur,   /* btree cursor */
1646         int                     level,  /* level in the btree */
1647         union xfs_btree_ptr     *pp,    /* ptr to btree block */
1648         struct xfs_btree_block  **blkp) /* return btree block */
1649 {
1650         struct xfs_buf          *bp;    /* buffer pointer for btree block */
1651         int                     error = 0;
1652
1653         /* special case the root block if in an inode */
1654         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1655             (level == cur->bc_nlevels - 1)) {
1656                 *blkp = xfs_btree_get_iroot(cur);
1657                 return 0;
1658         }
1659
1660         /*
1661          * If the old buffer at this level for the disk address we are
1662          * looking for re-use it.
1663          *
1664          * Otherwise throw it away and get a new one.
1665          */
1666         bp = cur->bc_bufs[level];
1667         if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
1668                 *blkp = XFS_BUF_TO_BLOCK(bp);
1669                 return 0;
1670         }
1671
1672         error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp);
1673         if (error)
1674                 return error;
1675
1676         xfs_btree_setbuf(cur, level, bp);
1677         return 0;
1678 }
1679
1680 /*
1681  * Get current search key.  For level 0 we don't actually have a key
1682  * structure so we make one up from the record.  For all other levels
1683  * we just return the right key.
1684  */
1685 STATIC union xfs_btree_key *
1686 xfs_lookup_get_search_key(
1687         struct xfs_btree_cur    *cur,
1688         int                     level,
1689         int                     keyno,
1690         struct xfs_btree_block  *block,
1691         union xfs_btree_key     *kp)
1692 {
1693         if (level == 0) {
1694                 cur->bc_ops->init_key_from_rec(kp,
1695                                 xfs_btree_rec_addr(cur, keyno, block));
1696                 return kp;
1697         }
1698
1699         return xfs_btree_key_addr(cur, keyno, block);
1700 }
1701
1702 /*
1703  * Lookup the record.  The cursor is made to point to it, based on dir.
1704  * stat is set to 0 if can't find any such record, 1 for success.
1705  */
1706 int                                     /* error */
1707 xfs_btree_lookup(
1708         struct xfs_btree_cur    *cur,   /* btree cursor */
1709         xfs_lookup_t            dir,    /* <=, ==, or >= */
1710         int                     *stat)  /* success/failure */
1711 {
1712         struct xfs_btree_block  *block; /* current btree block */
1713         __int64_t               diff;   /* difference for the current key */
1714         int                     error;  /* error return value */
1715         int                     keyno;  /* current key number */
1716         int                     level;  /* level in the btree */
1717         union xfs_btree_ptr     *pp;    /* ptr to btree block */
1718         union xfs_btree_ptr     ptr;    /* ptr to btree block */
1719
1720         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1721         XFS_BTREE_TRACE_ARGI(cur, dir);
1722
1723         XFS_BTREE_STATS_INC(cur, lookup);
1724
1725         block = NULL;
1726         keyno = 0;
1727
1728         /* initialise start pointer from cursor */
1729         cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1730         pp = &ptr;
1731
1732         /*
1733          * Iterate over each level in the btree, starting at the root.
1734          * For each level above the leaves, find the key we need, based
1735          * on the lookup record, then follow the corresponding block
1736          * pointer down to the next level.
1737          */
1738         for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1739                 /* Get the block we need to do the lookup on. */
1740                 error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1741                 if (error)
1742                         goto error0;
1743
1744                 if (diff == 0) {
1745                         /*
1746                          * If we already had a key match at a higher level, we
1747                          * know we need to use the first entry in this block.
1748                          */
1749                         keyno = 1;
1750                 } else {
1751                         /* Otherwise search this block. Do a binary search. */
1752
1753                         int     high;   /* high entry number */
1754                         int     low;    /* low entry number */
1755
1756                         /* Set low and high entry numbers, 1-based. */
1757                         low = 1;
1758                         high = xfs_btree_get_numrecs(block);
1759                         if (!high) {
1760                                 /* Block is empty, must be an empty leaf. */
1761                                 ASSERT(level == 0 && cur->bc_nlevels == 1);
1762
1763                                 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1764                                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1765                                 *stat = 0;
1766                                 return 0;
1767                         }
1768
1769                         /* Binary search the block. */
1770                         while (low <= high) {
1771                                 union xfs_btree_key     key;
1772                                 union xfs_btree_key     *kp;
1773
1774                                 XFS_BTREE_STATS_INC(cur, compare);
1775
1776                                 /* keyno is average of low and high. */
1777                                 keyno = (low + high) >> 1;
1778
1779                                 /* Get current search key */
1780                                 kp = xfs_lookup_get_search_key(cur, level,
1781                                                 keyno, block, &key);
1782
1783                                 /*
1784                                  * Compute difference to get next direction:
1785                                  *  - less than, move right
1786                                  *  - greater than, move left
1787                                  *  - equal, we're done
1788                                  */
1789                                 diff = cur->bc_ops->key_diff(cur, kp);
1790                                 if (diff < 0)
1791                                         low = keyno + 1;
1792                                 else if (diff > 0)
1793                                         high = keyno - 1;
1794                                 else
1795                                         break;
1796                         }
1797                 }
1798
1799                 /*
1800                  * If there are more levels, set up for the next level
1801                  * by getting the block number and filling in the cursor.
1802                  */
1803                 if (level > 0) {
1804                         /*
1805                          * If we moved left, need the previous key number,
1806                          * unless there isn't one.
1807                          */
1808                         if (diff > 0 && --keyno < 1)
1809                                 keyno = 1;
1810                         pp = xfs_btree_ptr_addr(cur, keyno, block);
1811
1812 #ifdef DEBUG
1813                         error = xfs_btree_check_ptr(cur, pp, 0, level);
1814                         if (error)
1815                                 goto error0;
1816 #endif
1817                         cur->bc_ptrs[level] = keyno;
1818                 }
1819         }
1820
1821         /* Done with the search. See if we need to adjust the results. */
1822         if (dir != XFS_LOOKUP_LE && diff < 0) {
1823                 keyno++;
1824                 /*
1825                  * If ge search and we went off the end of the block, but it's
1826                  * not the last block, we're in the wrong block.
1827                  */
1828                 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1829                 if (dir == XFS_LOOKUP_GE &&
1830                     keyno > xfs_btree_get_numrecs(block) &&
1831                     !xfs_btree_ptr_is_null(cur, &ptr)) {
1832                         int     i;
1833
1834                         cur->bc_ptrs[0] = keyno;
1835                         error = xfs_btree_increment(cur, 0, &i);
1836                         if (error)
1837                                 goto error0;
1838                         XFS_WANT_CORRUPTED_RETURN(i == 1);
1839                         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1840                         *stat = 1;
1841                         return 0;
1842                 }
1843         } else if (dir == XFS_LOOKUP_LE && diff > 0)
1844                 keyno--;
1845         cur->bc_ptrs[0] = keyno;
1846
1847         /* Return if we succeeded or not. */
1848         if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1849                 *stat = 0;
1850         else if (dir != XFS_LOOKUP_EQ || diff == 0)
1851                 *stat = 1;
1852         else
1853                 *stat = 0;
1854         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1855         return 0;
1856
1857 error0:
1858         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1859         return error;
1860 }
1861
1862 /*
1863  * Update keys at all levels from here to the root along the cursor's path.
1864  */
1865 STATIC int
1866 xfs_btree_updkey(
1867         struct xfs_btree_cur    *cur,
1868         union xfs_btree_key     *keyp,
1869         int                     level)
1870 {
1871         struct xfs_btree_block  *block;
1872         struct xfs_buf          *bp;
1873         union xfs_btree_key     *kp;
1874         int                     ptr;
1875
1876         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1877         XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
1878
1879         ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
1880
1881         /*
1882          * Go up the tree from this level toward the root.
1883          * At each level, update the key value to the value input.
1884          * Stop when we reach a level where the cursor isn't pointing
1885          * at the first entry in the block.
1886          */
1887         for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1888 #ifdef DEBUG
1889                 int             error;
1890 #endif
1891                 block = xfs_btree_get_block(cur, level, &bp);
1892 #ifdef DEBUG
1893                 error = xfs_btree_check_block(cur, block, level, bp);
1894                 if (error) {
1895                         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1896                         return error;
1897                 }
1898 #endif
1899                 ptr = cur->bc_ptrs[level];
1900                 kp = xfs_btree_key_addr(cur, ptr, block);
1901                 xfs_btree_copy_keys(cur, kp, keyp, 1);
1902                 xfs_btree_log_keys(cur, bp, ptr, ptr);
1903         }
1904
1905         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1906         return 0;
1907 }
1908
1909 /*
1910  * Update the record referred to by cur to the value in the
1911  * given record. This either works (return 0) or gets an
1912  * EFSCORRUPTED error.
1913  */
1914 int
1915 xfs_btree_update(
1916         struct xfs_btree_cur    *cur,
1917         union xfs_btree_rec     *rec)
1918 {
1919         struct xfs_btree_block  *block;
1920         struct xfs_buf          *bp;
1921         int                     error;
1922         int                     ptr;
1923         union xfs_btree_rec     *rp;
1924
1925         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1926         XFS_BTREE_TRACE_ARGR(cur, rec);
1927
1928         /* Pick up the current block. */
1929         block = xfs_btree_get_block(cur, 0, &bp);
1930
1931 #ifdef DEBUG
1932         error = xfs_btree_check_block(cur, block, 0, bp);
1933         if (error)
1934                 goto error0;
1935 #endif
1936         /* Get the address of the rec to be updated. */
1937         ptr = cur->bc_ptrs[0];
1938         rp = xfs_btree_rec_addr(cur, ptr, block);
1939
1940         /* Fill in the new contents and log them. */
1941         xfs_btree_copy_recs(cur, rp, rec, 1);
1942         xfs_btree_log_recs(cur, bp, ptr, ptr);
1943
1944         /*
1945          * If we are tracking the last record in the tree and
1946          * we are at the far right edge of the tree, update it.
1947          */
1948         if (xfs_btree_is_lastrec(cur, block, 0)) {
1949                 cur->bc_ops->update_lastrec(cur, block, rec,
1950                                             ptr, LASTREC_UPDATE);
1951         }
1952
1953         /* Updating first rec in leaf. Pass new key value up to our parent. */
1954         if (ptr == 1) {
1955                 union xfs_btree_key     key;
1956
1957                 cur->bc_ops->init_key_from_rec(&key, rec);
1958                 error = xfs_btree_updkey(cur, &key, 1);
1959                 if (error)
1960                         goto error0;
1961         }
1962
1963         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1964         return 0;
1965
1966 error0:
1967         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1968         return error;
1969 }
1970
1971 /*
1972  * Move 1 record left from cur/level if possible.
1973  * Update cur to reflect the new path.
1974  */
1975 STATIC int                                      /* error */
1976 xfs_btree_lshift(
1977         struct xfs_btree_cur    *cur,
1978         int                     level,
1979         int                     *stat)          /* success/failure */
1980 {
1981         union xfs_btree_key     key;            /* btree key */
1982         struct xfs_buf          *lbp;           /* left buffer pointer */
1983         struct xfs_btree_block  *left;          /* left btree block */
1984         int                     lrecs;          /* left record count */
1985         struct xfs_buf          *rbp;           /* right buffer pointer */
1986         struct xfs_btree_block  *right;         /* right btree block */
1987         int                     rrecs;          /* right record count */
1988         union xfs_btree_ptr     lptr;           /* left btree pointer */
1989         union xfs_btree_key     *rkp = NULL;    /* right btree key */
1990         union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
1991         union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
1992         int                     error;          /* error return value */
1993
1994         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1995         XFS_BTREE_TRACE_ARGI(cur, level);
1996
1997         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1998             level == cur->bc_nlevels - 1)
1999                 goto out0;
2000
2001         /* Set up variables for this block as "right". */
2002         right = xfs_btree_get_block(cur, level, &rbp);
2003
2004 #ifdef DEBUG
2005         error = xfs_btree_check_block(cur, right, level, rbp);
2006         if (error)
2007                 goto error0;
2008 #endif
2009
2010         /* If we've got no left sibling then we can't shift an entry left. */
2011         xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2012         if (xfs_btree_ptr_is_null(cur, &lptr))
2013                 goto out0;
2014
2015         /*
2016          * If the cursor entry is the one that would be moved, don't
2017          * do it... it's too complicated.
2018          */
2019         if (cur->bc_ptrs[level] <= 1)
2020                 goto out0;
2021
2022         /* Set up the left neighbor as "left". */
2023         error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp);
2024         if (error)
2025                 goto error0;
2026
2027         /* If it's full, it can't take another entry. */
2028         lrecs = xfs_btree_get_numrecs(left);
2029         if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2030                 goto out0;
2031
2032         rrecs = xfs_btree_get_numrecs(right);
2033
2034         /*
2035          * We add one entry to the left side and remove one for the right side.
2036          * Account for it here, the changes will be updated on disk and logged
2037          * later.
2038          */
2039         lrecs++;
2040         rrecs--;
2041
2042         XFS_BTREE_STATS_INC(cur, lshift);
2043         XFS_BTREE_STATS_ADD(cur, moves, 1);
2044
2045         /*
2046          * If non-leaf, copy a key and a ptr to the left block.
2047          * Log the changes to the left block.
2048          */
2049         if (level > 0) {
2050                 /* It's a non-leaf.  Move keys and pointers. */
2051                 union xfs_btree_key     *lkp;   /* left btree key */
2052                 union xfs_btree_ptr     *lpp;   /* left address pointer */
2053
2054                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2055                 rkp = xfs_btree_key_addr(cur, 1, right);
2056
2057                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2058                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2059 #ifdef DEBUG
2060                 error = xfs_btree_check_ptr(cur, rpp, 0, level);
2061                 if (error)
2062                         goto error0;
2063 #endif
2064                 xfs_btree_copy_keys(cur, lkp, rkp, 1);
2065                 xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2066
2067                 xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2068                 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2069
2070                 ASSERT(cur->bc_ops->keys_inorder(cur,
2071                         xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2072         } else {
2073                 /* It's a leaf.  Move records.  */
2074                 union xfs_btree_rec     *lrp;   /* left record pointer */
2075
2076                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2077                 rrp = xfs_btree_rec_addr(cur, 1, right);
2078
2079                 xfs_btree_copy_recs(cur, lrp, rrp, 1);
2080                 xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2081
2082                 ASSERT(cur->bc_ops->recs_inorder(cur,
2083                         xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2084         }
2085
2086         xfs_btree_set_numrecs(left, lrecs);
2087         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2088
2089         xfs_btree_set_numrecs(right, rrecs);
2090         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2091
2092         /*
2093          * Slide the contents of right down one entry.
2094          */
2095         XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2096         if (level > 0) {
2097                 /* It's a nonleaf. operate on keys and ptrs */
2098 #ifdef DEBUG
2099                 int                     i;              /* loop index */
2100
2101                 for (i = 0; i < rrecs; i++) {
2102                         error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
2103                         if (error)
2104                                 goto error0;
2105                 }
2106 #endif
2107                 xfs_btree_shift_keys(cur,
2108                                 xfs_btree_key_addr(cur, 2, right),
2109                                 -1, rrecs);
2110                 xfs_btree_shift_ptrs(cur,
2111                                 xfs_btree_ptr_addr(cur, 2, right),
2112                                 -1, rrecs);
2113
2114                 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2115                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2116         } else {
2117                 /* It's a leaf. operate on records */
2118                 xfs_btree_shift_recs(cur,
2119                         xfs_btree_rec_addr(cur, 2, right),
2120                         -1, rrecs);
2121                 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2122
2123                 /*
2124                  * If it's the first record in the block, we'll need a key
2125                  * structure to pass up to the next level (updkey).
2126                  */
2127                 cur->bc_ops->init_key_from_rec(&key,
2128                         xfs_btree_rec_addr(cur, 1, right));
2129                 rkp = &key;
2130         }
2131
2132         /* Update the parent key values of right. */
2133         error = xfs_btree_updkey(cur, rkp, level + 1);
2134         if (error)
2135                 goto error0;
2136
2137         /* Slide the cursor value left one. */
2138         cur->bc_ptrs[level]--;
2139
2140         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2141         *stat = 1;
2142         return 0;
2143
2144 out0:
2145         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2146         *stat = 0;
2147         return 0;
2148
2149 error0:
2150         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2151         return error;
2152 }
2153
2154 /*
2155  * Move 1 record right from cur/level if possible.
2156  * Update cur to reflect the new path.
2157  */
2158 STATIC int                                      /* error */
2159 xfs_btree_rshift(
2160         struct xfs_btree_cur    *cur,
2161         int                     level,
2162         int                     *stat)          /* success/failure */
2163 {
2164         union xfs_btree_key     key;            /* btree key */
2165         struct xfs_buf          *lbp;           /* left buffer pointer */
2166         struct xfs_btree_block  *left;          /* left btree block */
2167         struct xfs_buf          *rbp;           /* right buffer pointer */
2168         struct xfs_btree_block  *right;         /* right btree block */
2169         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2170         union xfs_btree_ptr     rptr;           /* right block pointer */
2171         union xfs_btree_key     *rkp;           /* right btree key */
2172         int                     rrecs;          /* right record count */
2173         int                     lrecs;          /* left record count */
2174         int                     error;          /* error return value */
2175         int                     i;              /* loop counter */
2176
2177         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2178         XFS_BTREE_TRACE_ARGI(cur, level);
2179
2180         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2181             (level == cur->bc_nlevels - 1))
2182                 goto out0;
2183
2184         /* Set up variables for this block as "left". */
2185         left = xfs_btree_get_block(cur, level, &lbp);
2186
2187 #ifdef DEBUG
2188         error = xfs_btree_check_block(cur, left, level, lbp);
2189         if (error)
2190                 goto error0;
2191 #endif
2192
2193         /* If we've got no right sibling then we can't shift an entry right. */
2194         xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2195         if (xfs_btree_ptr_is_null(cur, &rptr))
2196                 goto out0;
2197
2198         /*
2199          * If the cursor entry is the one that would be moved, don't
2200          * do it... it's too complicated.
2201          */
2202         lrecs = xfs_btree_get_numrecs(left);
2203         if (cur->bc_ptrs[level] >= lrecs)
2204                 goto out0;
2205
2206         /* Set up the right neighbor as "right". */
2207         error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp);
2208         if (error)
2209                 goto error0;
2210
2211         /* If it's full, it can't take another entry. */
2212         rrecs = xfs_btree_get_numrecs(right);
2213         if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2214                 goto out0;
2215
2216         XFS_BTREE_STATS_INC(cur, rshift);
2217         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2218
2219         /*
2220          * Make a hole at the start of the right neighbor block, then
2221          * copy the last left block entry to the hole.
2222          */
2223         if (level > 0) {
2224                 /* It's a nonleaf. make a hole in the keys and ptrs */
2225                 union xfs_btree_key     *lkp;
2226                 union xfs_btree_ptr     *lpp;
2227                 union xfs_btree_ptr     *rpp;
2228
2229                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2230                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2231                 rkp = xfs_btree_key_addr(cur, 1, right);
2232                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2233
2234 #ifdef DEBUG
2235                 for (i = rrecs - 1; i >= 0; i--) {
2236                         error = xfs_btree_check_ptr(cur, rpp, i, level);
2237                         if (error)
2238                                 goto error0;
2239                 }
2240 #endif
2241
2242                 xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2243                 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2244
2245 #ifdef DEBUG
2246                 error = xfs_btree_check_ptr(cur, lpp, 0, level);
2247                 if (error)
2248                         goto error0;
2249 #endif
2250
2251                 /* Now put the new data in, and log it. */
2252                 xfs_btree_copy_keys(cur, rkp, lkp, 1);
2253                 xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2254
2255                 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2256                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2257
2258                 ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2259                         xfs_btree_key_addr(cur, 2, right)));
2260         } else {
2261                 /* It's a leaf. make a hole in the records */
2262                 union xfs_btree_rec     *lrp;
2263                 union xfs_btree_rec     *rrp;
2264
2265                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2266                 rrp = xfs_btree_rec_addr(cur, 1, right);
2267
2268                 xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2269
2270                 /* Now put the new data in, and log it. */
2271                 xfs_btree_copy_recs(cur, rrp, lrp, 1);
2272                 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2273
2274                 cur->bc_ops->init_key_from_rec(&key, rrp);
2275                 rkp = &key;
2276
2277                 ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
2278                         xfs_btree_rec_addr(cur, 2, right)));
2279         }
2280
2281         /*
2282          * Decrement and log left's numrecs, bump and log right's numrecs.
2283          */
2284         xfs_btree_set_numrecs(left, --lrecs);
2285         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2286
2287         xfs_btree_set_numrecs(right, ++rrecs);
2288         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2289
2290         /*
2291          * Using a temporary cursor, update the parent key values of the
2292          * block on the right.
2293          */
2294         error = xfs_btree_dup_cursor(cur, &tcur);
2295         if (error)
2296                 goto error0;
2297         i = xfs_btree_lastrec(tcur, level);
2298         XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
2299
2300         error = xfs_btree_increment(tcur, level, &i);
2301         if (error)
2302                 goto error1;
2303
2304         error = xfs_btree_updkey(tcur, rkp, level + 1);
2305         if (error)
2306                 goto error1;
2307
2308         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2309
2310         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2311         *stat = 1;
2312         return 0;
2313
2314 out0:
2315         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2316         *stat = 0;
2317         return 0;
2318
2319 error0:
2320         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2321         return error;
2322
2323 error1:
2324         XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
2325         xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2326         return error;
2327 }
2328
2329 /*
2330  * Split cur/level block in half.
2331  * Return new block number and the key to its first
2332  * record (to be inserted into parent).
2333  */
2334 STATIC int                                      /* error */
2335 xfs_btree_split(
2336         struct xfs_btree_cur    *cur,
2337         int                     level,
2338         union xfs_btree_ptr     *ptrp,
2339         union xfs_btree_key     *key,
2340         struct xfs_btree_cur    **curp,
2341         int                     *stat)          /* success/failure */
2342 {
2343         union xfs_btree_ptr     lptr;           /* left sibling block ptr */
2344         struct xfs_buf          *lbp;           /* left buffer pointer */
2345         struct xfs_btree_block  *left;          /* left btree block */
2346         union xfs_btree_ptr     rptr;           /* right sibling block ptr */
2347         struct xfs_buf          *rbp;           /* right buffer pointer */
2348         struct xfs_btree_block  *right;         /* right btree block */
2349         union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
2350         struct xfs_buf          *rrbp;          /* right-right buffer pointer */
2351         struct xfs_btree_block  *rrblock;       /* right-right btree block */
2352         int                     lrecs;
2353         int                     rrecs;
2354         int                     src_index;
2355         int                     error;          /* error return value */
2356 #ifdef DEBUG
2357         int                     i;
2358 #endif
2359
2360         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2361         XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
2362
2363         XFS_BTREE_STATS_INC(cur, split);
2364
2365         /* Set up left block (current one). */
2366         left = xfs_btree_get_block(cur, level, &lbp);
2367
2368 #ifdef DEBUG
2369         error = xfs_btree_check_block(cur, left, level, lbp);
2370         if (error)
2371                 goto error0;
2372 #endif
2373
2374         xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2375
2376         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2377         error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat);
2378         if (error)
2379                 goto error0;
2380         if (*stat == 0)
2381                 goto out0;
2382         XFS_BTREE_STATS_INC(cur, alloc);
2383
2384         /* Set up the new block as "right". */
2385         error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
2386         if (error)
2387                 goto error0;
2388
2389         /* Fill in the btree header for the new right block. */
2390         xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2391
2392         /*
2393          * Split the entries between the old and the new block evenly.
2394          * Make sure that if there's an odd number of entries now, that
2395          * each new block will have the same number of entries.
2396          */
2397         lrecs = xfs_btree_get_numrecs(left);
2398         rrecs = lrecs / 2;
2399         if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2400                 rrecs++;
2401         src_index = (lrecs - rrecs + 1);
2402
2403         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2404
2405         /*
2406          * Copy btree block entries from the left block over to the
2407          * new block, the right. Update the right block and log the
2408          * changes.
2409          */
2410         if (level > 0) {
2411                 /* It's a non-leaf.  Move keys and pointers. */
2412                 union xfs_btree_key     *lkp;   /* left btree key */
2413                 union xfs_btree_ptr     *lpp;   /* left address pointer */
2414                 union xfs_btree_key     *rkp;   /* right btree key */
2415                 union xfs_btree_ptr     *rpp;   /* right address pointer */
2416
2417                 lkp = xfs_btree_key_addr(cur, src_index, left);
2418                 lpp = xfs_btree_ptr_addr(cur, src_index, left);
2419                 rkp = xfs_btree_key_addr(cur, 1, right);
2420                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2421
2422 #ifdef DEBUG
2423                 for (i = src_index; i < rrecs; i++) {
2424                         error = xfs_btree_check_ptr(cur, lpp, i, level);
2425                         if (error)
2426                                 goto error0;
2427                 }
2428 #endif
2429
2430                 xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2431                 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2432
2433                 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2434                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2435
2436                 /* Grab the keys to the entries moved to the right block */
2437                 xfs_btree_copy_keys(cur, key, rkp, 1);
2438         } else {
2439                 /* It's a leaf.  Move records.  */
2440                 union xfs_btree_rec     *lrp;   /* left record pointer */
2441                 union xfs_btree_rec     *rrp;   /* right record pointer */
2442
2443                 lrp = xfs_btree_rec_addr(cur, src_index, left);
2444                 rrp = xfs_btree_rec_addr(cur, 1, right);
2445
2446                 xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2447                 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2448
2449                 cur->bc_ops->init_key_from_rec(key,
2450                         xfs_btree_rec_addr(cur, 1, right));
2451         }
2452
2453
2454         /*
2455          * Find the left block number by looking in the buffer.
2456          * Adjust numrecs, sibling pointers.
2457          */
2458         xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2459         xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2460         xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2461         xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2462
2463         lrecs -= rrecs;
2464         xfs_btree_set_numrecs(left, lrecs);
2465         xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2466
2467         xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2468         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2469
2470         /*
2471          * If there's a block to the new block's right, make that block
2472          * point back to right instead of to left.
2473          */
2474         if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2475                 error = xfs_btree_read_buf_block(cur, &rrptr, level,
2476                                                         0, &rrblock, &rrbp);
2477                 if (error)
2478                         goto error0;
2479                 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2480                 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2481         }
2482         /*
2483          * If the cursor is really in the right block, move it there.
2484          * If it's just pointing past the last entry in left, then we'll
2485          * insert there, so don't change anything in that case.
2486          */
2487         if (cur->bc_ptrs[level] > lrecs + 1) {
2488                 xfs_btree_setbuf(cur, level, rbp);
2489                 cur->bc_ptrs[level] -= lrecs;
2490         }
2491         /*
2492          * If there are more levels, we'll need another cursor which refers
2493          * the right block, no matter where this cursor was.
2494          */
2495         if (level + 1 < cur->bc_nlevels) {
2496                 error = xfs_btree_dup_cursor(cur, curp);
2497                 if (error)
2498                         goto error0;
2499                 (*curp)->bc_ptrs[level + 1]++;
2500         }
2501         *ptrp = rptr;
2502         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2503         *stat = 1;
2504         return 0;
2505 out0:
2506         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2507         *stat = 0;
2508         return 0;
2509
2510 error0:
2511         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2512         return error;
2513 }
2514
2515 /*
2516  * Copy the old inode root contents into a real block and make the
2517  * broot point to it.
2518  */
2519 int                                             /* error */
2520 xfs_btree_new_iroot(
2521         struct xfs_btree_cur    *cur,           /* btree cursor */
2522         int                     *logflags,      /* logging flags for inode */
2523         int                     *stat)          /* return status - 0 fail */
2524 {
2525         struct xfs_buf          *cbp;           /* buffer for cblock */
2526         struct xfs_btree_block  *block;         /* btree block */
2527         struct xfs_btree_block  *cblock;        /* child btree block */
2528         union xfs_btree_key     *ckp;           /* child key pointer */
2529         union xfs_btree_ptr     *cpp;           /* child ptr pointer */
2530         union xfs_btree_key     *kp;            /* pointer to btree key */
2531         union xfs_btree_ptr     *pp;            /* pointer to block addr */
2532         union xfs_btree_ptr     nptr;           /* new block addr */
2533         int                     level;          /* btree level */
2534         int                     error;          /* error return code */
2535 #ifdef DEBUG
2536         int                     i;              /* loop counter */
2537 #endif
2538
2539         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2540         XFS_BTREE_STATS_INC(cur, newroot);
2541
2542         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2543
2544         level = cur->bc_nlevels - 1;
2545
2546         block = xfs_btree_get_iroot(cur);
2547         pp = xfs_btree_ptr_addr(cur, 1, block);
2548
2549         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2550         error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat);
2551         if (error)
2552                 goto error0;
2553         if (*stat == 0) {
2554                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2555                 return 0;
2556         }
2557         XFS_BTREE_STATS_INC(cur, alloc);
2558
2559         /* Copy the root into a real block. */
2560         error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
2561         if (error)
2562                 goto error0;
2563
2564         /*
2565          * we can't just memcpy() the root in for CRC enabled btree blocks.
2566          * In that case have to also ensure the blkno remains correct
2567          */
2568         memcpy(cblock, block, xfs_btree_block_len(cur));
2569         if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2570                 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2571                         cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2572                 else
2573                         cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2574         }
2575
2576         be16_add_cpu(&block->bb_level, 1);
2577         xfs_btree_set_numrecs(block, 1);
2578         cur->bc_nlevels++;
2579         cur->bc_ptrs[level + 1] = 1;
2580
2581         kp = xfs_btree_key_addr(cur, 1, block);
2582         ckp = xfs_btree_key_addr(cur, 1, cblock);
2583         xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2584
2585         cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2586 #ifdef DEBUG
2587         for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2588                 error = xfs_btree_check_ptr(cur, pp, i, level);
2589                 if (error)
2590                         goto error0;
2591         }
2592 #endif
2593         xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2594
2595 #ifdef DEBUG
2596         error = xfs_btree_check_ptr(cur, &nptr, 0, level);
2597         if (error)
2598                 goto error0;
2599 #endif
2600         xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2601
2602         xfs_iroot_realloc(cur->bc_private.b.ip,
2603                           1 - xfs_btree_get_numrecs(cblock),
2604                           cur->bc_private.b.whichfork);
2605
2606         xfs_btree_setbuf(cur, level, cbp);
2607
2608         /*
2609          * Do all this logging at the end so that
2610          * the root is at the right level.
2611          */
2612         xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
2613         xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2614         xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2615
2616         *logflags |=
2617                 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
2618         *stat = 1;
2619         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2620         return 0;
2621 error0:
2622         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2623         return error;
2624 }
2625
2626 /*
2627  * Allocate a new root block, fill it in.
2628  */
2629 STATIC int                              /* error */
2630 xfs_btree_new_root(
2631         struct xfs_btree_cur    *cur,   /* btree cursor */
2632         int                     *stat)  /* success/failure */
2633 {
2634         struct xfs_btree_block  *block; /* one half of the old root block */
2635         struct xfs_buf          *bp;    /* buffer containing block */
2636         int                     error;  /* error return value */
2637         struct xfs_buf          *lbp;   /* left buffer pointer */
2638         struct xfs_btree_block  *left;  /* left btree block */
2639         struct xfs_buf          *nbp;   /* new (root) buffer */
2640         struct xfs_btree_block  *new;   /* new (root) btree block */
2641         int                     nptr;   /* new value for key index, 1 or 2 */
2642         struct xfs_buf          *rbp;   /* right buffer pointer */
2643         struct xfs_btree_block  *right; /* right btree block */
2644         union xfs_btree_ptr     rptr;
2645         union xfs_btree_ptr     lptr;
2646
2647         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2648         XFS_BTREE_STATS_INC(cur, newroot);
2649
2650         /* initialise our start point from the cursor */
2651         cur->bc_ops->init_ptr_from_cur(cur, &rptr);
2652
2653         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2654         error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat);
2655         if (error)
2656                 goto error0;
2657         if (*stat == 0)
2658                 goto out0;
2659         XFS_BTREE_STATS_INC(cur, alloc);
2660
2661         /* Set up the new block. */
2662         error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
2663         if (error)
2664                 goto error0;
2665
2666         /* Set the root in the holding structure  increasing the level by 1. */
2667         cur->bc_ops->set_root(cur, &lptr, 1);
2668
2669         /*
2670          * At the previous root level there are now two blocks: the old root,
2671          * and the new block generated when it was split.  We don't know which
2672          * one the cursor is pointing at, so we set up variables "left" and
2673          * "right" for each case.
2674          */
2675         block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
2676
2677 #ifdef DEBUG
2678         error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
2679         if (error)
2680                 goto error0;
2681 #endif
2682
2683         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
2684         if (!xfs_btree_ptr_is_null(cur, &rptr)) {
2685                 /* Our block is left, pick up the right block. */
2686                 lbp = bp;
2687                 xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2688                 left = block;
2689                 error = xfs_btree_read_buf_block(cur, &rptr,
2690                                         cur->bc_nlevels - 1, 0, &right, &rbp);
2691                 if (error)
2692                         goto error0;
2693                 bp = rbp;
2694                 nptr = 1;
2695         } else {
2696                 /* Our block is right, pick up the left block. */
2697                 rbp = bp;
2698                 xfs_btree_buf_to_ptr(cur, rbp, &rptr);
2699                 right = block;
2700                 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2701                 error = xfs_btree_read_buf_block(cur, &lptr,
2702                                         cur->bc_nlevels - 1, 0, &left, &lbp);
2703                 if (error)
2704                         goto error0;
2705                 bp = lbp;
2706                 nptr = 2;
2707         }
2708         /* Fill in the new block's btree header and log it. */
2709         xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
2710         xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
2711         ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
2712                         !xfs_btree_ptr_is_null(cur, &rptr));
2713
2714         /* Fill in the key data in the new root. */
2715         if (xfs_btree_get_level(left) > 0) {
2716                 xfs_btree_copy_keys(cur,
2717                                 xfs_btree_key_addr(cur, 1, new),
2718                                 xfs_btree_key_addr(cur, 1, left), 1);
2719                 xfs_btree_copy_keys(cur,
2720                                 xfs_btree_key_addr(cur, 2, new),
2721                                 xfs_btree_key_addr(cur, 1, right), 1);
2722         } else {
2723                 cur->bc_ops->init_key_from_rec(
2724                                 xfs_btree_key_addr(cur, 1, new),
2725                                 xfs_btree_rec_addr(cur, 1, left));
2726                 cur->bc_ops->init_key_from_rec(
2727                                 xfs_btree_key_addr(cur, 2, new),
2728                                 xfs_btree_rec_addr(cur, 1, right));
2729         }
2730         xfs_btree_log_keys(cur, nbp, 1, 2);
2731
2732         /* Fill in the pointer data in the new root. */
2733         xfs_btree_copy_ptrs(cur,
2734                 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
2735         xfs_btree_copy_ptrs(cur,
2736                 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
2737         xfs_btree_log_ptrs(cur, nbp, 1, 2);
2738
2739         /* Fix up the cursor. */
2740         xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
2741         cur->bc_ptrs[cur->bc_nlevels] = nptr;
2742         cur->bc_nlevels++;
2743         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2744         *stat = 1;
2745         return 0;
2746 error0:
2747         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2748         return error;
2749 out0:
2750         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2751         *stat = 0;
2752         return 0;
2753 }
2754
2755 STATIC int
2756 xfs_btree_make_block_unfull(
2757         struct xfs_btree_cur    *cur,   /* btree cursor */
2758         int                     level,  /* btree level */
2759         int                     numrecs,/* # of recs in block */
2760         int                     *oindex,/* old tree index */
2761         int                     *index, /* new tree index */
2762         union xfs_btree_ptr     *nptr,  /* new btree ptr */
2763         struct xfs_btree_cur    **ncur, /* new btree cursor */
2764         union xfs_btree_rec     *nrec,  /* new record */
2765         int                     *stat)
2766 {
2767         union xfs_btree_key     key;    /* new btree key value */
2768         int                     error = 0;
2769
2770         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2771             level == cur->bc_nlevels - 1) {
2772                 struct xfs_inode *ip = cur->bc_private.b.ip;
2773
2774                 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
2775                         /* A root block that can be made bigger. */
2776                         xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
2777                 } else {
2778                         /* A root block that needs replacing */
2779                         int     logflags = 0;
2780
2781                         error = xfs_btree_new_iroot(cur, &logflags, stat);
2782                         if (error || *stat == 0)
2783                                 return error;
2784
2785                         xfs_trans_log_inode(cur->bc_tp, ip, logflags);
2786                 }
2787
2788                 return 0;
2789         }
2790
2791         /* First, try shifting an entry to the right neighbor. */
2792         error = xfs_btree_rshift(cur, level, stat);
2793         if (error || *stat)
2794                 return error;
2795
2796         /* Next, try shifting an entry to the left neighbor. */
2797         error = xfs_btree_lshift(cur, level, stat);
2798         if (error)
2799                 return error;
2800
2801         if (*stat) {
2802                 *oindex = *index = cur->bc_ptrs[level];
2803                 return 0;
2804         }
2805
2806         /*
2807          * Next, try splitting the current block in half.
2808          *
2809          * If this works we have to re-set our variables because we
2810          * could be in a different block now.
2811          */
2812         error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
2813         if (error || *stat == 0)
2814                 return error;
2815
2816
2817         *index = cur->bc_ptrs[level];
2818         cur->bc_ops->init_rec_from_key(&key, nrec);
2819         return 0;
2820 }
2821
2822 /*
2823  * Insert one record/level.  Return information to the caller
2824  * allowing the next level up to proceed if necessary.
2825  */
2826 STATIC int
2827 xfs_btree_insrec(
2828         struct xfs_btree_cur    *cur,   /* btree cursor */
2829         int                     level,  /* level to insert record at */
2830         union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
2831         union xfs_btree_rec     *recp,  /* i/o: record data inserted */
2832         struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
2833         int                     *stat)  /* success/failure */
2834 {
2835         struct xfs_btree_block  *block; /* btree block */
2836         struct xfs_buf          *bp;    /* buffer for block */
2837         union xfs_btree_key     key;    /* btree key */
2838         union xfs_btree_ptr     nptr;   /* new block ptr */
2839         struct xfs_btree_cur    *ncur;  /* new btree cursor */
2840         union xfs_btree_rec     nrec;   /* new record count */
2841         int                     optr;   /* old key/record index */
2842         int                     ptr;    /* key/record index */
2843         int                     numrecs;/* number of records */
2844         int                     error;  /* error return value */
2845 #ifdef DEBUG
2846         int                     i;
2847 #endif
2848
2849         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2850         XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
2851
2852         ncur = NULL;
2853
2854         /*
2855          * If we have an external root pointer, and we've made it to the
2856          * root level, allocate a new root block and we're done.
2857          */
2858         if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2859             (level >= cur->bc_nlevels)) {
2860                 error = xfs_btree_new_root(cur, stat);
2861                 xfs_btree_set_ptr_null(cur, ptrp);
2862
2863                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2864                 return error;
2865         }
2866
2867         /* If we're off the left edge, return failure. */
2868         ptr = cur->bc_ptrs[level];
2869         if (ptr == 0) {
2870                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2871                 *stat = 0;
2872                 return 0;
2873         }
2874
2875         /* Make a key out of the record data to be inserted, and save it. */
2876         cur->bc_ops->init_key_from_rec(&key, recp);
2877
2878         optr = ptr;
2879
2880         XFS_BTREE_STATS_INC(cur, insrec);
2881
2882         /* Get pointers to the btree buffer and block. */
2883         block = xfs_btree_get_block(cur, level, &bp);
2884         numrecs = xfs_btree_get_numrecs(block);
2885
2886 #ifdef DEBUG
2887         error = xfs_btree_check_block(cur, block, level, bp);
2888         if (error)
2889                 goto error0;
2890
2891         /* Check that the new entry is being inserted in the right place. */
2892         if (ptr <= numrecs) {
2893                 if (level == 0) {
2894                         ASSERT(cur->bc_ops->recs_inorder(cur, recp,
2895                                 xfs_btree_rec_addr(cur, ptr, block)));
2896                 } else {
2897                         ASSERT(cur->bc_ops->keys_inorder(cur, &key,
2898                                 xfs_btree_key_addr(cur, ptr, block)));
2899                 }
2900         }
2901 #endif
2902
2903         /*
2904          * If the block is full, we can't insert the new entry until we
2905          * make the block un-full.
2906          */
2907         xfs_btree_set_ptr_null(cur, &nptr);
2908         if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
2909                 error = xfs_btree_make_block_unfull(cur, level, numrecs,
2910                                         &optr, &ptr, &nptr, &ncur, &nrec, stat);
2911                 if (error || *stat == 0)
2912                         goto error0;
2913         }
2914
2915         /*
2916          * The current block may have changed if the block was
2917          * previously full and we have just made space in it.
2918          */
2919         block = xfs_btree_get_block(cur, level, &bp);
2920         numrecs = xfs_btree_get_numrecs(block);
2921
2922 #ifdef DEBUG
2923         error = xfs_btree_check_block(cur, block, level, bp);
2924         if (error)
2925                 return error;
2926 #endif
2927
2928         /*
2929          * At this point we know there's room for our new entry in the block
2930          * we're pointing at.
2931          */
2932         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
2933
2934         if (level > 0) {
2935                 /* It's a nonleaf. make a hole in the keys and ptrs */
2936                 union xfs_btree_key     *kp;
2937                 union xfs_btree_ptr     *pp;
2938
2939                 kp = xfs_btree_key_addr(cur, ptr, block);
2940                 pp = xfs_btree_ptr_addr(cur, ptr, block);
2941
2942 #ifdef DEBUG
2943                 for (i = numrecs - ptr; i >= 0; i--) {
2944                         error = xfs_btree_check_ptr(cur, pp, i, level);
2945                         if (error)
2946                                 return error;
2947                 }
2948 #endif
2949
2950                 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
2951                 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
2952
2953 #ifdef DEBUG
2954                 error = xfs_btree_check_ptr(cur, ptrp, 0, level);
2955                 if (error)
2956                         goto error0;
2957 #endif
2958
2959                 /* Now put the new data in, bump numrecs and log it. */
2960                 xfs_btree_copy_keys(cur, kp, &key, 1);
2961                 xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
2962                 numrecs++;
2963                 xfs_btree_set_numrecs(block, numrecs);
2964                 xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
2965                 xfs_btree_log_keys(cur, bp, ptr, numrecs);
2966 #ifdef DEBUG
2967                 if (ptr < numrecs) {
2968                         ASSERT(cur->bc_ops->keys_inorder(cur, kp,
2969                                 xfs_btree_key_addr(cur, ptr + 1, block)));
2970                 }
2971 #endif
2972         } else {
2973                 /* It's a leaf. make a hole in the records */
2974                 union xfs_btree_rec             *rp;
2975
2976                 rp = xfs_btree_rec_addr(cur, ptr, block);
2977
2978                 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
2979
2980                 /* Now put the new data in, bump numrecs and log it. */
2981                 xfs_btree_copy_recs(cur, rp, recp, 1);
2982                 xfs_btree_set_numrecs(block, ++numrecs);
2983                 xfs_btree_log_recs(cur, bp, ptr, numrecs);
2984 #ifdef DEBUG
2985                 if (ptr < numrecs) {
2986                         ASSERT(cur->bc_ops->recs_inorder(cur, rp,
2987                                 xfs_btree_rec_addr(cur, ptr + 1, block)));
2988                 }
2989 #endif
2990         }
2991
2992         /* Log the new number of records in the btree header. */
2993         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
2994
2995         /* If we inserted at the start of a block, update the parents' keys. */
2996         if (optr == 1) {
2997                 error = xfs_btree_updkey(cur, &key, level + 1);
2998                 if (error)
2999                         goto error0;
3000         }
3001
3002         /*
3003          * If we are tracking the last record in the tree and
3004          * we are at the far right edge of the tree, update it.
3005          */
3006         if (xfs_btree_is_lastrec(cur, block, level)) {
3007                 cur->bc_ops->update_lastrec(cur, block, recp,
3008                                             ptr, LASTREC_INSREC);
3009         }
3010
3011         /*
3012          * Return the new block number, if any.
3013          * If there is one, give back a record value and a cursor too.
3014          */
3015         *ptrp = nptr;
3016         if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3017                 *recp = nrec;
3018                 *curp = ncur;
3019         }
3020
3021         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3022         *stat = 1;
3023         return 0;
3024
3025 error0:
3026         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3027         return error;
3028 }
3029
3030 /*
3031  * Insert the record at the point referenced by cur.
3032  *
3033  * A multi-level split of the tree on insert will invalidate the original
3034  * cursor.  All callers of this function should assume that the cursor is
3035  * no longer valid and revalidate it.
3036  */
3037 int
3038 xfs_btree_insert(
3039         struct xfs_btree_cur    *cur,
3040         int                     *stat)
3041 {
3042         int                     error;  /* error return value */
3043         int                     i;      /* result value, 0 for failure */
3044         int                     level;  /* current level number in btree */
3045         union xfs_btree_ptr     nptr;   /* new block number (split result) */
3046         struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
3047         struct xfs_btree_cur    *pcur;  /* previous level's cursor */
3048         union xfs_btree_rec     rec;    /* record to insert */
3049
3050         level = 0;
3051         ncur = NULL;
3052         pcur = cur;
3053
3054         xfs_btree_set_ptr_null(cur, &nptr);
3055         cur->bc_ops->init_rec_from_cur(cur, &rec);
3056
3057         /*
3058          * Loop going up the tree, starting at the leaf level.
3059          * Stop when we don't get a split block, that must mean that
3060          * the insert is finished with this level.
3061          */
3062         do {
3063                 /*
3064                  * Insert nrec/nptr into this level of the tree.
3065                  * Note if we fail, nptr will be null.
3066                  */
3067                 error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
3068                 if (error) {
3069                         if (pcur != cur)
3070                                 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3071                         goto error0;
3072                 }
3073
3074                 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3075                 level++;
3076
3077                 /*
3078                  * See if the cursor we just used is trash.
3079                  * Can't trash the caller's cursor, but otherwise we should
3080                  * if ncur is a new cursor or we're about to be done.
3081                  */
3082                 if (pcur != cur &&
3083                     (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3084                         /* Save the state from the cursor before we trash it */
3085                         if (cur->bc_ops->update_cursor)
3086                                 cur->bc_ops->update_cursor(pcur, cur);
3087                         cur->bc_nlevels = pcur->bc_nlevels;
3088                         xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3089                 }
3090                 /* If we got a new cursor, switch to it. */
3091                 if (ncur) {
3092                         pcur = ncur;
3093                         ncur = NULL;
3094                 }
3095         } while (!xfs_btree_ptr_is_null(cur, &nptr));
3096
3097         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3098         *stat = i;
3099         return 0;
3100 error0:
3101         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3102         return error;
3103 }
3104
3105 /*
3106  * Try to merge a non-leaf block back into the inode root.
3107  *
3108  * Note: the killroot names comes from the fact that we're effectively
3109  * killing the old root block.  But because we can't just delete the
3110  * inode we have to copy the single block it was pointing to into the
3111  * inode.
3112  */
3113 STATIC int
3114 xfs_btree_kill_iroot(
3115         struct xfs_btree_cur    *cur)
3116 {
3117         int                     whichfork = cur->bc_private.b.whichfork;
3118         struct xfs_inode        *ip = cur->bc_private.b.ip;
3119         struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
3120         struct xfs_btree_block  *block;
3121         struct xfs_btree_block  *cblock;
3122         union xfs_btree_key     *kp;
3123         union xfs_btree_key     *ckp;
3124         union xfs_btree_ptr     *pp;
3125         union xfs_btree_ptr     *cpp;
3126         struct xfs_buf          *cbp;
3127         int                     level;
3128         int                     index;
3129         int                     numrecs;
3130 #ifdef DEBUG
3131         union xfs_btree_ptr     ptr;
3132         int                     i;
3133 #endif
3134
3135         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3136
3137         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3138         ASSERT(cur->bc_nlevels > 1);
3139
3140         /*
3141          * Don't deal with the root block needs to be a leaf case.
3142          * We're just going to turn the thing back into extents anyway.
3143          */
3144         level = cur->bc_nlevels - 1;
3145         if (level == 1)
3146                 goto out0;
3147
3148         /*
3149          * Give up if the root has multiple children.
3150          */
3151         block = xfs_btree_get_iroot(cur);
3152         if (xfs_btree_get_numrecs(block) != 1)
3153                 goto out0;
3154
3155         cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3156         numrecs = xfs_btree_get_numrecs(cblock);
3157
3158         /*
3159          * Only do this if the next level will fit.
3160          * Then the data must be copied up to the inode,
3161          * instead of freeing the root you free the next level.
3162          */
3163         if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3164                 goto out0;
3165
3166         XFS_BTREE_STATS_INC(cur, killroot);
3167
3168 #ifdef DEBUG
3169         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3170         ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3171         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3172         ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3173 #endif
3174
3175         index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3176         if (index) {
3177                 xfs_iroot_realloc(cur->bc_private.b.ip, index,
3178                                   cur->bc_private.b.whichfork);
3179                 block = ifp->if_broot;
3180         }
3181
3182         be16_add_cpu(&block->bb_numrecs, index);
3183         ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3184
3185         kp = xfs_btree_key_addr(cur, 1, block);
3186         ckp = xfs_btree_key_addr(cur, 1, cblock);
3187         xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3188
3189         pp = xfs_btree_ptr_addr(cur, 1, block);
3190         cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3191 #ifdef DEBUG
3192         for (i = 0; i < numrecs; i++) {
3193                 int             error;
3194
3195                 error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
3196                 if (error) {
3197                         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3198                         return error;
3199                 }
3200         }
3201 #endif
3202         xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3203
3204         cur->bc_ops->free_block(cur, cbp);
3205         XFS_BTREE_STATS_INC(cur, free);
3206
3207         cur->bc_bufs[level - 1] = NULL;
3208         be16_add_cpu(&block->bb_level, -1);
3209         xfs_trans_log_inode(cur->bc_tp, ip,
3210                 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
3211         cur->bc_nlevels--;
3212 out0:
3213         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3214         return 0;
3215 }
3216
3217 /*
3218  * Kill the current root node, and replace it with it's only child node.
3219  */
3220 STATIC int
3221 xfs_btree_kill_root(
3222         struct xfs_btree_cur    *cur,
3223         struct xfs_buf          *bp,
3224         int                     level,
3225         union xfs_btree_ptr     *newroot)
3226 {
3227         int                     error;
3228
3229         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3230         XFS_BTREE_STATS_INC(cur, killroot);
3231
3232         /*
3233          * Update the root pointer, decreasing the level by 1 and then
3234          * free the old root.
3235          */
3236         cur->bc_ops->set_root(cur, newroot, -1);
3237
3238         error = cur->bc_ops->free_block(cur, bp);
3239         if (error) {
3240                 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3241                 return error;
3242         }
3243
3244         XFS_BTREE_STATS_INC(cur, free);
3245
3246         cur->bc_bufs[level] = NULL;
3247         cur->bc_ra[level] = 0;
3248         cur->bc_nlevels--;
3249
3250         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3251         return 0;
3252 }
3253
3254 STATIC int
3255 xfs_btree_dec_cursor(
3256         struct xfs_btree_cur    *cur,
3257         int                     level,
3258         int                     *stat)
3259 {
3260         int                     error;
3261         int                     i;
3262
3263         if (level > 0) {
3264                 error = xfs_btree_decrement(cur, level, &i);
3265                 if (error)
3266                         return error;
3267         }
3268
3269         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3270         *stat = 1;
3271         return 0;
3272 }
3273
3274 /*
3275  * Single level of the btree record deletion routine.
3276  * Delete record pointed to by cur/level.
3277  * Remove the record from its block then rebalance the tree.
3278  * Return 0 for error, 1 for done, 2 to go on to the next level.
3279  */
3280 STATIC int                                      /* error */
3281 xfs_btree_delrec(
3282         struct xfs_btree_cur    *cur,           /* btree cursor */
3283         int                     level,          /* level removing record from */
3284         int                     *stat)          /* fail/done/go-on */
3285 {
3286         struct xfs_btree_block  *block;         /* btree block */
3287         union xfs_btree_ptr     cptr;           /* current block ptr */
3288         struct xfs_buf          *bp;            /* buffer for block */
3289         int                     error;          /* error return value */
3290         int                     i;              /* loop counter */
3291         union xfs_btree_key     key;            /* storage for keyp */
3292         union xfs_btree_key     *keyp = &key;   /* passed to the next level */
3293         union xfs_btree_ptr     lptr;           /* left sibling block ptr */
3294         struct xfs_buf          *lbp;           /* left buffer pointer */
3295         struct xfs_btree_block  *left;          /* left btree block */
3296         int                     lrecs = 0;      /* left record count */
3297         int                     ptr;            /* key/record index */
3298         union xfs_btree_ptr     rptr;           /* right sibling block ptr */
3299         struct xfs_buf          *rbp;           /* right buffer pointer */
3300         struct xfs_btree_block  *right;         /* right btree block */
3301         struct xfs_btree_block  *rrblock;       /* right-right btree block */
3302         struct xfs_buf          *rrbp;          /* right-right buffer pointer */
3303         int                     rrecs = 0;      /* right record count */
3304         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
3305         int                     numrecs;        /* temporary numrec count */
3306
3307         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3308         XFS_BTREE_TRACE_ARGI(cur, level);
3309
3310         tcur = NULL;
3311
3312         /* Get the index of the entry being deleted, check for nothing there. */
3313         ptr = cur->bc_ptrs[level];
3314         if (ptr == 0) {
3315                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3316                 *stat = 0;
3317                 return 0;
3318         }
3319
3320         /* Get the buffer & block containing the record or key/ptr. */
3321         block = xfs_btree_get_block(cur, level, &bp);
3322         numrecs = xfs_btree_get_numrecs(block);
3323
3324 #ifdef DEBUG
3325         error = xfs_btree_check_block(cur, block, level, bp);
3326         if (error)
3327                 goto error0;
3328 #endif
3329
3330         /* Fail if we're off the end of the block. */
3331         if (ptr > numrecs) {
3332                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3333                 *stat = 0;
3334                 return 0;
3335         }
3336
3337         XFS_BTREE_STATS_INC(cur, delrec);
3338         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3339
3340         /* Excise the entries being deleted. */
3341         if (level > 0) {
3342                 /* It's a nonleaf. operate on keys and ptrs */
3343                 union xfs_btree_key     *lkp;
3344                 union xfs_btree_ptr     *lpp;
3345
3346                 lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3347                 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3348
3349 #ifdef DEBUG
3350                 for (i = 0; i < numrecs - ptr; i++) {
3351                         error = xfs_btree_check_ptr(cur, lpp, i, level);
3352                         if (error)
3353                                 goto error0;
3354                 }
3355 #endif
3356
3357                 if (ptr < numrecs) {
3358                         xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3359                         xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3360                         xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3361                         xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3362                 }
3363
3364                 /*
3365                  * If it's the first record in the block, we'll need to pass a
3366                  * key up to the next level (updkey).
3367                  */
3368                 if (ptr == 1)
3369                         keyp = xfs_btree_key_addr(cur, 1, block);
3370         } else {
3371                 /* It's a leaf. operate on records */
3372                 if (ptr < numrecs) {
3373                         xfs_btree_shift_recs(cur,
3374                                 xfs_btree_rec_addr(cur, ptr + 1, block),
3375                                 -1, numrecs - ptr);
3376                         xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3377                 }
3378
3379                 /*
3380                  * If it's the first record in the block, we'll need a key
3381                  * structure to pass up to the next level (updkey).
3382                  */
3383                 if (ptr == 1) {
3384                         cur->bc_ops->init_key_from_rec(&key,
3385                                         xfs_btree_rec_addr(cur, 1, block));
3386                         keyp = &key;
3387                 }
3388         }
3389
3390         /*
3391          * Decrement and log the number of entries in the block.
3392          */
3393         xfs_btree_set_numrecs(block, --numrecs);
3394         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3395
3396         /*
3397          * If we are tracking the last record in the tree and
3398          * we are at the far right edge of the tree, update it.
3399          */
3400         if (xfs_btree_is_lastrec(cur, block, level)) {
3401                 cur->bc_ops->update_lastrec(cur, block, NULL,
3402                                             ptr, LASTREC_DELREC);
3403         }
3404
3405         /*
3406          * We're at the root level.  First, shrink the root block in-memory.
3407          * Try to get rid of the next level down.  If we can't then there's
3408          * nothing left to do.
3409          */
3410         if (level == cur->bc_nlevels - 1) {
3411                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3412                         xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3413                                           cur->bc_private.b.whichfork);
3414
3415                         error = xfs_btree_kill_iroot(cur);
3416                         if (error)
3417                                 goto error0;
3418
3419                         error = xfs_btree_dec_cursor(cur, level, stat);
3420                         if (error)
3421                                 goto error0;
3422                         *stat = 1;
3423                         return 0;
3424                 }
3425
3426                 /*
3427                  * If this is the root level, and there's only one entry left,
3428                  * and it's NOT the leaf level, then we can get rid of this
3429                  * level.
3430                  */
3431                 if (numrecs == 1 && level > 0) {
3432                         union xfs_btree_ptr     *pp;
3433                         /*
3434                          * pp is still set to the first pointer in the block.
3435                          * Make it the new root of the btree.
3436                          */
3437                         pp = xfs_btree_ptr_addr(cur, 1, block);
3438                         error = xfs_btree_kill_root(cur, bp, level, pp);
3439                         if (error)
3440                                 goto error0;
3441                 } else if (level > 0) {
3442                         error = xfs_btree_dec_cursor(cur, level, stat);
3443                         if (error)
3444                                 goto error0;
3445                 }
3446                 *stat = 1;
3447                 return 0;
3448         }
3449
3450         /*
3451          * If we deleted the leftmost entry in the block, update the
3452          * key values above us in the tree.
3453          */
3454         if (ptr == 1) {
3455                 error = xfs_btree_updkey(cur, keyp, level + 1);
3456                 if (error)
3457                         goto error0;
3458         }
3459
3460         /*
3461          * If the number of records remaining in the block is at least
3462          * the minimum, we're done.
3463          */
3464         if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3465                 error = xfs_btree_dec_cursor(cur, level, stat);
3466                 if (error)
3467                         goto error0;
3468                 return 0;
3469         }
3470
3471         /*
3472          * Otherwise, we have to move some records around to keep the
3473          * tree balanced.  Look at the left and right sibling blocks to
3474          * see if we can re-balance by moving only one record.
3475          */
3476         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3477         xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3478
3479         if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3480                 /*
3481                  * One child of root, need to get a chance to copy its contents
3482                  * into the root and delete it. Can't go up to next level,
3483                  * there's nothing to delete there.
3484                  */
3485                 if (xfs_btree_ptr_is_null(cur, &rptr) &&
3486                     xfs_btree_ptr_is_null(cur, &lptr) &&
3487                     level == cur->bc_nlevels - 2) {
3488                         error = xfs_btree_kill_iroot(cur);
3489                         if (!error)
3490                                 error = xfs_btree_dec_cursor(cur, level, stat);
3491                         if (error)
3492                                 goto error0;
3493                         return 0;
3494                 }
3495         }
3496
3497         ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3498                !xfs_btree_ptr_is_null(cur, &lptr));
3499
3500         /*
3501          * Duplicate the cursor so our btree manipulations here won't
3502          * disrupt the next level up.
3503          */
3504         error = xfs_btree_dup_cursor(cur, &tcur);
3505         if (error)
3506                 goto error0;
3507
3508         /*
3509          * If there's a right sibling, see if it's ok to shift an entry
3510          * out of it.
3511          */
3512         if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3513                 /*
3514                  * Move the temp cursor to the last entry in the next block.
3515                  * Actually any entry but the first would suffice.
3516                  */
3517                 i = xfs_btree_lastrec(tcur, level);
3518                 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3519
3520                 error = xfs_btree_increment(tcur, level, &i);
3521                 if (error)
3522                         goto error0;
3523                 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3524
3525                 i = xfs_btree_lastrec(tcur, level);
3526                 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3527
3528                 /* Grab a pointer to the block. */
3529                 right = xfs_btree_get_block(tcur, level, &rbp);
3530 #ifdef DEBUG
3531                 error = xfs_btree_check_block(tcur, right, level, rbp);
3532                 if (error)
3533                         goto error0;
3534 #endif
3535                 /* Grab the current block number, for future use. */
3536                 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3537
3538                 /*
3539                  * If right block is full enough so that removing one entry
3540                  * won't make it too empty, and left-shifting an entry out
3541                  * of right to us works, we're done.
3542                  */
3543                 if (xfs_btree_get_numrecs(right) - 1 >=
3544                     cur->bc_ops->get_minrecs(tcur, level)) {
3545                         error = xfs_btree_lshift(tcur, level, &i);
3546                         if (error)
3547                                 goto error0;
3548                         if (i) {
3549                                 ASSERT(xfs_btree_get_numrecs(block) >=
3550                                        cur->bc_ops->get_minrecs(tcur, level));
3551
3552                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3553                                 tcur = NULL;
3554
3555                                 error = xfs_btree_dec_cursor(cur, level, stat);
3556                                 if (error)
3557                                         goto error0;
3558                                 return 0;
3559                         }
3560                 }
3561
3562                 /*
3563                  * Otherwise, grab the number of records in right for
3564                  * future reference, and fix up the temp cursor to point
3565                  * to our block again (last record).
3566                  */
3567                 rrecs = xfs_btree_get_numrecs(right);
3568                 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3569                         i = xfs_btree_firstrec(tcur, level);
3570                         XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3571
3572                         error = xfs_btree_decrement(tcur, level, &i);
3573                         if (error)
3574                                 goto error0;
3575                         XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3576                 }
3577         }
3578
3579         /*
3580          * If there's a left sibling, see if it's ok to shift an entry
3581          * out of it.
3582          */
3583         if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3584                 /*
3585                  * Move the temp cursor to the first entry in the
3586                  * previous block.
3587                  */
3588                 i = xfs_btree_firstrec(tcur, level);
3589                 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3590
3591                 error = xfs_btree_decrement(tcur, level, &i);
3592                 if (error)
3593                         goto error0;
3594                 i = xfs_btree_firstrec(tcur, level);
3595                 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3596
3597                 /* Grab a pointer to the block. */
3598                 left = xfs_btree_get_block(tcur, level, &lbp);
3599 #ifdef DEBUG
3600                 error = xfs_btree_check_block(cur, left, level, lbp);
3601                 if (error)
3602                         goto error0;
3603 #endif
3604                 /* Grab the current block number, for future use. */
3605                 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3606
3607                 /*
3608                  * If left block is full enough so that removing one entry
3609                  * won't make it too empty, and right-shifting an entry out
3610                  * of left to us works, we're done.
3611                  */
3612                 if (xfs_btree_get_numrecs(left) - 1 >=
3613                     cur->bc_ops->get_minrecs(tcur, level)) {
3614                         error = xfs_btree_rshift(tcur, level, &i);
3615                         if (error)
3616                                 goto error0;
3617                         if (i) {
3618                                 ASSERT(xfs_btree_get_numrecs(block) >=
3619                                        cur->bc_ops->get_minrecs(tcur, level));
3620                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3621                                 tcur = NULL;
3622                                 if (level == 0)
3623                                         cur->bc_ptrs[0]++;
3624                                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3625                                 *stat = 1;
3626                                 return 0;
3627                         }
3628                 }
3629
3630                 /*
3631                  * Otherwise, grab the number of records in right for
3632                  * future reference.
3633                  */
3634                 lrecs = xfs_btree_get_numrecs(left);
3635         }
3636
3637         /* Delete the temp cursor, we're done with it. */
3638         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3639         tcur = NULL;
3640
3641         /* If here, we need to do a join to keep the tree balanced. */
3642         ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3643
3644         if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3645             lrecs + xfs_btree_get_numrecs(block) <=
3646                         cur->bc_ops->get_maxrecs(cur, level)) {
3647                 /*
3648                  * Set "right" to be the starting block,
3649                  * "left" to be the left neighbor.
3650                  */
3651                 rptr = cptr;
3652                 right = block;
3653                 rbp = bp;
3654                 error = xfs_btree_read_buf_block(cur, &lptr, level,
3655                                                         0, &left, &lbp);
3656                 if (error)
3657                         goto error0;
3658
3659         /*
3660          * If that won't work, see if we can join with the right neighbor block.
3661          */
3662         } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
3663                    rrecs + xfs_btree_get_numrecs(block) <=
3664                         cur->bc_ops->get_maxrecs(cur, level)) {
3665                 /*
3666                  * Set "left" to be the starting block,
3667                  * "right" to be the right neighbor.
3668                  */
3669                 lptr = cptr;
3670                 left = block;
3671                 lbp = bp;
3672                 error = xfs_btree_read_buf_block(cur, &rptr, level,
3673                                                         0, &right, &rbp);
3674                 if (error)
3675                         goto error0;
3676
3677         /*
3678          * Otherwise, we can't fix the imbalance.
3679          * Just return.  This is probably a logic error, but it's not fatal.
3680          */
3681         } else {
3682                 error = xfs_btree_dec_cursor(cur, level, stat);
3683                 if (error)
3684                         goto error0;
3685                 return 0;
3686         }
3687
3688         rrecs = xfs_btree_get_numrecs(right);
3689         lrecs = xfs_btree_get_numrecs(left);
3690
3691         /*
3692          * We're now going to join "left" and "right" by moving all the stuff
3693          * in "right" to "left" and deleting "right".
3694          */
3695         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
3696         if (level > 0) {
3697                 /* It's a non-leaf.  Move keys and pointers. */
3698                 union xfs_btree_key     *lkp;   /* left btree key */
3699                 union xfs_btree_ptr     *lpp;   /* left address pointer */
3700                 union xfs_btree_key     *rkp;   /* right btree key */
3701                 union xfs_btree_ptr     *rpp;   /* right address pointer */
3702
3703                 lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
3704                 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
3705                 rkp = xfs_btree_key_addr(cur, 1, right);
3706                 rpp = xfs_btree_ptr_addr(cur, 1, right);
3707 #ifdef DEBUG
3708                 for (i = 1; i < rrecs; i++) {
3709                         error = xfs_btree_check_ptr(cur, rpp, i, level);
3710                         if (error)
3711                                 goto error0;
3712                 }
3713 #endif
3714                 xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
3715                 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
3716
3717                 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
3718                 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
3719         } else {
3720                 /* It's a leaf.  Move records.  */
3721                 union xfs_btree_rec     *lrp;   /* left record pointer */
3722                 union xfs_btree_rec     *rrp;   /* right record pointer */
3723
3724                 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
3725                 rrp = xfs_btree_rec_addr(cur, 1, right);
3726
3727                 xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
3728                 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
3729         }
3730
3731         XFS_BTREE_STATS_INC(cur, join);
3732
3733         /*
3734          * Fix up the number of records and right block pointer in the
3735          * surviving block, and log it.
3736          */
3737         xfs_btree_set_numrecs(left, lrecs + rrecs);
3738         xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
3739         xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3740         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
3741
3742         /* If there is a right sibling, point it to the remaining block. */
3743         xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3744         if (!xfs_btree_ptr_is_null(cur, &cptr)) {
3745                 error = xfs_btree_read_buf_block(cur, &cptr, level,
3746                                                         0, &rrblock, &rrbp);
3747                 if (error)
3748                         goto error0;
3749                 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
3750                 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
3751         }
3752
3753         /* Free the deleted block. */
3754         error = cur->bc_ops->free_block(cur, rbp);
3755         if (error)
3756                 goto error0;
3757         XFS_BTREE_STATS_INC(cur, free);
3758
3759         /*
3760          * If we joined with the left neighbor, set the buffer in the
3761          * cursor to the left block, and fix up the index.
3762          */
3763         if (bp != lbp) {
3764                 cur->bc_bufs[level] = lbp;
3765                 cur->bc_ptrs[level] += lrecs;
3766                 cur->bc_ra[level] = 0;
3767         }
3768         /*
3769          * If we joined with the right neighbor and there's a level above
3770          * us, increment the cursor at that level.
3771          */
3772         else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
3773                    (level + 1 < cur->bc_nlevels)) {
3774                 error = xfs_btree_increment(cur, level + 1, &i);
3775                 if (error)
3776                         goto error0;
3777         }
3778
3779         /*
3780          * Readjust the ptr at this level if it's not a leaf, since it's
3781          * still pointing at the deletion point, which makes the cursor
3782          * inconsistent.  If this makes the ptr 0, the caller fixes it up.
3783          * We can't use decrement because it would change the next level up.
3784          */
3785         if (level > 0)
3786                 cur->bc_ptrs[level]--;
3787
3788         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3789         /* Return value means the next level up has something to do. */
3790         *stat = 2;
3791         return 0;
3792
3793 error0:
3794         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3795         if (tcur)
3796                 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
3797         return error;
3798 }
3799
3800 /*
3801  * Delete the record pointed to by cur.
3802  * The cursor refers to the place where the record was (could be inserted)
3803  * when the operation returns.
3804  */
3805 int                                     /* error */
3806 xfs_btree_delete(
3807         struct xfs_btree_cur    *cur,
3808         int                     *stat)  /* success/failure */
3809 {
3810         int                     error;  /* error return value */
3811         int                     level;
3812         int                     i;
3813
3814         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3815
3816         /*
3817          * Go up the tree, starting at leaf level.
3818          *
3819          * If 2 is returned then a join was done; go to the next level.
3820          * Otherwise we are done.
3821          */
3822         for (level = 0, i = 2; i == 2; level++) {
3823                 error = xfs_btree_delrec(cur, level, &i);
3824                 if (error)
3825                         goto error0;
3826         }
3827
3828         if (i == 0) {
3829                 for (level = 1; level < cur->bc_nlevels; level++) {
3830                         if (cur->bc_ptrs[level] == 0) {
3831                                 error = xfs_btree_decrement(cur, level, &i);
3832                                 if (error)
3833                                         goto error0;
3834                                 break;
3835                         }
3836                 }
3837         }
3838
3839         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3840         *stat = i;
3841         return 0;
3842 error0:
3843         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3844         return error;
3845 }
3846
3847 /*
3848  * Get the data from the pointed-to record.
3849  */
3850 int                                     /* error */
3851 xfs_btree_get_rec(
3852         struct xfs_btree_cur    *cur,   /* btree cursor */
3853         union xfs_btree_rec     **recp, /* output: btree record */
3854         int                     *stat)  /* output: success/failure */
3855 {
3856         struct xfs_btree_block  *block; /* btree block */
3857         struct xfs_buf          *bp;    /* buffer pointer */
3858         int                     ptr;    /* record number */
3859 #ifdef DEBUG
3860         int                     error;  /* error return value */
3861 #endif
3862
3863         ptr = cur->bc_ptrs[0];
3864         block = xfs_btree_get_block(cur, 0, &bp);
3865
3866 #ifdef DEBUG
3867         error = xfs_btree_check_block(cur, block, 0, bp);
3868         if (error)
3869                 return error;
3870 #endif
3871
3872         /*
3873          * Off the right end or left end, return failure.
3874          */
3875         if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
3876                 *stat = 0;
3877                 return 0;
3878         }
3879
3880         /*
3881          * Point to the record and extract its data.
3882          */
3883         *recp = xfs_btree_rec_addr(cur, ptr, block);
3884         *stat = 1;
3885         return 0;
3886 }
3887
3888 /*
3889  * Change the owner of a btree.
3890  *
3891  * The mechanism we use here is ordered buffer logging. Because we don't know
3892  * how many buffers were are going to need to modify, we don't really want to
3893  * have to make transaction reservations for the worst case of every buffer in a
3894  * full size btree as that may be more space that we can fit in the log....
3895  *
3896  * We do the btree walk in the most optimal manner possible - we have sibling
3897  * pointers so we can just walk all the blocks on each level from left to right
3898  * in a single pass, and then move to the next level and do the same. We can
3899  * also do readahead on the sibling pointers to get IO moving more quickly,
3900  * though for slow disks this is unlikely to make much difference to performance
3901  * as the amount of CPU work we have to do before moving to the next block is
3902  * relatively small.
3903  *
3904  * For each btree block that we load, modify the owner appropriately, set the
3905  * buffer as an ordered buffer and log it appropriately. We need to ensure that
3906  * we mark the region we change dirty so that if the buffer is relogged in
3907  * a subsequent transaction the changes we make here as an ordered buffer are
3908  * correctly relogged in that transaction.  If we are in recovery context, then
3909  * just queue the modified buffer as delayed write buffer so the transaction
3910  * recovery completion writes the changes to disk.
3911  */
3912 static int
3913 xfs_btree_block_change_owner(
3914         struct xfs_btree_cur    *cur,
3915         int                     level,
3916         __uint64_t              new_owner,
3917         struct list_head        *buffer_list)
3918 {
3919         struct xfs_btree_block  *block;
3920         struct xfs_buf          *bp;
3921         union xfs_btree_ptr     rptr;
3922
3923         /* do right sibling readahead */
3924         xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
3925
3926         /* modify the owner */
3927         block = xfs_btree_get_block(cur, level, &bp);
3928         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
3929                 block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
3930         else
3931                 block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
3932
3933         /*
3934          * If the block is a root block hosted in an inode, we might not have a
3935          * buffer pointer here and we shouldn't attempt to log the change as the
3936          * information is already held in the inode and discarded when the root
3937          * block is formatted into the on-disk inode fork. We still change it,
3938          * though, so everything is consistent in memory.
3939          */
3940         if (bp) {
3941                 if (cur->bc_tp) {
3942                         xfs_trans_ordered_buf(cur->bc_tp, bp);
3943                         xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
3944                 } else {
3945                         xfs_buf_delwri_queue(bp, buffer_list);
3946                 }
3947         } else {
3948                 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3949                 ASSERT(level == cur->bc_nlevels - 1);
3950         }
3951
3952         /* now read rh sibling block for next iteration */
3953         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3954         if (xfs_btree_ptr_is_null(cur, &rptr))
3955                 return ENOENT;
3956
3957         return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
3958 }
3959
3960 int
3961 xfs_btree_change_owner(
3962         struct xfs_btree_cur    *cur,
3963         __uint64_t              new_owner,
3964         struct list_head        *buffer_list)
3965 {
3966         union xfs_btree_ptr     lptr;
3967         int                     level;
3968         struct xfs_btree_block  *block = NULL;
3969         int                     error = 0;
3970
3971         cur->bc_ops->init_ptr_from_cur(cur, &lptr);
3972
3973         /* for each level */
3974         for (level = cur->bc_nlevels - 1; level >= 0; level--) {
3975                 /* grab the left hand block */
3976                 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
3977                 if (error)
3978                         return error;
3979
3980                 /* readahead the left most block for the next level down */
3981                 if (level > 0) {
3982                         union xfs_btree_ptr     *ptr;
3983
3984                         ptr = xfs_btree_ptr_addr(cur, 1, block);
3985                         xfs_btree_readahead_ptr(cur, ptr, 1);
3986
3987                         /* save for the next iteration of the loop */
3988                         lptr = *ptr;
3989                 }
3990
3991                 /* for each buffer in the level */
3992                 do {
3993                         error = xfs_btree_block_change_owner(cur, level,
3994                                                              new_owner,
3995                                                              buffer_list);
3996                 } while (!error);
3997
3998                 if (error != ENOENT)
3999                         return error;
4000         }
4001
4002         return 0;
4003 }