]> Pileus Git - ~andy/linux/blob - fs/ocfs2/suballoc.c
Linux 3.14
[~andy/linux] / fs / ocfs2 / suballoc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 02111-1307, USA.
25  */
26
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31
32 #include <cluster/masklog.h>
33
34 #include "ocfs2.h"
35
36 #include "alloc.h"
37 #include "blockcheck.h"
38 #include "dlmglue.h"
39 #include "inode.h"
40 #include "journal.h"
41 #include "localalloc.h"
42 #include "suballoc.h"
43 #include "super.h"
44 #include "sysfile.h"
45 #include "uptodate.h"
46 #include "ocfs2_trace.h"
47
48 #include "buffer_head_io.h"
49
/*
 * Group-allocation flag values.
 * NOTE(review): presumably these are the values carried by the "flags"
 * arguments of the reservation/allocation helpers in this file - confirm
 * against the callers.
 */
#define NOT_ALLOC_NEW_GROUP             0
#define ALLOC_NEW_GROUP                 0x1
#define ALLOC_GROUPS_FROM_GLOBAL        0x2

/* NOTE(review): looks like a cap used by the stealing code - confirm. */
#define OCFS2_MAX_TO_STEAL              1024
55
/*
 * Describes where a run of bits was claimed.  It is the "res" argument of
 * the group search functions and of ocfs2_claim_suballoc_bits(), and
 * ocfs2_group_from_res() derives the owning group block from it.
 */
struct ocfs2_suballoc_result {
        u64             sr_bg_blkno;    /* The bg we allocated from.  Set
                                           to 0 when a block group is
                                           contiguous. */
        u64             sr_bg_stable_blkno; /*
                                             * Doesn't change, always
                                             * set to target block
                                             * group descriptor
                                             * block.
                                             */
        u64             sr_blkno;       /* The first allocated block */
        unsigned int    sr_bit_offset;  /* The bit in the bg */
        unsigned int    sr_bits;        /* How many bits we claimed */
};
70
71 static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
72 {
73         if (res->sr_blkno == 0)
74                 return 0;
75
76         if (res->sr_bg_blkno)
77                 return res->sr_bg_blkno;
78
79         return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
80 }
81
82 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
83 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
84 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
85 static int ocfs2_block_group_fill(handle_t *handle,
86                                   struct inode *alloc_inode,
87                                   struct buffer_head *bg_bh,
88                                   u64 group_blkno,
89                                   unsigned int group_clusters,
90                                   u16 my_chain,
91                                   struct ocfs2_chain_list *cl);
92 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
93                                    struct inode *alloc_inode,
94                                    struct buffer_head *bh,
95                                    u64 max_block,
96                                    u64 *last_alloc_group,
97                                    int flags);
98
99 static int ocfs2_cluster_group_search(struct inode *inode,
100                                       struct buffer_head *group_bh,
101                                       u32 bits_wanted, u32 min_bits,
102                                       u64 max_block,
103                                       struct ocfs2_suballoc_result *res);
104 static int ocfs2_block_group_search(struct inode *inode,
105                                     struct buffer_head *group_bh,
106                                     u32 bits_wanted, u32 min_bits,
107                                     u64 max_block,
108                                     struct ocfs2_suballoc_result *res);
109 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
110                                      handle_t *handle,
111                                      u32 bits_wanted,
112                                      u32 min_bits,
113                                      struct ocfs2_suballoc_result *res);
114 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
115                                          int nr);
116 static int ocfs2_relink_block_group(handle_t *handle,
117                                     struct inode *alloc_inode,
118                                     struct buffer_head *fe_bh,
119                                     struct buffer_head *bg_bh,
120                                     struct buffer_head *prev_bg_bh,
121                                     u16 chain);
122 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
123                                                      u32 wanted);
124 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
125                                                    u64 bg_blkno,
126                                                    u16 bg_bit_off);
127 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
128                                                 u64 data_blkno,
129                                                 u64 *bg_blkno,
130                                                 u16 *bg_bit_off);
131 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
132                                              u32 bits_wanted, u64 max_block,
133                                              int flags,
134                                              struct ocfs2_alloc_context **ac);
135
136 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
137 {
138         struct inode *inode = ac->ac_inode;
139
140         if (inode) {
141                 if (ac->ac_which != OCFS2_AC_USE_LOCAL)
142                         ocfs2_inode_unlock(inode, 1);
143
144                 mutex_unlock(&inode->i_mutex);
145
146                 iput(inode);
147                 ac->ac_inode = NULL;
148         }
149         brelse(ac->ac_bh);
150         ac->ac_bh = NULL;
151         ac->ac_resv = NULL;
152         if (ac->ac_find_loc_priv) {
153                 kfree(ac->ac_find_loc_priv);
154                 ac->ac_find_loc_priv = NULL;
155         }
156 }
157
/*
 * Tear down an allocation context completely: release the resources it
 * holds via ocfs2_free_ac_resource(), then free the context itself.
 * The pointer must not be used afterwards.
 */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
        ocfs2_free_ac_resource(ac);
        kfree(ac);
}
163
164 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
165 {
166         return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
167 }
168
/*
 * Report a group descriptor validation failure.
 *
 * The macro picks up two names from the enclosing function: "resize" and
 * "sb".  When resize is non-zero the message is only logged through
 * mlog(ML_ERROR) with a newline appended; otherwise it is passed to
 * ocfs2_error() on sb.
 */
#define do_error(fmt, ...)                                              \
        do{                                                             \
                if (resize)                                     \
                        mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);        \
                else                                                    \
                        ocfs2_error(sb, fmt, ##__VA_ARGS__);            \
        } while (0)
176
177 static int ocfs2_validate_gd_self(struct super_block *sb,
178                                   struct buffer_head *bh,
179                                   int resize)
180 {
181         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
182
183         if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
184                 do_error("Group descriptor #%llu has bad signature %.*s",
185                          (unsigned long long)bh->b_blocknr, 7,
186                          gd->bg_signature);
187                 return -EINVAL;
188         }
189
190         if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
191                 do_error("Group descriptor #%llu has an invalid bg_blkno "
192                          "of %llu",
193                          (unsigned long long)bh->b_blocknr,
194                          (unsigned long long)le64_to_cpu(gd->bg_blkno));
195                 return -EINVAL;
196         }
197
198         if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
199                 do_error("Group descriptor #%llu has an invalid "
200                          "fs_generation of #%u",
201                          (unsigned long long)bh->b_blocknr,
202                          le32_to_cpu(gd->bg_generation));
203                 return -EINVAL;
204         }
205
206         if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
207                 do_error("Group descriptor #%llu has bit count %u but "
208                          "claims that %u are free",
209                          (unsigned long long)bh->b_blocknr,
210                          le16_to_cpu(gd->bg_bits),
211                          le16_to_cpu(gd->bg_free_bits_count));
212                 return -EINVAL;
213         }
214
215         if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
216                 do_error("Group descriptor #%llu has bit count %u but "
217                          "max bitmap bits of %u",
218                          (unsigned long long)bh->b_blocknr,
219                          le16_to_cpu(gd->bg_bits),
220                          8 * le16_to_cpu(gd->bg_size));
221                 return -EINVAL;
222         }
223
224         return 0;
225 }
226
227 static int ocfs2_validate_gd_parent(struct super_block *sb,
228                                     struct ocfs2_dinode *di,
229                                     struct buffer_head *bh,
230                                     int resize)
231 {
232         unsigned int max_bits;
233         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
234
235         if (di->i_blkno != gd->bg_parent_dinode) {
236                 do_error("Group descriptor #%llu has bad parent "
237                          "pointer (%llu, expected %llu)",
238                          (unsigned long long)bh->b_blocknr,
239                          (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
240                          (unsigned long long)le64_to_cpu(di->i_blkno));
241                 return -EINVAL;
242         }
243
244         max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
245         if (le16_to_cpu(gd->bg_bits) > max_bits) {
246                 do_error("Group descriptor #%llu has bit count of %u",
247                          (unsigned long long)bh->b_blocknr,
248                          le16_to_cpu(gd->bg_bits));
249                 return -EINVAL;
250         }
251
252         /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
253         if ((le16_to_cpu(gd->bg_chain) >
254              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
255             ((le16_to_cpu(gd->bg_chain) ==
256              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
257                 do_error("Group descriptor #%llu has bad chain %u",
258                          (unsigned long long)bh->b_blocknr,
259                          le16_to_cpu(gd->bg_chain));
260                 return -EINVAL;
261         }
262
263         return 0;
264 }
265
266 #undef do_error
267
268 /*
269  * This version only prints errors.  It does not fail the filesystem, and
270  * exists only for resize.
271  */
272 int ocfs2_check_group_descriptor(struct super_block *sb,
273                                  struct ocfs2_dinode *di,
274                                  struct buffer_head *bh)
275 {
276         int rc;
277         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
278
279         BUG_ON(!buffer_uptodate(bh));
280
281         /*
282          * If the ecc fails, we return the error but otherwise
283          * leave the filesystem running.  We know any error is
284          * local to this block.
285          */
286         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
287         if (rc) {
288                 mlog(ML_ERROR,
289                      "Checksum failed for group descriptor %llu\n",
290                      (unsigned long long)bh->b_blocknr);
291         } else
292                 rc = ocfs2_validate_gd_self(sb, bh, 1);
293         if (!rc)
294                 rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
295
296         return rc;
297 }
298
299 static int ocfs2_validate_group_descriptor(struct super_block *sb,
300                                            struct buffer_head *bh)
301 {
302         int rc;
303         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
304
305         trace_ocfs2_validate_group_descriptor(
306                                         (unsigned long long)bh->b_blocknr);
307
308         BUG_ON(!buffer_uptodate(bh));
309
310         /*
311          * If the ecc fails, we return the error but otherwise
312          * leave the filesystem running.  We know any error is
313          * local to this block.
314          */
315         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
316         if (rc)
317                 return rc;
318
319         /*
320          * Errors after here are fatal.
321          */
322
323         return ocfs2_validate_gd_self(sb, bh, 0);
324 }
325
326 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
327                                 u64 gd_blkno, struct buffer_head **bh)
328 {
329         int rc;
330         struct buffer_head *tmp = *bh;
331
332         rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
333                               ocfs2_validate_group_descriptor);
334         if (rc)
335                 goto out;
336
337         rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
338         if (rc) {
339                 brelse(tmp);
340                 goto out;
341         }
342
343         /* If ocfs2_read_block() got us a new bh, pass it up. */
344         if (!*bh)
345                 *bh = tmp;
346
347 out:
348         return rc;
349 }
350
351 static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
352                                           struct ocfs2_group_desc *bg,
353                                           struct ocfs2_chain_list *cl,
354                                           u64 p_blkno, unsigned int clusters)
355 {
356         struct ocfs2_extent_list *el = &bg->bg_list;
357         struct ocfs2_extent_rec *rec;
358
359         BUG_ON(!ocfs2_supports_discontig_bg(osb));
360         if (!el->l_next_free_rec)
361                 el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
362         rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
363         rec->e_blkno = cpu_to_le64(p_blkno);
364         rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
365                                   le16_to_cpu(cl->cl_bpc));
366         rec->e_leaf_clusters = cpu_to_le16(clusters);
367         le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
368         le16_add_cpu(&bg->bg_free_bits_count,
369                      clusters * le16_to_cpu(cl->cl_bpc));
370         le16_add_cpu(&el->l_next_free_rec, 1);
371 }
372
373 static int ocfs2_block_group_fill(handle_t *handle,
374                                   struct inode *alloc_inode,
375                                   struct buffer_head *bg_bh,
376                                   u64 group_blkno,
377                                   unsigned int group_clusters,
378                                   u16 my_chain,
379                                   struct ocfs2_chain_list *cl)
380 {
381         int status = 0;
382         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
383         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
384         struct super_block * sb = alloc_inode->i_sb;
385
386         if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
387                 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
388                             "b_blocknr (%llu)",
389                             (unsigned long long)group_blkno,
390                             (unsigned long long) bg_bh->b_blocknr);
391                 status = -EIO;
392                 goto bail;
393         }
394
395         status = ocfs2_journal_access_gd(handle,
396                                          INODE_CACHE(alloc_inode),
397                                          bg_bh,
398                                          OCFS2_JOURNAL_ACCESS_CREATE);
399         if (status < 0) {
400                 mlog_errno(status);
401                 goto bail;
402         }
403
404         memset(bg, 0, sb->s_blocksize);
405         strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
406         bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
407         bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
408                                                 osb->s_feature_incompat));
409         bg->bg_chain = cpu_to_le16(my_chain);
410         bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
411         bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
412         bg->bg_blkno = cpu_to_le64(group_blkno);
413         if (group_clusters == le16_to_cpu(cl->cl_cpg))
414                 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
415         else
416                 ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
417                                               group_clusters);
418
419         /* set the 1st bit in the bitmap to account for the descriptor block */
420         ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
421         bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
422
423         ocfs2_journal_dirty(handle, bg_bh);
424
425         /* There is no need to zero out or otherwise initialize the
426          * other blocks in a group - All valid FS metadata in a block
427          * group stores the superblock fs_generation value at
428          * allocation time. */
429
430 bail:
431         if (status)
432                 mlog_errno(status);
433         return status;
434 }
435
436 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
437 {
438         u16 curr, best;
439
440         best = curr = 0;
441         while (curr < le16_to_cpu(cl->cl_count)) {
442                 if (le32_to_cpu(cl->cl_recs[best].c_total) >
443                     le32_to_cpu(cl->cl_recs[curr].c_total))
444                         best = curr;
445                 curr++;
446         }
447         return best;
448 }
449
450 static struct buffer_head *
451 ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
452                                struct inode *alloc_inode,
453                                struct ocfs2_alloc_context *ac,
454                                struct ocfs2_chain_list *cl)
455 {
456         int status;
457         u32 bit_off, num_bits;
458         u64 bg_blkno;
459         struct buffer_head *bg_bh;
460         unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
461
462         status = ocfs2_claim_clusters(handle, ac,
463                                       le16_to_cpu(cl->cl_cpg), &bit_off,
464                                       &num_bits);
465         if (status < 0) {
466                 if (status != -ENOSPC)
467                         mlog_errno(status);
468                 goto bail;
469         }
470
471         /* setup the group */
472         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
473         trace_ocfs2_block_group_alloc_contig(
474              (unsigned long long)bg_blkno, alloc_rec);
475
476         bg_bh = sb_getblk(osb->sb, bg_blkno);
477         if (!bg_bh) {
478                 status = -ENOMEM;
479                 mlog_errno(status);
480                 goto bail;
481         }
482         ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
483
484         status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
485                                         bg_blkno, num_bits, alloc_rec, cl);
486         if (status < 0) {
487                 brelse(bg_bh);
488                 mlog_errno(status);
489         }
490
491 bail:
492         return status ? ERR_PTR(status) : bg_bh;
493 }
494
495 static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
496                                         handle_t *handle,
497                                         struct ocfs2_alloc_context *ac,
498                                         unsigned int min_bits,
499                                         u32 *bit_off, u32 *num_bits)
500 {
501         int status = 0;
502
503         while (min_bits) {
504                 status = ocfs2_claim_clusters(handle, ac, min_bits,
505                                               bit_off, num_bits);
506                 if (status != -ENOSPC)
507                         break;
508
509                 min_bits >>= 1;
510         }
511
512         return status;
513 }
514
515 static int ocfs2_block_group_grow_discontig(handle_t *handle,
516                                             struct inode *alloc_inode,
517                                             struct buffer_head *bg_bh,
518                                             struct ocfs2_alloc_context *ac,
519                                             struct ocfs2_chain_list *cl,
520                                             unsigned int min_bits)
521 {
522         int status;
523         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
524         struct ocfs2_group_desc *bg =
525                 (struct ocfs2_group_desc *)bg_bh->b_data;
526         unsigned int needed = le16_to_cpu(cl->cl_cpg) -
527                          le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
528         u32 p_cpos, clusters;
529         u64 p_blkno;
530         struct ocfs2_extent_list *el = &bg->bg_list;
531
532         status = ocfs2_journal_access_gd(handle,
533                                          INODE_CACHE(alloc_inode),
534                                          bg_bh,
535                                          OCFS2_JOURNAL_ACCESS_CREATE);
536         if (status < 0) {
537                 mlog_errno(status);
538                 goto bail;
539         }
540
541         while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
542                                 le16_to_cpu(el->l_count))) {
543                 if (min_bits > needed)
544                         min_bits = needed;
545                 status = ocfs2_block_group_claim_bits(osb, handle, ac,
546                                                       min_bits, &p_cpos,
547                                                       &clusters);
548                 if (status < 0) {
549                         if (status != -ENOSPC)
550                                 mlog_errno(status);
551                         goto bail;
552                 }
553                 p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
554                 ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
555                                               clusters);
556
557                 min_bits = clusters;
558                 needed = le16_to_cpu(cl->cl_cpg) -
559                          le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
560         }
561
562         if (needed > 0) {
563                 /*
564                  * We have used up all the extent rec but can't fill up
565                  * the cpg. So bail out.
566                  */
567                 status = -ENOSPC;
568                 goto bail;
569         }
570
571         ocfs2_journal_dirty(handle, bg_bh);
572
573 bail:
574         return status;
575 }
576
577 static void ocfs2_bg_alloc_cleanup(handle_t *handle,
578                                    struct ocfs2_alloc_context *cluster_ac,
579                                    struct inode *alloc_inode,
580                                    struct buffer_head *bg_bh)
581 {
582         int i, ret;
583         struct ocfs2_group_desc *bg;
584         struct ocfs2_extent_list *el;
585         struct ocfs2_extent_rec *rec;
586
587         if (!bg_bh)
588                 return;
589
590         bg = (struct ocfs2_group_desc *)bg_bh->b_data;
591         el = &bg->bg_list;
592         for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
593                 rec = &el->l_recs[i];
594                 ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
595                                           cluster_ac->ac_bh,
596                                           le64_to_cpu(rec->e_blkno),
597                                           le16_to_cpu(rec->e_leaf_clusters));
598                 if (ret)
599                         mlog_errno(ret);
600                 /* Try all the clusters to free */
601         }
602
603         ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
604         brelse(bg_bh);
605 }
606
607 static struct buffer_head *
608 ocfs2_block_group_alloc_discontig(handle_t *handle,
609                                   struct inode *alloc_inode,
610                                   struct ocfs2_alloc_context *ac,
611                                   struct ocfs2_chain_list *cl)
612 {
613         int status;
614         u32 bit_off, num_bits;
615         u64 bg_blkno;
616         unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
617         struct buffer_head *bg_bh = NULL;
618         unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
619         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
620
621         if (!ocfs2_supports_discontig_bg(osb)) {
622                 status = -ENOSPC;
623                 goto bail;
624         }
625
626         status = ocfs2_extend_trans(handle,
627                                     ocfs2_calc_bg_discontig_credits(osb->sb));
628         if (status) {
629                 mlog_errno(status);
630                 goto bail;
631         }
632
633         /*
634          * We're going to be grabbing from multiple cluster groups.
635          * We don't have enough credits to relink them all, and the
636          * cluster groups will be staying in cache for the duration of
637          * this operation.
638          */
639         ac->ac_disable_chain_relink = 1;
640
641         /* Claim the first region */
642         status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
643                                               &bit_off, &num_bits);
644         if (status < 0) {
645                 if (status != -ENOSPC)
646                         mlog_errno(status);
647                 goto bail;
648         }
649         min_bits = num_bits;
650
651         /* setup the group */
652         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
653         trace_ocfs2_block_group_alloc_discontig(
654                                 (unsigned long long)bg_blkno, alloc_rec);
655
656         bg_bh = sb_getblk(osb->sb, bg_blkno);
657         if (!bg_bh) {
658                 status = -ENOMEM;
659                 mlog_errno(status);
660                 goto bail;
661         }
662         ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
663
664         status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
665                                         bg_blkno, num_bits, alloc_rec, cl);
666         if (status < 0) {
667                 mlog_errno(status);
668                 goto bail;
669         }
670
671         status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
672                                                   bg_bh, ac, cl, min_bits);
673         if (status)
674                 mlog_errno(status);
675
676 bail:
677         if (status)
678                 ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
679         return status ? ERR_PTR(status) : bg_bh;
680 }
681
682 /*
683  * We expect the block group allocator to already be locked.
684  */
685 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
686                                    struct inode *alloc_inode,
687                                    struct buffer_head *bh,
688                                    u64 max_block,
689                                    u64 *last_alloc_group,
690                                    int flags)
691 {
692         int status, credits;
693         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
694         struct ocfs2_chain_list *cl;
695         struct ocfs2_alloc_context *ac = NULL;
696         handle_t *handle = NULL;
697         u16 alloc_rec;
698         struct buffer_head *bg_bh = NULL;
699         struct ocfs2_group_desc *bg;
700
701         BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
702
703         cl = &fe->id2.i_chain;
704         status = ocfs2_reserve_clusters_with_limit(osb,
705                                                    le16_to_cpu(cl->cl_cpg),
706                                                    max_block, flags, &ac);
707         if (status < 0) {
708                 if (status != -ENOSPC)
709                         mlog_errno(status);
710                 goto bail;
711         }
712
713         credits = ocfs2_calc_group_alloc_credits(osb->sb,
714                                                  le16_to_cpu(cl->cl_cpg));
715         handle = ocfs2_start_trans(osb, credits);
716         if (IS_ERR(handle)) {
717                 status = PTR_ERR(handle);
718                 handle = NULL;
719                 mlog_errno(status);
720                 goto bail;
721         }
722
723         if (last_alloc_group && *last_alloc_group != 0) {
724                 trace_ocfs2_block_group_alloc(
725                                 (unsigned long long)*last_alloc_group);
726                 ac->ac_last_group = *last_alloc_group;
727         }
728
729         bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
730                                                ac, cl);
731         if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
732                 bg_bh = ocfs2_block_group_alloc_discontig(handle,
733                                                           alloc_inode,
734                                                           ac, cl);
735         if (IS_ERR(bg_bh)) {
736                 status = PTR_ERR(bg_bh);
737                 bg_bh = NULL;
738                 if (status != -ENOSPC)
739                         mlog_errno(status);
740                 goto bail;
741         }
742         bg = (struct ocfs2_group_desc *) bg_bh->b_data;
743
744         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
745                                          bh, OCFS2_JOURNAL_ACCESS_WRITE);
746         if (status < 0) {
747                 mlog_errno(status);
748                 goto bail;
749         }
750
751         alloc_rec = le16_to_cpu(bg->bg_chain);
752         le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
753                      le16_to_cpu(bg->bg_free_bits_count));
754         le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
755                      le16_to_cpu(bg->bg_bits));
756         cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
757         if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
758                 le16_add_cpu(&cl->cl_next_free_rec, 1);
759
760         le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
761                                         le16_to_cpu(bg->bg_free_bits_count));
762         le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
763         le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
764
765         ocfs2_journal_dirty(handle, bh);
766
767         spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
768         OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
769         fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
770                                              le32_to_cpu(fe->i_clusters)));
771         spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
772         i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
773         alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
774
775         status = 0;
776
777         /* save the new last alloc group so that the caller can cache it. */
778         if (last_alloc_group)
779                 *last_alloc_group = ac->ac_last_group;
780
781 bail:
782         if (handle)
783                 ocfs2_commit_trans(osb, handle);
784
785         if (ac)
786                 ocfs2_free_alloc_context(ac);
787
788         brelse(bg_bh);
789
790         if (status)
791                 mlog_errno(status);
792         return status;
793 }
794
795 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
796                                        struct ocfs2_alloc_context *ac,
797                                        int type,
798                                        u32 slot,
799                                        u64 *last_alloc_group,
800                                        int flags)
801 {
802         int status;
803         u32 bits_wanted = ac->ac_bits_wanted;
804         struct inode *alloc_inode;
805         struct buffer_head *bh = NULL;
806         struct ocfs2_dinode *fe;
807         u32 free_bits;
808
809         alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
810         if (!alloc_inode) {
811                 mlog_errno(-EINVAL);
812                 return -EINVAL;
813         }
814
815         mutex_lock(&alloc_inode->i_mutex);
816
817         status = ocfs2_inode_lock(alloc_inode, &bh, 1);
818         if (status < 0) {
819                 mutex_unlock(&alloc_inode->i_mutex);
820                 iput(alloc_inode);
821
822                 mlog_errno(status);
823                 return status;
824         }
825
826         ac->ac_inode = alloc_inode;
827         ac->ac_alloc_slot = slot;
828
829         fe = (struct ocfs2_dinode *) bh->b_data;
830
831         /* The bh was validated by the inode read inside
832          * ocfs2_inode_lock().  Any corruption is a code bug. */
833         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
834
835         if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
836                 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
837                             (unsigned long long)le64_to_cpu(fe->i_blkno));
838                 status = -EIO;
839                 goto bail;
840         }
841
842         free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
843                 le32_to_cpu(fe->id1.bitmap1.i_used);
844
845         if (bits_wanted > free_bits) {
846                 /* cluster bitmap never grows */
847                 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
848                         trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
849                                                                 free_bits);
850                         status = -ENOSPC;
851                         goto bail;
852                 }
853
854                 if (!(flags & ALLOC_NEW_GROUP)) {
855                         trace_ocfs2_reserve_suballoc_bits_no_new_group(
856                                                 slot, bits_wanted, free_bits);
857                         status = -ENOSPC;
858                         goto bail;
859                 }
860
861                 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
862                                                  ac->ac_max_block,
863                                                  last_alloc_group, flags);
864                 if (status < 0) {
865                         if (status != -ENOSPC)
866                                 mlog_errno(status);
867                         goto bail;
868                 }
869                 atomic_inc(&osb->alloc_stats.bg_extends);
870
871                 /* You should never ask for this much metadata */
872                 BUG_ON(bits_wanted >
873                        (le32_to_cpu(fe->id1.bitmap1.i_total)
874                         - le32_to_cpu(fe->id1.bitmap1.i_used)));
875         }
876
877         get_bh(bh);
878         ac->ac_bh = bh;
879 bail:
880         brelse(bh);
881
882         if (status)
883                 mlog_errno(status);
884         return status;
885 }
886
887 static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
888 {
889         spin_lock(&osb->osb_lock);
890         osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
891         spin_unlock(&osb->osb_lock);
892         atomic_set(&osb->s_num_inodes_stolen, 0);
893 }
894
895 static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
896 {
897         spin_lock(&osb->osb_lock);
898         osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
899         spin_unlock(&osb->osb_lock);
900         atomic_set(&osb->s_num_meta_stolen, 0);
901 }
902
/*
 * Reset both steal slots (inode first, then metadata) and their stolen
 * counters.
 */
void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
        ocfs2_init_inode_steal_slot(osb);
        ocfs2_init_meta_steal_slot(osb);
}
908
909 static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
910 {
911         spin_lock(&osb->osb_lock);
912         if (type == INODE_ALLOC_SYSTEM_INODE)
913                 osb->s_inode_steal_slot = slot;
914         else if (type == EXTENT_ALLOC_SYSTEM_INODE)
915                 osb->s_meta_steal_slot = slot;
916         spin_unlock(&osb->osb_lock);
917 }
918
919 static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
920 {
921         int slot = OCFS2_INVALID_SLOT;
922
923         spin_lock(&osb->osb_lock);
924         if (type == INODE_ALLOC_SYSTEM_INODE)
925                 slot = osb->s_inode_steal_slot;
926         else if (type == EXTENT_ALLOC_SYSTEM_INODE)
927                 slot = osb->s_meta_steal_slot;
928         spin_unlock(&osb->osb_lock);
929
930         return slot;
931 }
932
933 static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
934 {
935         return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
936 }
937
938 static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
939 {
940         return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
941 }
942
943 static int ocfs2_steal_resource(struct ocfs2_super *osb,
944                                 struct ocfs2_alloc_context *ac,
945                                 int type)
946 {
947         int i, status = -ENOSPC;
948         int slot = __ocfs2_get_steal_slot(osb, type);
949
950         /* Start to steal resource from the first slot after ours. */
951         if (slot == OCFS2_INVALID_SLOT)
952                 slot = osb->slot_num + 1;
953
954         for (i = 0; i < osb->max_slots; i++, slot++) {
955                 if (slot == osb->max_slots)
956                         slot = 0;
957
958                 if (slot == osb->slot_num)
959                         continue;
960
961                 status = ocfs2_reserve_suballoc_bits(osb, ac,
962                                                      type,
963                                                      (u32)slot, NULL,
964                                                      NOT_ALLOC_NEW_GROUP);
965                 if (status >= 0) {
966                         __ocfs2_set_steal_slot(osb, slot, type);
967                         break;
968                 }
969
970                 ocfs2_free_ac_resource(ac);
971         }
972
973         return status;
974 }
975
976 static int ocfs2_steal_inode(struct ocfs2_super *osb,
977                              struct ocfs2_alloc_context *ac)
978 {
979         return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
980 }
981
982 static int ocfs2_steal_meta(struct ocfs2_super *osb,
983                             struct ocfs2_alloc_context *ac)
984 {
985         return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
986 }
987
988 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
989                                       int blocks,
990                                       struct ocfs2_alloc_context **ac)
991 {
992         int status;
993         int slot = ocfs2_get_meta_steal_slot(osb);
994
995         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
996         if (!(*ac)) {
997                 status = -ENOMEM;
998                 mlog_errno(status);
999                 goto bail;
1000         }
1001
1002         (*ac)->ac_bits_wanted = blocks;
1003         (*ac)->ac_which = OCFS2_AC_USE_META;
1004         (*ac)->ac_group_search = ocfs2_block_group_search;
1005
1006         if (slot != OCFS2_INVALID_SLOT &&
1007                 atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
1008                 goto extent_steal;
1009
1010         atomic_set(&osb->s_num_meta_stolen, 0);
1011         status = ocfs2_reserve_suballoc_bits(osb, (*ac),
1012                                              EXTENT_ALLOC_SYSTEM_INODE,
1013                                              (u32)osb->slot_num, NULL,
1014                                              ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
1015
1016
1017         if (status >= 0) {
1018                 status = 0;
1019                 if (slot != OCFS2_INVALID_SLOT)
1020                         ocfs2_init_meta_steal_slot(osb);
1021                 goto bail;
1022         } else if (status < 0 && status != -ENOSPC) {
1023                 mlog_errno(status);
1024                 goto bail;
1025         }
1026
1027         ocfs2_free_ac_resource(*ac);
1028
1029 extent_steal:
1030         status = ocfs2_steal_meta(osb, *ac);
1031         atomic_inc(&osb->s_num_meta_stolen);
1032         if (status < 0) {
1033                 if (status != -ENOSPC)
1034                         mlog_errno(status);
1035                 goto bail;
1036         }
1037
1038         status = 0;
1039 bail:
1040         if ((status < 0) && *ac) {
1041                 ocfs2_free_alloc_context(*ac);
1042                 *ac = NULL;
1043         }
1044
1045         if (status)
1046                 mlog_errno(status);
1047         return status;
1048 }
1049
/*
 * Reserve the number of metadata blocks that ocfs2_extend_meta_needed()
 * reports for @root_el.  See ocfs2_reserve_new_metadata_blocks() for the
 * meaning of @ac and the return value.
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
                               struct ocfs2_extent_list *root_el,
                               struct ocfs2_alloc_context **ac)
{
        int blocks = ocfs2_extend_meta_needed(root_el);

        return ocfs2_reserve_new_metadata_blocks(osb, blocks, ac);
}
1058
1059 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
1060                             struct ocfs2_alloc_context **ac)
1061 {
1062         int status;
1063         int slot = ocfs2_get_inode_steal_slot(osb);
1064         u64 alloc_group;
1065
1066         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1067         if (!(*ac)) {
1068                 status = -ENOMEM;
1069                 mlog_errno(status);
1070                 goto bail;
1071         }
1072
1073         (*ac)->ac_bits_wanted = 1;
1074         (*ac)->ac_which = OCFS2_AC_USE_INODE;
1075
1076         (*ac)->ac_group_search = ocfs2_block_group_search;
1077
1078         /*
1079          * stat(2) can't handle i_ino > 32bits, so we tell the
1080          * lower levels not to allocate us a block group past that
1081          * limit.  The 'inode64' mount option avoids this behavior.
1082          */
1083         if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
1084                 (*ac)->ac_max_block = (u32)~0U;
1085
1086         /*
1087          * slot is set when we successfully steal inode from other nodes.
1088          * It is reset in 3 places:
1089          * 1. when we flush the truncate log
1090          * 2. when we complete local alloc recovery.
1091          * 3. when we successfully allocate from our own slot.
1092          * After it is set, we will go on stealing inodes until we find the
1093          * need to check our slots to see whether there is some space for us.
1094          */
1095         if (slot != OCFS2_INVALID_SLOT &&
1096             atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
1097                 goto inode_steal;
1098
1099         atomic_set(&osb->s_num_inodes_stolen, 0);
1100         alloc_group = osb->osb_inode_alloc_group;
1101         status = ocfs2_reserve_suballoc_bits(osb, *ac,
1102                                              INODE_ALLOC_SYSTEM_INODE,
1103                                              (u32)osb->slot_num,
1104                                              &alloc_group,
1105                                              ALLOC_NEW_GROUP |
1106                                              ALLOC_GROUPS_FROM_GLOBAL);
1107         if (status >= 0) {
1108                 status = 0;
1109
1110                 spin_lock(&osb->osb_lock);
1111                 osb->osb_inode_alloc_group = alloc_group;
1112                 spin_unlock(&osb->osb_lock);
1113                 trace_ocfs2_reserve_new_inode_new_group(
1114                         (unsigned long long)alloc_group);
1115
1116                 /*
1117                  * Some inodes must be freed by us, so try to allocate
1118                  * from our own next time.
1119                  */
1120                 if (slot != OCFS2_INVALID_SLOT)
1121                         ocfs2_init_inode_steal_slot(osb);
1122                 goto bail;
1123         } else if (status < 0 && status != -ENOSPC) {
1124                 mlog_errno(status);
1125                 goto bail;
1126         }
1127
1128         ocfs2_free_ac_resource(*ac);
1129
1130 inode_steal:
1131         status = ocfs2_steal_inode(osb, *ac);
1132         atomic_inc(&osb->s_num_inodes_stolen);
1133         if (status < 0) {
1134                 if (status != -ENOSPC)
1135                         mlog_errno(status);
1136                 goto bail;
1137         }
1138
1139         status = 0;
1140 bail:
1141         if ((status < 0) && *ac) {
1142                 ocfs2_free_alloc_context(*ac);
1143                 *ac = NULL;
1144         }
1145
1146         if (status)
1147                 mlog_errno(status);
1148         return status;
1149 }
1150
1151 /* local alloc code has to do the same thing, so rather than do this
1152  * twice.. */
1153 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
1154                                       struct ocfs2_alloc_context *ac)
1155 {
1156         int status;
1157
1158         ac->ac_which = OCFS2_AC_USE_MAIN;
1159         ac->ac_group_search = ocfs2_cluster_group_search;
1160
1161         status = ocfs2_reserve_suballoc_bits(osb, ac,
1162                                              GLOBAL_BITMAP_SYSTEM_INODE,
1163                                              OCFS2_INVALID_SLOT, NULL,
1164                                              ALLOC_NEW_GROUP);
1165         if (status < 0 && status != -ENOSPC) {
1166                 mlog_errno(status);
1167                 goto bail;
1168         }
1169
1170 bail:
1171         return status;
1172 }
1173
1174 /* Callers don't need to care which bitmap (local alloc or main) to
1175  * use so we figure it out for them, but unfortunately this clutters
1176  * things a bit. */
1177 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
1178                                              u32 bits_wanted, u64 max_block,
1179                                              int flags,
1180                                              struct ocfs2_alloc_context **ac)
1181 {
1182         int status;
1183
1184         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1185         if (!(*ac)) {
1186                 status = -ENOMEM;
1187                 mlog_errno(status);
1188                 goto bail;
1189         }
1190
1191         (*ac)->ac_bits_wanted = bits_wanted;
1192         (*ac)->ac_max_block = max_block;
1193
1194         status = -ENOSPC;
1195         if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
1196             ocfs2_alloc_should_use_local(osb, bits_wanted)) {
1197                 status = ocfs2_reserve_local_alloc_bits(osb,
1198                                                         bits_wanted,
1199                                                         *ac);
1200                 if ((status < 0) && (status != -ENOSPC)) {
1201                         mlog_errno(status);
1202                         goto bail;
1203                 }
1204         }
1205
1206         if (status == -ENOSPC) {
1207                 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
1208                 if (status < 0) {
1209                         if (status != -ENOSPC)
1210                                 mlog_errno(status);
1211                         goto bail;
1212                 }
1213         }
1214
1215         status = 0;
1216 bail:
1217         if ((status < 0) && *ac) {
1218                 ocfs2_free_alloc_context(*ac);
1219                 *ac = NULL;
1220         }
1221
1222         if (status)
1223                 mlog_errno(status);
1224         return status;
1225 }
1226
1227 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
1228                            u32 bits_wanted,
1229                            struct ocfs2_alloc_context **ac)
1230 {
1231         return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
1232                                                  ALLOC_NEW_GROUP, ac);
1233 }
1234
1235 /*
1236  * More or less lifted from ext3. I'll leave their description below:
1237  *
1238  * "For ext3 allocations, we must not reuse any blocks which are
1239  * allocated in the bitmap buffer's "last committed data" copy.  This
1240  * prevents deletes from freeing up the page for reuse until we have
1241  * committed the delete transaction.
1242  *
1243  * If we didn't do this, then deleting something and reallocating it as
1244  * data would allow the old block to be overwritten before the
1245  * transaction committed (because we force data to disk before commit).
1246  * This would lead to corruption if we crashed between overwriting the
1247  * data and committing the delete.
1248  *
1249  * @@@ We may want to make this allocation behaviour conditional on
1250  * data-writes at some point, and disable it for metadata allocations or
1251  * sync-data inodes."
1252  *
1253  * Note: OCFS2 already does this differently for metadata vs data
1254  * allocations, as those bitmaps are separate and undo access is never
1255  * called on a metadata group descriptor.
1256  */
1257 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
1258                                          int nr)
1259 {
1260         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1261         int ret;
1262
1263         if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
1264                 return 0;
1265
1266         if (!buffer_jbd(bg_bh))
1267                 return 1;
1268
1269         jbd_lock_bh_state(bg_bh);
1270         bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
1271         if (bg)
1272                 ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
1273         else
1274                 ret = 1;
1275         jbd_unlock_bh_state(bg_bh);
1276
1277         return ret;
1278 }
1279
1280 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1281                                              struct buffer_head *bg_bh,
1282                                              unsigned int bits_wanted,
1283                                              unsigned int total_bits,
1284                                              struct ocfs2_suballoc_result *res)
1285 {
1286         void *bitmap;
1287         u16 best_offset, best_size;
1288         int offset, start, found, status = 0;
1289         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1290
1291         /* Callers got this descriptor from
1292          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1293         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1294
1295         found = start = best_offset = best_size = 0;
1296         bitmap = bg->bg_bitmap;
1297
1298         while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
1299                 if (offset == total_bits)
1300                         break;
1301
1302                 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
1303                         /* We found a zero, but we can't use it as it
1304                          * hasn't been put to disk yet! */
1305                         found = 0;
1306                         start = offset + 1;
1307                 } else if (offset == start) {
1308                         /* we found a zero */
1309                         found++;
1310                         /* move start to the next bit to test */
1311                         start++;
1312                 } else {
1313                         /* got a zero after some ones */
1314                         found = 1;
1315                         start = offset + 1;
1316                 }
1317                 if (found > best_size) {
1318                         best_size = found;
1319                         best_offset = start - found;
1320                 }
1321                 /* we got everything we needed */
1322                 if (found == bits_wanted) {
1323                         /* mlog(0, "Found it all!\n"); */
1324                         break;
1325                 }
1326         }
1327
1328         if (best_size) {
1329                 res->sr_bit_offset = best_offset;
1330                 res->sr_bits = best_size;
1331         } else {
1332                 status = -ENOSPC;
1333                 /* No error log here -- see the comment above
1334                  * ocfs2_test_bg_bit_allocatable */
1335         }
1336
1337         return status;
1338 }
1339
1340 int ocfs2_block_group_set_bits(handle_t *handle,
1341                                              struct inode *alloc_inode,
1342                                              struct ocfs2_group_desc *bg,
1343                                              struct buffer_head *group_bh,
1344                                              unsigned int bit_off,
1345                                              unsigned int num_bits)
1346 {
1347         int status;
1348         void *bitmap = bg->bg_bitmap;
1349         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1350
1351         /* All callers get the descriptor via
1352          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1353         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1354         BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
1355
1356         trace_ocfs2_block_group_set_bits(bit_off, num_bits);
1357
1358         if (ocfs2_is_cluster_bitmap(alloc_inode))
1359                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1360
1361         status = ocfs2_journal_access_gd(handle,
1362                                          INODE_CACHE(alloc_inode),
1363                                          group_bh,
1364                                          journal_type);
1365         if (status < 0) {
1366                 mlog_errno(status);
1367                 goto bail;
1368         }
1369
1370         le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1371         if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
1372                 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
1373                             " count %u but claims %u are freed. num_bits %d",
1374                             (unsigned long long)le64_to_cpu(bg->bg_blkno),
1375                             le16_to_cpu(bg->bg_bits),
1376                             le16_to_cpu(bg->bg_free_bits_count), num_bits);
1377                 return -EROFS;
1378         }
1379         while(num_bits--)
1380                 ocfs2_set_bit(bit_off++, bitmap);
1381
1382         ocfs2_journal_dirty(handle, group_bh);
1383
1384 bail:
1385         return status;
1386 }
1387
1388 /* find the one with the most empty bits */
1389 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1390 {
1391         u16 curr, best;
1392
1393         BUG_ON(!cl->cl_next_free_rec);
1394
1395         best = curr = 0;
1396         while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1397                 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1398                     le32_to_cpu(cl->cl_recs[best].c_free))
1399                         best = curr;
1400                 curr++;
1401         }
1402
1403         BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1404         return best;
1405 }
1406
1407 static int ocfs2_relink_block_group(handle_t *handle,
1408                                     struct inode *alloc_inode,
1409                                     struct buffer_head *fe_bh,
1410                                     struct buffer_head *bg_bh,
1411                                     struct buffer_head *prev_bg_bh,
1412                                     u16 chain)
1413 {
1414         int status;
1415         /* there is a really tiny chance the journal calls could fail,
1416          * but we wouldn't want inconsistent blocks in *any* case. */
1417         u64 bg_ptr, prev_bg_ptr;
1418         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1419         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1420         struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1421
1422         /* The caller got these descriptors from
1423          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1424         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1425         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1426
1427         trace_ocfs2_relink_block_group(
1428                 (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1429                 (unsigned long long)le64_to_cpu(bg->bg_blkno),
1430                 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1431
1432         bg_ptr = le64_to_cpu(bg->bg_next_group);
1433         prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1434
1435         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1436                                          prev_bg_bh,
1437                                          OCFS2_JOURNAL_ACCESS_WRITE);
1438         if (status < 0)
1439                 goto out;
1440
1441         prev_bg->bg_next_group = bg->bg_next_group;
1442         ocfs2_journal_dirty(handle, prev_bg_bh);
1443
1444         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1445                                          bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1446         if (status < 0)
1447                 goto out_rollback_prev_bg;
1448
1449         bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1450         ocfs2_journal_dirty(handle, bg_bh);
1451
1452         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1453                                          fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1454         if (status < 0)
1455                 goto out_rollback_bg;
1456
1457         fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1458         ocfs2_journal_dirty(handle, fe_bh);
1459
1460 out:
1461         if (status < 0)
1462                 mlog_errno(status);
1463         return status;
1464
1465 out_rollback_bg:
1466         bg->bg_next_group = cpu_to_le64(bg_ptr);
1467 out_rollback_prev_bg:
1468         prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1469         goto out;
1470 }
1471
1472 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1473                                                      u32 wanted)
1474 {
1475         return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1476 }
1477
1478 /* return 0 on success, -ENOSPC to keep searching and any other < 0
1479  * value on error. */
1480 static int ocfs2_cluster_group_search(struct inode *inode,
1481                                       struct buffer_head *group_bh,
1482                                       u32 bits_wanted, u32 min_bits,
1483                                       u64 max_block,
1484                                       struct ocfs2_suballoc_result *res)
1485 {
1486         int search = -ENOSPC;
1487         int ret;
1488         u64 blkoff;
1489         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1490         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1491         unsigned int max_bits, gd_cluster_off;
1492
1493         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1494
1495         if (gd->bg_free_bits_count) {
1496                 max_bits = le16_to_cpu(gd->bg_bits);
1497
1498                 /* Tail groups in cluster bitmaps which aren't cpg
1499                  * aligned are prone to partial extension by a failed
1500                  * fs resize. If the file system resize never got to
1501                  * update the dinode cluster count, then we don't want
1502                  * to trust any clusters past it, regardless of what
1503                  * the group descriptor says. */
1504                 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1505                                                           le64_to_cpu(gd->bg_blkno));
1506                 if ((gd_cluster_off + max_bits) >
1507                     OCFS2_I(inode)->ip_clusters) {
1508                         max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1509                         trace_ocfs2_cluster_group_search_wrong_max_bits(
1510                                 (unsigned long long)le64_to_cpu(gd->bg_blkno),
1511                                 le16_to_cpu(gd->bg_bits),
1512                                 OCFS2_I(inode)->ip_clusters, max_bits);
1513                 }
1514
1515                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1516                                                         group_bh, bits_wanted,
1517                                                         max_bits, res);
1518                 if (ret)
1519                         return ret;
1520
1521                 if (max_block) {
1522                         blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1523                                                           gd_cluster_off +
1524                                                           res->sr_bit_offset +
1525                                                           res->sr_bits);
1526                         trace_ocfs2_cluster_group_search_max_block(
1527                                 (unsigned long long)blkoff,
1528                                 (unsigned long long)max_block);
1529                         if (blkoff > max_block)
1530                                 return -ENOSPC;
1531                 }
1532
1533                 /* ocfs2_block_group_find_clear_bits() might
1534                  * return success, but we still want to return
1535                  * -ENOSPC unless it found the minimum number
1536                  * of bits. */
1537                 if (min_bits <= res->sr_bits)
1538                         search = 0; /* success */
1539                 else if (res->sr_bits) {
1540                         /*
1541                          * Don't show bits which we'll be returning
1542                          * for allocation to the local alloc bitmap.
1543                          */
1544                         ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
1545                 }
1546         }
1547
1548         return search;
1549 }
1550
1551 static int ocfs2_block_group_search(struct inode *inode,
1552                                     struct buffer_head *group_bh,
1553                                     u32 bits_wanted, u32 min_bits,
1554                                     u64 max_block,
1555                                     struct ocfs2_suballoc_result *res)
1556 {
1557         int ret = -ENOSPC;
1558         u64 blkoff;
1559         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1560
1561         BUG_ON(min_bits != 1);
1562         BUG_ON(ocfs2_is_cluster_bitmap(inode));
1563
1564         if (bg->bg_free_bits_count) {
1565                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1566                                                         group_bh, bits_wanted,
1567                                                         le16_to_cpu(bg->bg_bits),
1568                                                         res);
1569                 if (!ret && max_block) {
1570                         blkoff = le64_to_cpu(bg->bg_blkno) +
1571                                 res->sr_bit_offset + res->sr_bits;
1572                         trace_ocfs2_block_group_search_max_block(
1573                                 (unsigned long long)blkoff,
1574                                 (unsigned long long)max_block);
1575                         if (blkoff > max_block)
1576                                 ret = -ENOSPC;
1577                 }
1578         }
1579
1580         return ret;
1581 }
1582
1583 int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1584                                        handle_t *handle,
1585                                        struct buffer_head *di_bh,
1586                                        u32 num_bits,
1587                                        u16 chain)
1588 {
1589         int ret;
1590         u32 tmp_used;
1591         struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1592         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1593
1594         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1595                                       OCFS2_JOURNAL_ACCESS_WRITE);
1596         if (ret < 0) {
1597                 mlog_errno(ret);
1598                 goto out;
1599         }
1600
1601         tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1602         di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1603         le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1604         ocfs2_journal_dirty(handle, di_bh);
1605
1606 out:
1607         return ret;
1608 }
1609
1610 static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1611                                          struct ocfs2_extent_rec *rec,
1612                                          struct ocfs2_chain_list *cl)
1613 {
1614         unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1615         unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1616         unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;
1617
1618         if (res->sr_bit_offset < bitoff)
1619                 return 0;
1620         if (res->sr_bit_offset >= (bitoff + bitcount))
1621                 return 0;
1622         res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1623                 (res->sr_bit_offset - bitoff);
1624         if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1625                 res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1626         return 1;
1627 }
1628
1629 static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1630                                           struct ocfs2_group_desc *bg,
1631                                           struct ocfs2_suballoc_result *res)
1632 {
1633         int i;
1634         u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
1635         struct ocfs2_extent_rec *rec;
1636         struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1637         struct ocfs2_chain_list *cl = &di->id2.i_chain;
1638
1639         if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1640                 res->sr_blkno = 0;
1641                 return;
1642         }
1643
1644         res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1645         res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
1646         if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
1647             !bg->bg_list.l_next_free_rec)
1648                 return;
1649
1650         for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1651                 rec = &bg->bg_list.l_recs[i];
1652                 if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1653                         res->sr_bg_blkno = bg_blkno;  /* Restore */
1654                         break;
1655                 }
1656         }
1657 }
1658
1659 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1660                                   handle_t *handle,
1661                                   u32 bits_wanted,
1662                                   u32 min_bits,
1663                                   struct ocfs2_suballoc_result *res,
1664                                   u16 *bits_left)
1665 {
1666         int ret;
1667         struct buffer_head *group_bh = NULL;
1668         struct ocfs2_group_desc *gd;
1669         struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1670         struct inode *alloc_inode = ac->ac_inode;
1671
1672         ret = ocfs2_read_group_descriptor(alloc_inode, di,
1673                                           res->sr_bg_blkno, &group_bh);
1674         if (ret < 0) {
1675                 mlog_errno(ret);
1676                 return ret;
1677         }
1678
1679         gd = (struct ocfs2_group_desc *) group_bh->b_data;
1680         ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1681                                   ac->ac_max_block, res);
1682         if (ret < 0) {
1683                 if (ret != -ENOSPC)
1684                         mlog_errno(ret);
1685                 goto out;
1686         }
1687
1688         if (!ret)
1689                 ocfs2_bg_discontig_fix_result(ac, gd, res);
1690
1691         /*
1692          * sr_bg_blkno might have been changed by
1693          * ocfs2_bg_discontig_fix_result
1694          */
1695         res->sr_bg_stable_blkno = group_bh->b_blocknr;
1696
1697         if (ac->ac_find_loc_only)
1698                 goto out_loc_only;
1699
1700         ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1701                                                res->sr_bits,
1702                                                le16_to_cpu(gd->bg_chain));
1703         if (ret < 0) {
1704                 mlog_errno(ret);
1705                 goto out;
1706         }
1707
1708         ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1709                                          res->sr_bit_offset, res->sr_bits);
1710         if (ret < 0)
1711                 mlog_errno(ret);
1712
1713 out_loc_only:
1714         *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1715
1716 out:
1717         brelse(group_bh);
1718
1719         return ret;
1720 }
1721
1722 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1723                               handle_t *handle,
1724                               u32 bits_wanted,
1725                               u32 min_bits,
1726                               struct ocfs2_suballoc_result *res,
1727                               u16 *bits_left)
1728 {
1729         int status;
1730         u16 chain;
1731         u64 next_group;
1732         struct inode *alloc_inode = ac->ac_inode;
1733         struct buffer_head *group_bh = NULL;
1734         struct buffer_head *prev_group_bh = NULL;
1735         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1736         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1737         struct ocfs2_group_desc *bg;
1738
1739         chain = ac->ac_chain;
1740         trace_ocfs2_search_chain_begin(
1741                 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
1742                 bits_wanted, chain);
1743
1744         status = ocfs2_read_group_descriptor(alloc_inode, fe,
1745                                              le64_to_cpu(cl->cl_recs[chain].c_blkno),
1746                                              &group_bh);
1747         if (status < 0) {
1748                 mlog_errno(status);
1749                 goto bail;
1750         }
1751         bg = (struct ocfs2_group_desc *) group_bh->b_data;
1752
1753         status = -ENOSPC;
1754         /* for now, the chain search is a bit simplistic. We just use
1755          * the 1st group with any empty bits. */
1756         while ((status = ac->ac_group_search(alloc_inode, group_bh,
1757                                              bits_wanted, min_bits,
1758                                              ac->ac_max_block,
1759                                              res)) == -ENOSPC) {
1760                 if (!bg->bg_next_group)
1761                         break;
1762
1763                 brelse(prev_group_bh);
1764                 prev_group_bh = NULL;
1765
1766                 next_group = le64_to_cpu(bg->bg_next_group);
1767                 prev_group_bh = group_bh;
1768                 group_bh = NULL;
1769                 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1770                                                      next_group, &group_bh);
1771                 if (status < 0) {
1772                         mlog_errno(status);
1773                         goto bail;
1774                 }
1775                 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1776         }
1777         if (status < 0) {
1778                 if (status != -ENOSPC)
1779                         mlog_errno(status);
1780                 goto bail;
1781         }
1782
1783         trace_ocfs2_search_chain_succ(
1784                 (unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);
1785
1786         res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1787
1788         BUG_ON(res->sr_bits == 0);
1789         if (!status)
1790                 ocfs2_bg_discontig_fix_result(ac, bg, res);
1791
1792         /*
1793          * sr_bg_blkno might have been changed by
1794          * ocfs2_bg_discontig_fix_result
1795          */
1796         res->sr_bg_stable_blkno = group_bh->b_blocknr;
1797
1798         /*
1799          * Keep track of previous block descriptor read. When
1800          * we find a target, if we have read more than X
1801          * number of descriptors, and the target is reasonably
1802          * empty, relink him to top of his chain.
1803          *
1804          * We've read 0 extra blocks and only send one more to
1805          * the transaction, yet the next guy to search has a
1806          * much easier time.
1807          *
1808          * Do this *after* figuring out how many bits we're taking out
1809          * of our target group.
1810          */
1811         if (!ac->ac_disable_chain_relink &&
1812             (prev_group_bh) &&
1813             (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1814                 status = ocfs2_relink_block_group(handle, alloc_inode,
1815                                                   ac->ac_bh, group_bh,
1816                                                   prev_group_bh, chain);
1817                 if (status < 0) {
1818                         mlog_errno(status);
1819                         goto bail;
1820                 }
1821         }
1822
1823         if (ac->ac_find_loc_only)
1824                 goto out_loc_only;
1825
1826         status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
1827                                                   ac->ac_bh, res->sr_bits,
1828                                                   chain);
1829         if (status) {
1830                 mlog_errno(status);
1831                 goto bail;
1832         }
1833
1834         status = ocfs2_block_group_set_bits(handle,
1835                                             alloc_inode,
1836                                             bg,
1837                                             group_bh,
1838                                             res->sr_bit_offset,
1839                                             res->sr_bits);
1840         if (status < 0) {
1841                 mlog_errno(status);
1842                 goto bail;
1843         }
1844
1845         trace_ocfs2_search_chain_end(
1846                         (unsigned long long)le64_to_cpu(fe->i_blkno),
1847                         res->sr_bits);
1848
1849 out_loc_only:
1850         *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1851 bail:
1852         brelse(group_bh);
1853         brelse(prev_group_bh);
1854
1855         if (status)
1856                 mlog_errno(status);
1857         return status;
1858 }
1859
1860 /* will give out up to bits_wanted contiguous bits. */
1861 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1862                                      handle_t *handle,
1863                                      u32 bits_wanted,
1864                                      u32 min_bits,
1865                                      struct ocfs2_suballoc_result *res)
1866 {
1867         int status;
1868         u16 victim, i;
1869         u16 bits_left = 0;
1870         u64 hint = ac->ac_last_group;
1871         struct ocfs2_chain_list *cl;
1872         struct ocfs2_dinode *fe;
1873
1874         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1875         BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1876         BUG_ON(!ac->ac_bh);
1877
1878         fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1879
1880         /* The bh was validated by the inode read during
1881          * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
1882         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1883
1884         if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1885             le32_to_cpu(fe->id1.bitmap1.i_total)) {
1886                 ocfs2_error(ac->ac_inode->i_sb,
1887                             "Chain allocator dinode %llu has %u used "
1888                             "bits but only %u total.",
1889                             (unsigned long long)le64_to_cpu(fe->i_blkno),
1890                             le32_to_cpu(fe->id1.bitmap1.i_used),
1891                             le32_to_cpu(fe->id1.bitmap1.i_total));
1892                 status = -EIO;
1893                 goto bail;
1894         }
1895
1896         res->sr_bg_blkno = hint;
1897         if (res->sr_bg_blkno) {
1898                 /* Attempt to short-circuit the usual search mechanism
1899                  * by jumping straight to the most recently used
1900                  * allocation group. This helps us maintain some
1901                  * contiguousness across allocations. */
1902                 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1903                                                 min_bits, res, &bits_left);
1904                 if (!status)
1905                         goto set_hint;
1906                 if (status < 0 && status != -ENOSPC) {
1907                         mlog_errno(status);
1908                         goto bail;
1909                 }
1910         }
1911
1912         cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1913
1914         victim = ocfs2_find_victim_chain(cl);
1915         ac->ac_chain = victim;
1916
1917         status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1918                                     res, &bits_left);
1919         if (!status) {
1920                 hint = ocfs2_group_from_res(res);
1921                 goto set_hint;
1922         }
1923         if (status < 0 && status != -ENOSPC) {
1924                 mlog_errno(status);
1925                 goto bail;
1926         }
1927
1928         trace_ocfs2_claim_suballoc_bits(victim);
1929
1930         /* If we didn't pick a good victim, then just default to
1931          * searching each chain in order. Don't allow chain relinking
1932          * because we only calculate enough journal credits for one
1933          * relink per alloc. */
1934         ac->ac_disable_chain_relink = 1;
1935         for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1936                 if (i == victim)
1937                         continue;
1938                 if (!cl->cl_recs[i].c_free)
1939                         continue;
1940
1941                 ac->ac_chain = i;
1942                 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1943                                             res, &bits_left);
1944                 if (!status) {
1945                         hint = ocfs2_group_from_res(res);
1946                         break;
1947                 }
1948                 if (status < 0 && status != -ENOSPC) {
1949                         mlog_errno(status);
1950                         goto bail;
1951                 }
1952         }
1953
1954 set_hint:
1955         if (status != -ENOSPC) {
1956                 /* If the next search of this group is not likely to
1957                  * yield a suitable extent, then we reset the last
1958                  * group hint so as to not waste a disk read */
1959                 if (bits_left < min_bits)
1960                         ac->ac_last_group = 0;
1961                 else
1962                         ac->ac_last_group = hint;
1963         }
1964
1965 bail:
1966         if (status)
1967                 mlog_errno(status);
1968         return status;
1969 }
1970
1971 int ocfs2_claim_metadata(handle_t *handle,
1972                          struct ocfs2_alloc_context *ac,
1973                          u32 bits_wanted,
1974                          u64 *suballoc_loc,
1975                          u16 *suballoc_bit_start,
1976                          unsigned int *num_bits,
1977                          u64 *blkno_start)
1978 {
1979         int status;
1980         struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1981
1982         BUG_ON(!ac);
1983         BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1984         BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1985
1986         status = ocfs2_claim_suballoc_bits(ac,
1987                                            handle,
1988                                            bits_wanted,
1989                                            1,
1990                                            &res);
1991         if (status < 0) {
1992                 mlog_errno(status);
1993                 goto bail;
1994         }
1995         atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
1996
1997         *suballoc_loc = res.sr_bg_blkno;
1998         *suballoc_bit_start = res.sr_bit_offset;
1999         *blkno_start = res.sr_blkno;
2000         ac->ac_bits_given += res.sr_bits;
2001         *num_bits = res.sr_bits;
2002         status = 0;
2003 bail:
2004         if (status)
2005                 mlog_errno(status);
2006         return status;
2007 }
2008
2009 static void ocfs2_init_inode_ac_group(struct inode *dir,
2010                                       struct buffer_head *parent_di_bh,
2011                                       struct ocfs2_alloc_context *ac)
2012 {
2013         struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
2014         /*
2015          * Try to allocate inodes from some specific group.
2016          *
2017          * If the parent dir has recorded the last group used in allocation,
2018          * cool, use it. Otherwise if we try to allocate new inode from the
2019          * same slot the parent dir belongs to, use the same chunk.
2020          *
2021          * We are very careful here to avoid the mistake of setting
2022          * ac_last_group to a group descriptor from a different (unlocked) slot.
2023          */
2024         if (OCFS2_I(dir)->ip_last_used_group &&
2025             OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
2026                 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
2027         else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
2028                 if (di->i_suballoc_loc)
2029                         ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
2030                 else
2031                         ac->ac_last_group = ocfs2_which_suballoc_group(
2032                                         le64_to_cpu(di->i_blkno),
2033                                         le16_to_cpu(di->i_suballoc_bit));
2034         }
2035 }
2036
2037 static inline void ocfs2_save_inode_ac_group(struct inode *dir,
2038                                              struct ocfs2_alloc_context *ac)
2039 {
2040         OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
2041         OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
2042 }
2043
2044 int ocfs2_find_new_inode_loc(struct inode *dir,
2045                              struct buffer_head *parent_fe_bh,
2046                              struct ocfs2_alloc_context *ac,
2047                              u64 *fe_blkno)
2048 {
2049         int ret;
2050         handle_t *handle = NULL;
2051         struct ocfs2_suballoc_result *res;
2052
2053         BUG_ON(!ac);
2054         BUG_ON(ac->ac_bits_given != 0);
2055         BUG_ON(ac->ac_bits_wanted != 1);
2056         BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2057
2058         res = kzalloc(sizeof(*res), GFP_NOFS);
2059         if (res == NULL) {
2060                 ret = -ENOMEM;
2061                 mlog_errno(ret);
2062                 goto out;
2063         }
2064
2065         ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2066
2067         /*
2068          * The handle started here is for chain relink. Alternatively,
2069          * we could just disable relink for these calls.
2070          */
2071         handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
2072         if (IS_ERR(handle)) {
2073                 ret = PTR_ERR(handle);
2074                 handle = NULL;
2075                 mlog_errno(ret);
2076                 goto out;
2077         }
2078
2079         /*
2080          * This will instruct ocfs2_claim_suballoc_bits and
2081          * ocfs2_search_one_group to search but save actual allocation
2082          * for later.
2083          */
2084         ac->ac_find_loc_only = 1;
2085
2086         ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
2087         if (ret < 0) {
2088                 mlog_errno(ret);
2089                 goto out;
2090         }
2091
2092         ac->ac_find_loc_priv = res;
2093         *fe_blkno = res->sr_blkno;
2094
2095 out:
2096         if (handle)
2097                 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
2098
2099         if (ret)
2100                 kfree(res);
2101
2102         return ret;
2103 }
2104
2105 int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2106                                  struct inode *dir,
2107                                  struct ocfs2_alloc_context *ac,
2108                                  u64 *suballoc_loc,
2109                                  u16 *suballoc_bit,
2110                                  u64 di_blkno)
2111 {
2112         int ret;
2113         u16 chain;
2114         struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
2115         struct buffer_head *bg_bh = NULL;
2116         struct ocfs2_group_desc *bg;
2117         struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
2118
2119         /*
2120          * Since di_blkno is being passed back in, we check for any
2121          * inconsistencies which may have happened between
2122          * calls. These are code bugs as di_blkno is not expected to
2123          * change once returned from ocfs2_find_new_inode_loc()
2124          */
2125         BUG_ON(res->sr_blkno != di_blkno);
2126
2127         ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
2128                                           res->sr_bg_stable_blkno, &bg_bh);
2129         if (ret) {
2130                 mlog_errno(ret);
2131                 goto out;
2132         }
2133
2134         bg = (struct ocfs2_group_desc *) bg_bh->b_data;
2135         chain = le16_to_cpu(bg->bg_chain);
2136
2137         ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
2138                                                ac->ac_bh, res->sr_bits,
2139                                                chain);
2140         if (ret) {
2141                 mlog_errno(ret);
2142                 goto out;
2143         }
2144
2145         ret = ocfs2_block_group_set_bits(handle,
2146                                          ac->ac_inode,
2147                                          bg,
2148                                          bg_bh,
2149                                          res->sr_bit_offset,
2150                                          res->sr_bits);
2151         if (ret < 0) {
2152                 mlog_errno(ret);
2153                 goto out;
2154         }
2155
2156         trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
2157                                            res->sr_bits);
2158
2159         atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2160
2161         BUG_ON(res->sr_bits != 1);
2162
2163         *suballoc_loc = res->sr_bg_blkno;
2164         *suballoc_bit = res->sr_bit_offset;
2165         ac->ac_bits_given++;
2166         ocfs2_save_inode_ac_group(dir, ac);
2167
2168 out:
2169         brelse(bg_bh);
2170
2171         return ret;
2172 }
2173
2174 int ocfs2_claim_new_inode(handle_t *handle,
2175                           struct inode *dir,
2176                           struct buffer_head *parent_fe_bh,
2177                           struct ocfs2_alloc_context *ac,
2178                           u64 *suballoc_loc,
2179                           u16 *suballoc_bit,
2180                           u64 *fe_blkno)
2181 {
2182         int status;
2183         struct ocfs2_suballoc_result res;
2184
2185         BUG_ON(!ac);
2186         BUG_ON(ac->ac_bits_given != 0);
2187         BUG_ON(ac->ac_bits_wanted != 1);
2188         BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2189
2190         ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2191
2192         status = ocfs2_claim_suballoc_bits(ac,
2193                                            handle,
2194                                            1,
2195                                            1,
2196                                            &res);
2197         if (status < 0) {
2198                 mlog_errno(status);
2199                 goto bail;
2200         }
2201         atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2202
2203         BUG_ON(res.sr_bits != 1);
2204
2205         *suballoc_loc = res.sr_bg_blkno;
2206         *suballoc_bit = res.sr_bit_offset;
2207         *fe_blkno = res.sr_blkno;
2208         ac->ac_bits_given++;
2209         ocfs2_save_inode_ac_group(dir, ac);
2210         status = 0;
2211 bail:
2212         if (status)
2213                 mlog_errno(status);
2214         return status;
2215 }
2216
2217 /* translate a group desc. blkno and it's bitmap offset into
2218  * disk cluster offset. */
2219 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
2220                                                    u64 bg_blkno,
2221                                                    u16 bg_bit_off)
2222 {
2223         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2224         u32 cluster = 0;
2225
2226         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2227
2228         if (bg_blkno != osb->first_cluster_group_blkno)
2229                 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
2230         cluster += (u32) bg_bit_off;
2231         return cluster;
2232 }
2233
2234 /* given a cluster offset, calculate which block group it belongs to
2235  * and return that block offset. */
2236 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
2237 {
2238         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2239         u32 group_no;
2240
2241         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2242
2243         group_no = cluster / osb->bitmap_cpg;
2244         if (!group_no)
2245                 return osb->first_cluster_group_blkno;
2246         return ocfs2_clusters_to_blocks(inode->i_sb,
2247                                         group_no * osb->bitmap_cpg);
2248 }
2249
2250 /* given the block number of a cluster start, calculate which cluster
2251  * group and descriptor bitmap offset that corresponds to. */
2252 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
2253                                                 u64 data_blkno,
2254                                                 u64 *bg_blkno,
2255                                                 u16 *bg_bit_off)
2256 {
2257         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2258         u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
2259
2260         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2261
2262         *bg_blkno = ocfs2_which_cluster_group(inode,
2263                                               data_cluster);
2264
2265         if (*bg_blkno == osb->first_cluster_group_blkno)
2266                 *bg_bit_off = (u16) data_cluster;
2267         else
2268                 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
2269                                                              data_blkno - *bg_blkno);
2270 }
2271
2272 /*
2273  * min_bits - minimum contiguous chunk from this total allocation we
2274  * can handle. set to what we asked for originally for a full
2275  * contig. allocation, set to '1' to indicate we can deal with extents
2276  * of any size.
2277  */
2278 int __ocfs2_claim_clusters(handle_t *handle,
2279                            struct ocfs2_alloc_context *ac,
2280                            u32 min_clusters,
2281                            u32 max_clusters,
2282                            u32 *cluster_start,
2283                            u32 *num_clusters)
2284 {
2285         int status;
2286         unsigned int bits_wanted = max_clusters;
2287         struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2288         struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
2289
2290         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
2291
2292         BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
2293                && ac->ac_which != OCFS2_AC_USE_MAIN);
2294
2295         if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2296                 WARN_ON(min_clusters > 1);
2297
2298                 status = ocfs2_claim_local_alloc_bits(osb,
2299                                                       handle,
2300                                                       ac,
2301                                                       bits_wanted,
2302                                                       cluster_start,
2303                                                       num_clusters);
2304                 if (!status)
2305                         atomic_inc(&osb->alloc_stats.local_data);
2306         } else {
2307                 if (min_clusters > (osb->bitmap_cpg - 1)) {
2308                         /* The only paths asking for contiguousness
2309                          * should know about this already. */
2310                         mlog(ML_ERROR, "minimum allocation requested %u exceeds "
2311                              "group bitmap size %u!\n", min_clusters,
2312                              osb->bitmap_cpg);
2313                         status = -ENOSPC;
2314                         goto bail;
2315                 }
2316                 /* clamp the current request down to a realistic size. */
2317                 if (bits_wanted > (osb->bitmap_cpg - 1))
2318                         bits_wanted = osb->bitmap_cpg - 1;
2319
2320                 status = ocfs2_claim_suballoc_bits(ac,
2321                                                    handle,
2322                                                    bits_wanted,
2323                                                    min_clusters,
2324                                                    &res);
2325                 if (!status) {
2326                         BUG_ON(res.sr_blkno); /* cluster alloc can't set */
2327                         *cluster_start =
2328                                 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
2329                                                                  res.sr_bg_blkno,
2330                                                                  res.sr_bit_offset);
2331                         atomic_inc(&osb->alloc_stats.bitmap_data);
2332                         *num_clusters = res.sr_bits;
2333                 }
2334         }
2335         if (status < 0) {
2336                 if (status != -ENOSPC)
2337                         mlog_errno(status);
2338                 goto bail;
2339         }
2340
2341         ac->ac_bits_given += *num_clusters;
2342
2343 bail:
2344         if (status)
2345                 mlog_errno(status);
2346         return status;
2347 }
2348
2349 int ocfs2_claim_clusters(handle_t *handle,
2350                          struct ocfs2_alloc_context *ac,
2351                          u32 min_clusters,
2352                          u32 *cluster_start,
2353                          u32 *num_clusters)
2354 {
2355         unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
2356
2357         return __ocfs2_claim_clusters(handle, ac, min_clusters,
2358                                       bits_wanted, cluster_start, num_clusters);
2359 }
2360
2361 static int ocfs2_block_group_clear_bits(handle_t *handle,
2362                                         struct inode *alloc_inode,
2363                                         struct ocfs2_group_desc *bg,
2364                                         struct buffer_head *group_bh,
2365                                         unsigned int bit_off,
2366                                         unsigned int num_bits,
2367                                         void (*undo_fn)(unsigned int bit,
2368                                                         unsigned long *bmap))
2369 {
2370         int status;
2371         unsigned int tmp;
2372         struct ocfs2_group_desc *undo_bg = NULL;
2373
2374         /* The caller got this descriptor from
2375          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
2376         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
2377
2378         trace_ocfs2_block_group_clear_bits(bit_off, num_bits);
2379
2380         BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
2381         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
2382                                          group_bh,
2383                                          undo_fn ?
2384                                          OCFS2_JOURNAL_ACCESS_UNDO :
2385                                          OCFS2_JOURNAL_ACCESS_WRITE);
2386         if (status < 0) {
2387                 mlog_errno(status);
2388                 goto bail;
2389         }
2390
2391         if (undo_fn) {
2392                 jbd_lock_bh_state(group_bh);
2393                 undo_bg = (struct ocfs2_group_desc *)
2394                                         bh2jh(group_bh)->b_committed_data;
2395                 BUG_ON(!undo_bg);
2396         }
2397
2398         tmp = num_bits;
2399         while(tmp--) {
2400                 ocfs2_clear_bit((bit_off + tmp),
2401                                 (unsigned long *) bg->bg_bitmap);
2402                 if (undo_fn)
2403                         undo_fn(bit_off + tmp,
2404                                 (unsigned long *) undo_bg->bg_bitmap);
2405         }
2406         le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2407         if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
2408                 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
2409                             " count %u but claims %u are freed. num_bits %d",
2410                             (unsigned long long)le64_to_cpu(bg->bg_blkno),
2411                             le16_to_cpu(bg->bg_bits),
2412                             le16_to_cpu(bg->bg_free_bits_count), num_bits);
2413                 return -EROFS;
2414         }
2415
2416         if (undo_fn)
2417                 jbd_unlock_bh_state(group_bh);
2418
2419         ocfs2_journal_dirty(handle, group_bh);
2420 bail:
2421         return status;
2422 }
2423
2424 /*
2425  * expects the suballoc inode to already be locked.
2426  */
2427 static int _ocfs2_free_suballoc_bits(handle_t *handle,
2428                                      struct inode *alloc_inode,
2429                                      struct buffer_head *alloc_bh,
2430                                      unsigned int start_bit,
2431                                      u64 bg_blkno,
2432                                      unsigned int count,
2433                                      void (*undo_fn)(unsigned int bit,
2434                                                      unsigned long *bitmap))
2435 {
2436         int status = 0;
2437         u32 tmp_used;
2438         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
2439         struct ocfs2_chain_list *cl = &fe->id2.i_chain;
2440         struct buffer_head *group_bh = NULL;
2441         struct ocfs2_group_desc *group;
2442
2443         /* The alloc_bh comes from ocfs2_free_dinode() or
2444          * ocfs2_free_clusters().  The callers have all locked the
2445          * allocator and gotten alloc_bh from the lock call.  This
2446          * validates the dinode buffer.  Any corruption that has happened
2447          * is a code bug. */
2448         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
2449         BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
2450
2451         trace_ocfs2_free_suballoc_bits(
2452                 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
2453                 (unsigned long long)bg_blkno,
2454                 start_bit, count);
2455
2456         status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
2457                                              &group_bh);
2458         if (status < 0) {
2459                 mlog_errno(status);
2460                 goto bail;
2461         }
2462         group = (struct ocfs2_group_desc *) group_bh->b_data;
2463
2464         BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
2465
2466         status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2467                                               group, group_bh,
2468                                               start_bit, count, undo_fn);
2469         if (status < 0) {
2470                 mlog_errno(status);
2471                 goto bail;
2472         }
2473
2474         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2475                                          alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2476         if (status < 0) {
2477                 mlog_errno(status);
2478                 goto bail;
2479         }
2480
2481         le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
2482                      count);
2483         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2484         fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2485         ocfs2_journal_dirty(handle, alloc_bh);
2486
2487 bail:
2488         brelse(group_bh);
2489
2490         if (status)
2491                 mlog_errno(status);
2492         return status;
2493 }
2494
2495 int ocfs2_free_suballoc_bits(handle_t *handle,
2496                              struct inode *alloc_inode,
2497                              struct buffer_head *alloc_bh,
2498                              unsigned int start_bit,
2499                              u64 bg_blkno,
2500                              unsigned int count)
2501 {
2502         return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2503                                          start_bit, bg_blkno, count, NULL);
2504 }
2505
2506 int ocfs2_free_dinode(handle_t *handle,
2507                       struct inode *inode_alloc_inode,
2508                       struct buffer_head *inode_alloc_bh,
2509                       struct ocfs2_dinode *di)
2510 {
2511         u64 blk = le64_to_cpu(di->i_blkno);
2512         u16 bit = le16_to_cpu(di->i_suballoc_bit);
2513         u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2514
2515         if (di->i_suballoc_loc)
2516                 bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2517         return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2518                                         inode_alloc_bh, bit, bg_blkno, 1);
2519 }
2520
2521 static int _ocfs2_free_clusters(handle_t *handle,
2522                                 struct inode *bitmap_inode,
2523                                 struct buffer_head *bitmap_bh,
2524                                 u64 start_blk,
2525                                 unsigned int num_clusters,
2526                                 void (*undo_fn)(unsigned int bit,
2527                                                 unsigned long *bitmap))
2528 {
2529         int status;
2530         u16 bg_start_bit;
2531         u64 bg_blkno;
2532         struct ocfs2_dinode *fe;
2533
2534         /* You can't ever have a contiguous set of clusters
2535          * bigger than a block group bitmap so we never have to worry
2536          * about looping on them.
2537          * This is expensive. We can safely remove once this stuff has
2538          * gotten tested really well. */
2539         BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
2540
2541         fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
2542
2543         ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2544                                      &bg_start_bit);
2545
2546         trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
2547                         (unsigned long long)start_blk,
2548                         bg_start_bit, num_clusters);
2549
2550         status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2551                                            bg_start_bit, bg_blkno,
2552                                            num_clusters, undo_fn);
2553         if (status < 0) {
2554                 mlog_errno(status);
2555                 goto out;
2556         }
2557
2558         ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
2559                                          num_clusters);
2560
2561 out:
2562         if (status)
2563                 mlog_errno(status);
2564         return status;
2565 }
2566
2567 int ocfs2_free_clusters(handle_t *handle,
2568                         struct inode *bitmap_inode,
2569                         struct buffer_head *bitmap_bh,
2570                         u64 start_blk,
2571                         unsigned int num_clusters)
2572 {
2573         return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2574                                     start_blk, num_clusters,
2575                                     _ocfs2_set_bit);
2576 }
2577
2578 /*
2579  * Give never-used clusters back to the global bitmap.  We don't need
2580  * to protect these bits in the undo buffer.
2581  */
2582 int ocfs2_release_clusters(handle_t *handle,
2583                            struct inode *bitmap_inode,
2584                            struct buffer_head *bitmap_bh,
2585                            u64 start_blk,
2586                            unsigned int num_clusters)
2587 {
2588         return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2589                                     start_blk, num_clusters,
2590                                     _ocfs2_clear_bit);
2591 }
2592
2593 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2594 {
2595         printk("Block Group:\n");
2596         printk("bg_signature:       %s\n", bg->bg_signature);
2597         printk("bg_size:            %u\n", bg->bg_size);
2598         printk("bg_bits:            %u\n", bg->bg_bits);
2599         printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
2600         printk("bg_chain:           %u\n", bg->bg_chain);
2601         printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
2602         printk("bg_next_group:      %llu\n",
2603                (unsigned long long)bg->bg_next_group);
2604         printk("bg_parent_dinode:   %llu\n",
2605                (unsigned long long)bg->bg_parent_dinode);
2606         printk("bg_blkno:           %llu\n",
2607                (unsigned long long)bg->bg_blkno);
2608 }
2609
2610 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
2611 {
2612         int i;
2613
2614         printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
2615         printk("i_signature:                  %s\n", fe->i_signature);
2616         printk("i_size:                       %llu\n",
2617                (unsigned long long)fe->i_size);
2618         printk("i_clusters:                   %u\n", fe->i_clusters);
2619         printk("i_generation:                 %u\n",
2620                le32_to_cpu(fe->i_generation));
2621         printk("id1.bitmap1.i_used:           %u\n",
2622                le32_to_cpu(fe->id1.bitmap1.i_used));
2623         printk("id1.bitmap1.i_total:          %u\n",
2624                le32_to_cpu(fe->id1.bitmap1.i_total));
2625         printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
2626         printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
2627         printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
2628         printk("id2.i_chain.cl_next_free_rec: %u\n",
2629                fe->id2.i_chain.cl_next_free_rec);
2630         for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
2631                 printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
2632                        fe->id2.i_chain.cl_recs[i].c_free);
2633                 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
2634                        fe->id2.i_chain.cl_recs[i].c_total);
2635                 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
2636                        (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
2637         }
2638 }
2639
2640 /*
2641  * For a given allocation, determine which allocators will need to be
2642  * accessed, and lock them, reserving the appropriate number of bits.
2643  *
2644  * Sparse file systems call this from ocfs2_write_begin_nolock()
2645  * and ocfs2_allocate_unwritten_extents().
2646  *
2647  * File systems which don't support holes call this from
2648  * ocfs2_extend_allocation().
2649  */
2650 int ocfs2_lock_allocators(struct inode *inode,
2651                           struct ocfs2_extent_tree *et,
2652                           u32 clusters_to_add, u32 extents_to_split,
2653                           struct ocfs2_alloc_context **data_ac,
2654                           struct ocfs2_alloc_context **meta_ac)
2655 {
2656         int ret = 0, num_free_extents;
2657         unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2658         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2659
2660         *meta_ac = NULL;
2661         if (data_ac)
2662                 *data_ac = NULL;
2663
2664         BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2665
2666         num_free_extents = ocfs2_num_free_extents(osb, et);
2667         if (num_free_extents < 0) {
2668                 ret = num_free_extents;
2669                 mlog_errno(ret);
2670                 goto out;
2671         }
2672
2673         /*
2674          * Sparse allocation file systems need to be more conservative
2675          * with reserving room for expansion - the actual allocation
2676          * happens while we've got a journal handle open so re-taking
2677          * a cluster lock (because we ran out of room for another
2678          * extent) will violate ordering rules.
2679          *
2680          * Most of the time we'll only be seeing this 1 cluster at a time
2681          * anyway.
2682          *
2683          * Always lock for any unwritten extents - we might want to
2684          * add blocks during a split.
2685          */
2686         if (!num_free_extents ||
2687             (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2688                 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2689                 if (ret < 0) {
2690                         if (ret != -ENOSPC)
2691                                 mlog_errno(ret);
2692                         goto out;
2693                 }
2694         }
2695
2696         if (clusters_to_add == 0)
2697                 goto out;
2698
2699         ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2700         if (ret < 0) {
2701                 if (ret != -ENOSPC)
2702                         mlog_errno(ret);
2703                 goto out;
2704         }
2705
2706 out:
2707         if (ret) {
2708                 if (*meta_ac) {
2709                         ocfs2_free_alloc_context(*meta_ac);
2710                         *meta_ac = NULL;
2711                 }
2712
2713                 /*
2714                  * We cannot have an error and a non null *data_ac.
2715                  */
2716         }
2717
2718         return ret;
2719 }
2720
2721 /*
2722  * Read the inode specified by blkno to get suballoc_slot and
2723  * suballoc_bit.
2724  */
2725 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2726                                        u16 *suballoc_slot, u64 *group_blkno,
2727                                        u16 *suballoc_bit)
2728 {
2729         int status;
2730         struct buffer_head *inode_bh = NULL;
2731         struct ocfs2_dinode *inode_fe;
2732
2733         trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);
2734
2735         /* dirty read disk */
2736         status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2737         if (status < 0) {
2738                 mlog(ML_ERROR, "read block %llu failed %d\n",
2739                      (unsigned long long)blkno, status);
2740                 goto bail;
2741         }
2742
2743         inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2744         if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2745                 mlog(ML_ERROR, "invalid inode %llu requested\n",
2746                      (unsigned long long)blkno);
2747                 status = -EINVAL;
2748                 goto bail;
2749         }
2750
2751         if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2752             (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2753                 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2754                      (unsigned long long)blkno,
2755                      (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2756                 status = -EINVAL;
2757                 goto bail;
2758         }
2759
2760         if (suballoc_slot)
2761                 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2762         if (suballoc_bit)
2763                 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2764         if (group_blkno)
2765                 *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
2766
2767 bail:
2768         brelse(inode_bh);
2769
2770         if (status)
2771                 mlog_errno(status);
2772         return status;
2773 }
2774
2775 /*
2776  * test whether bit is SET in allocator bitmap or not.  on success, 0
2777  * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
2778  * is returned and *res is meaningless.  Call this after you have
2779  * cluster locked against suballoc, or you may get a result based on
2780  * non-up2date contents
2781  */
2782 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2783                                    struct inode *suballoc,
2784                                    struct buffer_head *alloc_bh,
2785                                    u64 group_blkno, u64 blkno,
2786                                    u16 bit, int *res)
2787 {
2788         struct ocfs2_dinode *alloc_di;
2789         struct ocfs2_group_desc *group;
2790         struct buffer_head *group_bh = NULL;
2791         u64 bg_blkno;
2792         int status;
2793
2794         trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
2795                                       (unsigned int)bit);
2796
2797         alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2798         if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
2799                 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2800                      (unsigned int)bit,
2801                      ocfs2_bits_per_group(&alloc_di->id2.i_chain));
2802                 status = -EINVAL;
2803                 goto bail;
2804         }
2805
2806         bg_blkno = group_blkno ? group_blkno :
2807                    ocfs2_which_suballoc_group(blkno, bit);
2808         status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2809                                              &group_bh);
2810         if (status < 0) {
2811                 mlog(ML_ERROR, "read group %llu failed %d\n",
2812                      (unsigned long long)bg_blkno, status);
2813                 goto bail;
2814         }
2815
2816         group = (struct ocfs2_group_desc *) group_bh->b_data;
2817         *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2818
2819 bail:
2820         brelse(group_bh);
2821
2822         if (status)
2823                 mlog_errno(status);
2824         return status;
2825 }
2826
2827 /*
2828  * Test if the bit representing this inode (blkno) is set in the
2829  * suballocator.
2830  *
2831  * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2832  *
2833  * In the event of failure, a negative value is returned and *res is
2834  * meaningless.
2835  *
2836  * Callers must make sure to hold nfs_sync_lock to prevent
2837  * ocfs2_delete_inode() on another node from accessing the same
2838  * suballocator concurrently.
2839  */
2840 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2841 {
2842         int status;
2843         u64 group_blkno = 0;
2844         u16 suballoc_bit = 0, suballoc_slot = 0;
2845         struct inode *inode_alloc_inode;
2846         struct buffer_head *alloc_bh = NULL;
2847
2848         trace_ocfs2_test_inode_bit((unsigned long long)blkno);
2849
2850         status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2851                                              &group_blkno, &suballoc_bit);
2852         if (status < 0) {
2853                 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2854                 goto bail;
2855         }
2856
2857         inode_alloc_inode =
2858                 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2859                                             suballoc_slot);
2860         if (!inode_alloc_inode) {
2861                 /* the error code could be inaccurate, but we are not able to
2862                  * get the correct one. */
2863                 status = -EINVAL;
2864                 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2865                      (u32)suballoc_slot);
2866                 goto bail;
2867         }
2868
2869         mutex_lock(&inode_alloc_inode->i_mutex);
2870         status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2871         if (status < 0) {
2872                 mutex_unlock(&inode_alloc_inode->i_mutex);
2873                 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2874                      (u32)suballoc_slot, status);
2875                 goto bail;
2876         }
2877
2878         status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2879                                          group_blkno, blkno, suballoc_bit, res);
2880         if (status < 0)
2881                 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2882
2883         ocfs2_inode_unlock(inode_alloc_inode, 0);
2884         mutex_unlock(&inode_alloc_inode->i_mutex);
2885
2886         iput(inode_alloc_inode);
2887         brelse(alloc_bh);
2888 bail:
2889         if (status)
2890                 mlog_errno(status);
2891         return status;
2892 }